通过库传递CUDA函数指针_Cuda_Linker_Nvcc

通过库传递CUDA函数指针

cuda linker

通过库传递CUDA函数指针,cuda,linker,nvcc,Cuda,Linker,Nvcc,我正在使用CUDA，并试图使用函数指针将CUDA函数传递给一个库，该库稍后在其设备内核中使用该函数，类似于CUDA函数指针示例守则的重要部分包括： /** Type definition for the execution function in #qsched_run. */ typedef void (*qsched_funtype)( int , void * ); __device__ void gpuTest(int type , void *data) { .... } __

我正在使用CUDA，并试图使用函数指针将CUDA函数传递给一个库，该库稍后在其设备内核中使用该函数，类似于CUDA函数指针示例

代码的重要部分包括：

/** Type definition for the execution function in #qsched_run. */
typedef void (*qsched_funtype)( int , void * );

/* Device-side function that is meant to be invoked through a qsched_funtype
   pointer (its body is elided in this excerpt). */
__device__ void gpuTest(int type , void *data)
{
  ....
}
/* Device variable initialised with the address of gpuTest, so that the host
   can read that address back with cudaMemcpyFromSymbol. */
__device__ qsched_funtype function = gpuTest;

/* Host entry point (excerpt). NOTE(review): `func` and `s` are presumably
   declared in the elided initialization code -- they are not shown here. */
void main(...)
{
//Various initialization setup.

/* Copy the value stored in the device variable `function` into the host
   variable `func`; a failed copy is reported through error(). */
if( cudaMemcpyFromSymbol( &func , function , sizeof(qsched_funtype) ) != cudaSuccess)
    error("Failed to copy function pointer from device");

/* Hand the device function pointer over to the library entry point. */
qsched_run_CUDA( &s , func );
}

qsched_run_CUDA函数是一个库函数，它执行一些初始化，将函数指针复制到设备（它可以看到的变量），然后运行内核，内核在某些时候使用该函数指针调用gpuTest函数

如果我在以下nvcc调用中使用-G，代码将正确编译：

nvcc -g -G -m64 -I../src ../src/.libs/libquicksched_cuda.a -L/home/aidan/cuda_6.0/lib -L/home/aidan/cuda_6.0/lib64 -lcudart -lcuda -DWITH_CUDA -gencode arch=compute_30,code=sm_30 -lgomp test_gpu_simple.cu -o out.out

其中 ../src/.libs/libquicksched_cuda.a 是包含qsched_run_CUDA函数的库

当我从nvcc调用中删除-G标志的那一刻，它突然中断，在qsched_run_CUDA中运行的内核崩溃，出现无效程序计数器错误，函数指针（包括在我自己的.cu文件中）设置为0x4
大概我需要使用CUDA的单独编译（separate compilation），正如相关的stackoverflow回答中模糊地解释的那样——但是我不确定在使用库函数时如何做到这一点，无论是nvcc指南还是stackoverflow链接都没有明确说明如何做到这一点

有人有这方面的经验吗？我试图简单地尝试使用nvlink来完成这项工作，但我没有走多远（我给它传递了一个库，它似乎不高兴）。
是的，您需要使用单独的编译。我根据您到目前为止所展示的内容，并参考nvcc文档中的示例，编写了一个测试用例。代码如下：
kernel_lib.cu：

// kernel_lib.cu -- library side of the example: runs a kernel that calls a
// caller-supplied device function through a pointer.
// Intended to be compiled as relocatable device code (nvcc -dc) and archived
// into a static library (nvcc -lib).
#include <stdio.h>

// Query the most recent CUDA runtime error; if there is one, print `msg`, the
// CUDA error string and the source location to stderr, then exit(1).
#define cudaCheckErrors(msg) \
    do { \
        cudaError_t __err = cudaGetLastError(); \
        if (__err != cudaSuccess) { \
            fprintf(stderr, "Fatal error: %s (%s at %s:%d)\n", \
                msg, cudaGetErrorString(__err), \
                __FILE__, __LINE__); \
            fprintf(stderr, "*** FAILED - ABORTING\n"); \
            exit(1); \
        } \
    } while (0)

/** Type definition for the execution function in #qsched_run. */
typedef void (*qsched_funtype)( int , void * );

// Kernel: casts `func` to qsched_funtype and calls it as func(type, data).
// `func` is invoked from device code, so it has to be the address of a
// __device__ function; qsched_run_CUDA launches this kernel with one thread.
__global__ void mykernel(int type, void *data, void *func){
  ((qsched_funtype)func)(type, data);
}

// Library entry point.
//   val    - forwarded to the callback as its `type` argument.
//   d_data - forwarded to the callback as its `data` argument.
//   func   - function pointer value that the kernel calls on the device.
// Returns 0 when the kernel completed without a CUDA error; on error the
// cudaCheckErrors macro terminates the process instead of returning.
int qsched_run_CUDA(int val, void *d_data, void *func)
{
  mykernel<<<1,1>>>(val, d_data, func);
  // Wait for the kernel so that launch/execution errors are visible to the
  // cudaGetLastError() call inside cudaCheckErrors.
  cudaDeviceSynchronize();
  cudaCheckErrors("kernel fail");
  return 0;
}

我使用CUDA 5.0进行此测试。
如果添加 __noinline__，例如 __device__ __noinline__ void gpuTest(...)，则会发生什么情况？最好能提供一段可以重现此问题的完整代码。添加noinline没有任何区别。我明天将尝试创建一个小的测试版本（一个源文件编译为库，另一个文件使用该库编译），并将其添加到OP中，不幸的是，当前项目太大，无法直接粘贴到此处。好的，看起来我比以前更接近了！我在其他地方遇到了一些问题，希望检查我是否正确编译-将更新OP。我的一个nvcc调用中有一个输入错误，停止了外部C文件的正确链接-现在已修复，请删除预计起飞时间。
// kernel_lib.cu -- library side of the example: runs a kernel that calls a
// caller-supplied device function through a pointer.
// Intended to be compiled as relocatable device code (nvcc -dc) and archived
// into a static library (nvcc -lib).
#include <stdio.h>

// Query the most recent CUDA runtime error; if there is one, print `msg`, the
// CUDA error string and the source location to stderr, then exit(1).
#define cudaCheckErrors(msg) \
    do { \
        cudaError_t __err = cudaGetLastError(); \
        if (__err != cudaSuccess) { \
            fprintf(stderr, "Fatal error: %s (%s at %s:%d)\n", \
                msg, cudaGetErrorString(__err), \
                __FILE__, __LINE__); \
            fprintf(stderr, "*** FAILED - ABORTING\n"); \
            exit(1); \
        } \
    } while (0)

/** Type definition for the execution function in #qsched_run. */
typedef void (*qsched_funtype)( int , void * );

// Kernel: casts `func` to qsched_funtype and calls it as func(type, data).
// `func` is invoked from device code, so it has to be the address of a
// __device__ function; qsched_run_CUDA launches this kernel with one thread.
__global__ void mykernel(int type, void *data, void *func){
  ((qsched_funtype)func)(type, data);
}

// Library entry point.
//   val    - forwarded to the callback as its `type` argument.
//   d_data - forwarded to the callback as its `data` argument.
//   func   - function pointer value that the kernel calls on the device.
// Returns 0 when the kernel completed without a CUDA error; on error the
// cudaCheckErrors macro terminates the process instead of returning.
int qsched_run_CUDA(int val, void *d_data, void *func)
{
  mykernel<<<1,1>>>(val, d_data, func);
  // Wait for the kernel so that launch/execution errors are visible to the
  // cudaGetLastError() call inside cudaCheckErrors.
  cudaDeviceSynchronize();
  cudaCheckErrors("kernel fail");
  return 0;
}

// main.cu -- application side of the example: defines the device callback,
// reads its device address back to the host and passes it to the library
// function qsched_run_CUDA.
// Intended to be compiled as relocatable device code (nvcc -dc) and linked
// against the static library that provides qsched_run_CUDA.
#include <stdio.h>

// Value the callback is expected to store into the device buffer.
#define DATA_VAL 5

// Provided by the library (kernel_lib.cu).
int qsched_run_CUDA(int, void*, void*);

// Query the most recent CUDA runtime error; if there is one, print `msg`, the
// CUDA error string and the source location to stderr, then exit(1).
#define cudaCheckErrors(msg) \
    do { \
        cudaError_t __err = cudaGetLastError(); \
        if (__err != cudaSuccess) { \
            fprintf(stderr, "Fatal error: %s (%s at %s:%d)\n", \
                msg, cudaGetErrorString(__err), \
                __FILE__, __LINE__); \
            fprintf(stderr, "*** FAILED - ABORTING\n"); \
            exit(1); \
        } \
    } while (0)

/** Type definition for the execution function in #qsched_run. */
typedef void (*qsched_funtype)( int , void * );

// Device callback: stores `type` into the first int that `data` points to.
__device__ void gpuTest(int type , void *data)
{
  ((int *)data)[0] = type;
}

// Device variable holding the address of gpuTest, so the host can fetch that
// address with cudaMemcpyFromSymbol.
__device__ qsched_funtype function = gpuTest;

// Runs the round trip and returns 0 on success, 1 if the value read back from
// the device differs from DATA_VAL. CUDA errors abort via cudaCheckErrors.
int main()
{
  // Fetch the device function pointer value into host memory.
  void *func;
  cudaMemcpyFromSymbol( &func , function , sizeof(qsched_funtype));
  cudaCheckErrors("Failed to copy function pointer from device");

  // One-int device buffer, zeroed so that a missed callback is detectable.
  int h_data = 0;
  int *d_data;
  cudaMalloc((void **)&d_data, sizeof(int));
  cudaCheckErrors("cudaMalloc fail");
  cudaMemset(d_data, 0, sizeof(int));
  cudaCheckErrors("cudaMemset fail");

  // Let the library kernel call the callback with (DATA_VAL, d_data).
  int return_val = qsched_run_CUDA(DATA_VAL, (void *)d_data, func);
  if (return_val != 0) printf("return code error\n");

  // Copy the result back and verify that the callback wrote DATA_VAL.
  cudaMemcpy(&h_data, d_data, sizeof(int), cudaMemcpyDeviceToHost);
  cudaCheckErrors("cudaMemcpy fail");
  if (h_data != DATA_VAL) {printf("Fail! %d\n", h_data); return 1;}
  printf("Success!\n");
  return 0;
}

$ nvcc -arch=sm_20 -dc kernel_lib.cu
$ nvcc -lib kernel_lib.o -o test.a
$ nvcc -arch=sm_20 -dc main.cu
$ nvcc -arch=sm_20 main.o test.a -o test
$ ./test
Success!
$