群集上的矢量加法Cuda程序存在许多错误

群集上的矢量加法Cuda程序存在许多错误,cuda,nvidia,Cuda,Nvidia,我试图在特斯拉K20服务器上运行一个Cuda矢量加法程序,我得到了很多错误。我正在提交代码 #include <stdio.h> #include <stdlib.h> #include "cuda_utils.h" #include "timer.h" /* * **CUDA KERNEL** * * Compute the sum of two vectors * C[i] = A[i] + B[i] * */ __global__ void vecAdd

我试图在特斯拉K20服务器上运行一个Cuda矢量加法程序,我得到了很多错误。我正在提交代码

#include <math.h>
#include <stdio.h>
#include <stdlib.h>
#include "cuda_utils.h"
#include "timer.h"
/*
* **CUDA KERNEL** 
* 
* Compute the sum of two vectors 
*   C[i] = A[i] + B[i]
* 
*/
__global__ void vecAdd(float* a, float* b, float* c) {
  /*
   * One thread per element: each thread derives a flat 1D index from its
   * block and thread coordinates and writes a single output value.
   * There is no bounds check, so the launch must supply exactly as many
   * threads as there are elements in a, b and c.
   */
  const int gid = blockDim.x * blockIdx.x + threadIdx.x;

  /* Load both operands, then store their sum. */
  const float lhs = a[gid];
  const float rhs = b[gid];
  c[gid] = lhs + rhs;
}

 void compute_vec_add(int N, float *a, float* b, float *c);

/*
* 
* Host code to drive the CUDA Kernel
* 
*/
int main() {
  /*
   * Host driver: fills two random vectors, adds them on the GPU with vecAdd,
   * repeats the addition on the CPU with compute_vec_add, and reports the
   * number of mismatching elements together with the time of each phase.
   *
   * Returns EXIT_FAILURE if host memory cannot be allocated, 0 otherwise.
   */

  /* Number of elements per vector; must stay a multiple of the block size
     (256) because the kernel has no bounds check. */
  const int N = 1024 * 1024 * 512;
  /* Size of one vector in bytes, computed in size_t (2 GiB for this N). */
  const size_t bytes = sizeof(float) * (size_t) N;

  float *d_a = NULL, *d_b = NULL, *d_c = NULL;
  float *h_a = NULL, *h_b = NULL, *h_c = NULL, *h_temp = NULL;

  struct stopwatch_t* timer = NULL;
  long double t_pcie_htd, t_pcie_dth, t_kernel, t_cpu;

  /* Setup timers */
  stopwatch_init();
  timer = stopwatch_create();

  /*
   Create the host vectors (inputs, GPU result, CPU reference result).
   Each one is large, so allocation failure is a realistic outcome.
  */
  h_a = (float *) malloc(bytes);
  h_b = (float *) malloc(bytes);
  h_c = (float *) malloc(bytes);
  h_temp = (float *) malloc(bytes);
  if (h_a == NULL || h_b == NULL || h_c == NULL || h_temp == NULL) {
    fprintf(stderr, "Failed to allocate %lu bytes of host memory per vector\n",
            (unsigned long) bytes);
    /* free(NULL) is a no-op, so every pointer can be released blindly. */
    free(h_a);
    free(h_b);
    free(h_c);
    free(h_temp);
    stopwatch_destroy(timer);
    return EXIT_FAILURE;
  }

  /*
   Set the initial values of h_a, h_b, and h_c
  */
  for (int i = 0; i < N; i++) {
    h_a[i] = (float) (rand() % 100) / 10.0;
    h_b[i] = (float) (rand() % 100) / 10.0;
    h_c[i] = 0.0f;
  }

  /*
   Allocate space on the GPU.
   NOTE(review): three device buffers of `bytes` each are required at once;
   confirm the target GPU has that much free memory.
   NOTE(review): CUDA_CHECK_ERROR comes from cuda_utils.h, which is not
   visible here; it is presumed to report and abort on a non-success code.
  */
  CUDA_CHECK_ERROR(cudaMalloc(&d_a, bytes));
  CUDA_CHECK_ERROR(cudaMalloc(&d_b, bytes));
  CUDA_CHECK_ERROR(cudaMalloc(&d_c, bytes));

  /*
   Copy h_a and h_b from CPU to GPU
  */
  stopwatch_start(timer);
  CUDA_CHECK_ERROR(cudaMemcpy(d_a, h_a, bytes, cudaMemcpyHostToDevice));
  CUDA_CHECK_ERROR(cudaMemcpy(d_b, h_b, bytes, cudaMemcpyHostToDevice));
  t_pcie_htd = stopwatch_stop(timer);
  fprintf(stderr, "Time to transfer data from host to device: %Lg secs\n",t_pcie_htd);

  /*
   Run N/256 blocks of 256 threads each
  */
  dim3 GS(N / 256, 1, 1);
  dim3 BS(256, 1, 1);

  stopwatch_start(timer);
  vecAdd<<<GS, BS>>>(d_a, d_b, d_c);
  /* A launch returns no status itself: query it explicitly so that an
     invalid configuration (e.g. a grid that is too large) is not missed. */
  CUDA_CHECK_ERROR(cudaGetLastError());
  /* Wait for the kernel to finish so the timer covers its execution and any
     fault raised while it ran is reported here. */
  CUDA_CHECK_ERROR(cudaDeviceSynchronize());
  t_kernel = stopwatch_stop(timer);
  fprintf(stderr, "Time to execute GPU kernel: %Lg secs\n", t_kernel);

  /*
   Copy d_c from GPU to CPU
  */
  stopwatch_start(timer);
  CUDA_CHECK_ERROR(cudaMemcpy(h_c, d_c, bytes, cudaMemcpyDeviceToHost));
  t_pcie_dth = stopwatch_stop(timer);
  fprintf(stderr, "Time to transfer data from device to host: %Lg secs\n",t_pcie_dth);

  /*
   Double check errors against a CPU reference computation
  */
  stopwatch_start(timer);
  compute_vec_add(N, h_a, h_b, h_temp);
  t_cpu = stopwatch_stop(timer);
  fprintf(stderr, "Time to execute CPU program: %Lg secs\n", t_cpu);

  int cnt = 0;
  for (int i = 0; i < N; i++) {
    /* fabsf keeps the comparison in floating point; integer abs() would
       truncate small differences to zero and hide real mismatches. */
    if (fabsf(h_temp[i] - h_c[i]) > 1e-5)
      cnt++;
  }
  fprintf(stderr, "number of errors: %d out of %d\n", cnt, N);

  /*
   Free the device memory
  */
  CUDA_CHECK_ERROR(cudaFree(d_a));
  CUDA_CHECK_ERROR(cudaFree(d_b));
  CUDA_CHECK_ERROR(cudaFree(d_c));

  /*
   Free the host memory (including the CPU reference buffer)
  */
  free(h_a);
  free(h_b);
  free(h_c);
  free(h_temp);

  /*
   Free timer
  */
  stopwatch_destroy(timer);

  if (cnt == 0) {
    printf("\n\nSuccess\n");
  }
  return 0;
}
然后我得到的错误是:

/tmp/tmpxft_000014db_00000000-17_vecAdd.o: In function `main':
tmpxft_000014db_00000000-4_vecAdd.cudafe1.cpp:(.text+0x7e): undefined reference to `stopwatch_init()'
tmpxft_000014db_00000000-4_vecAdd.cudafe1.cpp:(.text+0x83): undefined reference to `stopwatch_create()'
tmpxft_000014db_00000000-4_vecAdd.cudafe1.cpp:(.text+0x278): undefined reference to `stopwatch_start(stopwatch_t*)'
tmpxft_000014db_00000000-4_vecAdd.cudafe1.cpp:(.text+0x2ff): undefined reference to `stopwatch_stop(stopwatch_t*)'
tmpxft_000014db_00000000-4_vecAdd.cudafe1.cpp:(.text+0x380): undefined reference to `stopwatch_start(stopwatch_t*)'
tmpxft_000014db_00000000-4_vecAdd.cudafe1.cpp:(.text+0x3dc): undefined reference to `stopwatch_stop(stopwatch_t*)'
tmpxft_000014db_00000000-4_vecAdd.cudafe1.cpp:(.text+0x416): undefined reference to `stopwatch_start(stopwatch_t*)'
tmpxft_000014db_00000000-4_vecAdd.cudafe1.cpp:(.text+0x45e): undefined reference to `stopwatch_stop(stopwatch_t*)'
tmpxft_000014db_00000000-4_vecAdd.cudafe1.cpp:(.text+0x4b0): undefined reference to `stopwatch_start(stopwatch_t*)'
tmpxft_000014db_00000000-4_vecAdd.cudafe1.cpp:(.text+0x4de): undefined reference to `stopwatch_stop(stopwatch_t*)'
tmpxft_000014db_00000000-4_vecAdd.cudafe1.cpp:(.text+0x61f): undefined reference to `stopwatch_destroy(stopwatch_t*)'
collect2: error: ld returned 1 exit status
有人能解释一下为什么会出现这些错误吗。而且我是Cuda编程的初学者。
我猜它与链接有关系。

`nvcc` 会把 `.cu` 文件中的代码当作 C++ 代码来编译,这会导致符号名称对不上。解决方案是在 `vecAdd.cu` 中用 `extern "C" { }` 把 `#include "timer.h"` 括起来。

问题在于:如果 .cu 文件引用了另一个 .c 文件(比如 function.c)中的 C 函数,这些函数会被当作 C++ 函数处理,编译器会为它们生成经过名称修饰的特殊符号名。之后在编译 function.c 时,这些函数又会以普通(未修饰的)符号名编译。到了链接阶段,由于 .cu 文件中引用的符号名与编译得到的 function.o 中的符号名不匹配,就会出现未解析的引用。因此,您需要在包含外部 C 函数声明的头文件周围使用 extern "C" { } 语法。

(引自)


验证

用 `nvcc -c vecAdd.cu` 编译问题中提供的 `vecAdd.cu`,
并使用 `nm vecAdd.o` 列出符号,打印以下行:

...
00000000000007cf t _Z10cudaLaunchIcE9cudaErrorPT_
00000000000007aa t _Z10cudaMallocIfE9cudaErrorPPT_m
                 U _Z14stopwatch_initv
                 U _Z14stopwatch_stopP11stopwatch_t
0000000000000016 T _Z15compute_vec_addiPfS_S_
                 U _Z15stopwatch_startP11stopwatch_t
                 U _Z16stopwatch_createv
                 U _Z17stopwatch_destroyP11stopwatch_t
0000000000000672 T _Z29__device_stub__Z6vecAddPfS_S_PfS_S_
0000000000000703 T _Z6vecAddPfS_S_
...
您可以看到,`stopwatch_init` 变成了 `_Z14stopwatch_initv`,依此类推。

由于问题中没有给出 `timer.c` 和 `timer.h` 的内容,所以我为它们编写了一个最小实现:

// timer.h
// Minimal stand-in declarations for the stopwatch API used by vecAdd.cu.
// Nothing here is wrapped in extern "C", so a C++ compiler that includes this
// header will reference C++-mangled names for these functions.

// Stopwatch handle; this stand-in carries a single double field.
struct stopwatch_t { double t; };
// Takes no arguments and returns nothing.
void stopwatch_init();
// Returns a pointer to a stopwatch handle.
struct stopwatch_t *stopwatch_create();
// Takes the stopwatch handle; returns nothing.
void stopwatch_start(struct stopwatch_t *timer);
// Takes the stopwatch handle and returns a long double
// (the calling code prints this value as seconds).
long double stopwatch_stop(struct stopwatch_t *timer);
// Takes the stopwatch handle; returns nothing.
void stopwatch_destroy(struct stopwatch_t *timer);

// timer.c
// Placeholder implementations of the stopwatch API: every function is a
// no-op, which is enough to produce an object file whose symbols can be
// inspected and linked against.

/* Nothing to initialise in the placeholder. */
void stopwatch_init() {
}

/* The placeholder never allocates a stopwatch; it hands back a null pointer. */
struct stopwatch_t *stopwatch_create() {
  struct stopwatch_t *none = 0;
  return none;
}

/* Ignores its argument. */
void stopwatch_start(struct stopwatch_t *timer) {
  (void) timer;
}

/* Ignores its argument and always reports zero. */
long double stopwatch_stop(struct stopwatch_t *timer) {
  (void) timer;
  return 0.0L;
}

/* Ignores its argument; there is nothing to release. */
void stopwatch_destroy(struct stopwatch_t *timer) {
  (void) timer;
}
使用上述代码,依次执行 `nvcc -c timer.c` 和 `nm timer.o`,生成:

0000000000000007 T stopwatch_create
0000000000000029 T stopwatch_destroy
0000000000000000 T stopwatch_init
0000000000000012 T stopwatch_start
000000000000001d T stopwatch_stop
...
                 U __stack_chk_fail
                 U stderr
                 U stopwatch_create
                 U stopwatch_destroy
                 U stopwatch_init
                 U stopwatch_start
                 U stopwatch_stop
00000000000007cf t _Z10cudaLaunchIcE9cudaErrorPT_
00000000000007aa t _Z10cudaMallocIfE9cudaErrorPPT_m
...
您可以看到,`timer.c` 中的函数会出现符号名称冲突。

将 `vecAdd.cu` 修改为:

#include <stdio.h>
#include <stdlib.h>
#include "cuda_utils.h"
extern "C" {
    #include "timer.h"
}
...
您可以看到 C 函数的符号名称没有变化。在本例中,问题中的编译命令:
nvcc vecAdd.cu timer.c -o vecAdd
就可以正常工作了。


编辑

如 OP 在评论中所提到的,先执行 `g++ -c timer.c`,再执行 `nvcc vecAdd.cu timer.o -o vecAdd` 也可以,因为 `g++` 默认会把 `.c` 文件当作 C++ 代码来编译。

执行 `g++ -c timer.c` 之后,`nm timer.o` 打印:

0000000000000000 T _Z14stopwatch_initv
000000000000001d T _Z14stopwatch_stopP11stopwatch_t
0000000000000012 T _Z15stopwatch_startP11stopwatch_t
0000000000000007 T _Z16stopwatch_createv
0000000000000029 T _Z17stopwatch_destroyP11stopwatch_t

您的编译器不知道什么是
stopwatch_init()
。它是在哪里定义的?它是在timer.c中定义的,我正在使用它进行编译。好吧,这可能有点问题,因为编译器明确地说“未定义”编译命令不应该不同,我的意思是
vecAdd.cu
timer.c
之后?CUDA不是c!非常感谢。我得到了它。timer.c的编译有问题,我使用了gcc而不是g++。它解决了这个问题。
0000000000000000 T _Z14stopwatch_initv
000000000000001d T _Z14stopwatch_stopP11stopwatch_t
0000000000000012 T _Z15stopwatch_startP11stopwatch_t
0000000000000007 T _Z16stopwatch_createv
0000000000000029 T _Z17stopwatch_destroyP11stopwatch_t