使用cuda driverAPI函数执行向量添加代码时失败
我在 GPU 设备上运行程序,驱动程序版本为 5.0,使用 CUDA 5.0。当我完全使用驱动程序 API(driver API)函数执行 vecAdd 时,遇到了错误。我正在处理的代码贴在下面。有人能帮我解决这些错误吗?(标签:cuda)
#include <math.h>
#include <stdio.h>
#include <stdlib.h>
#include </opt/apps/cuda/5.0/include/cuda.h>
#include </opt/apps/cuda/5.0/include/cuda_runtime_api.h>
#include "timer.h"
#include "vecAdd-kernel.ptx.h"
void compute_vec_add(int N, float *a, float* b, float *c);
int main(int argc,char** argv) {
static CUcontext ctx;
CUdevice dev;
CUdeviceptr d_a, d_b, d_c;
float *h_a, *h_b, *h_c, *h_temp;
int i;
int N = 1024 * 1024*8;
struct stopwatch_t* timer = NULL;
long double t_pcie_htd, t_pcie_dth, t_kernel, t_cpu;
/* Setup timers */
stopwatch_init ();
timer = stopwatch_create ();
/*
Create the vectors
*/
h_a = (float *) malloc(sizeof(float) * N);
h_b = (float *) malloc(sizeof(float) * N);
h_c = (float *) malloc(sizeof(float) * N);
/*
Set the initial values of h_a, h_b, and h_c
*/
for (i=0; i < N; i++) {
h_a[i] = (float) (rand() % 100) / 10.0;
h_b[i] = (float) (rand() % 100) / 10.0;
h_c[i] = (float) 0.0;
}
stopwatch_start (timer);
CUmodule mod;
CUfunction vecAddFunc;
cuInit(0);
cuDeviceGet(&dev, 0);
cuCtxCreate(&ctx, 0, dev);
cuModuleLoadData(&mod, (char *) imageBytes);
cuModuleGetFunction(&vecAddFunc, mod, "vecAdd");
size_t offset = 0;
cuMemAlloc(&d_a,sizeof(float) * N);
cuMemAlloc(&d_b,sizeof(float) * N);
cuMemAlloc(&d_c,sizeof(float) * N);
cuMemcpyHtoD(d_a,h_a,sizeof(float) * N);
cuMemcpyHtoD(d_b,h_b,sizeof(float) * N);
cuMemcpyHtoD(d_c,h_c,sizeof(float) * N);
cuParamSetv(vecAddFunc, offset, &d_a, sizeof(d_a));
offset += sizeof(d_a);
cuParamSetv(vecAddFunc, offset, &d_b, sizeof(d_b));
offset += sizeof(d_b);
cuParamSetv(vecAddFunc, offset, &d_c, sizeof(d_c));
offset += sizeof(d_c);
cuParamSetSize(vecAddFunc, offset);
cuFuncSetBlockShape (vecAddFunc, 256, 1, 1);
cuLaunchGrid(vecAddFunc, N/256, 1);
cuStreamSynchronize(0);
cuMemcpyDtoH(h_c,d_c,sizeof(float) * N);
t_kernel = stopwatch_stop (timer);
fprintf (stderr, "Time to execute GPU kernel: %Lg secs\n",t_kernel);
/*
Double check errors
*/
h_temp = (float *) malloc(sizeof(float) * N);
stopwatch_start (timer);
compute_vec_add (N, h_a, h_b, h_temp);
t_cpu = stopwatch_stop (timer);
fprintf (stderr, "Time to execute CPU program: %Lg secs\n",
t_cpu);
int cnt = 0;
for(int i = 0; i < N; i++) {
if(abs(h_temp[i] - h_c[i]) > 1e-5) cnt++;
}
fprintf(stderr, "number of errors: %d out of %d\n", cnt, N);
/*
Free the host memory
*/
free(h_a);
free(h_b);
free(h_c);
cuMemFree(d_a);
cuMemFree(d_b);
cuMemFree(d_c);
cuCtxDestroy(ctx);
/*
Free timer
*/
stopwatch_destroy (timer);
if(cnt == 0) {
printf("\n\nSuccess\n");
}
}
/*
 * CPU reference for the vector addition: stores a[k] + b[k] into c[k] for
 * every k in [0, N), in increasing order of k.  Does nothing when N <= 0.
 */
void
compute_vec_add(int N, float *a, float* b, float *c) {
  const float *lhs = a;
  const float *rhs = b;
  float *out = c;
  int remaining;

  /* Walk the three arrays in lock-step, one element per iteration. */
  for (remaining = N; remaining > 0; --remaining) {
    *out = *lhs + *rhs;
    ++lhs;
    ++rhs;
    ++out;
  }
}
我使用的编译命令是
# Header and library search paths for the CUDA toolkit.  The last line must
# not end in a backslash, otherwise the following rule line is absorbed into
# the variable.
LDFLAGS = -I/usr/local/cuda/include \
          -L/usr/local/cuda/lib64

# Host program.  timer.c is compiled directly by nvcc in the recipe, so it is
# listed as the prerequisite.  Recipe lines must start with a TAB.
vecAdd-dummy: vecAdd-dummy.cu timer.c vecAdd-kernel.ptx.h
	nvcc -o vecAdd-dummy -arch=sm_20 vecAdd-dummy.cu timer.c ${LDFLAGS} -lcuda -g -G

# Compile the kernel to PTX for the sm_20 target.
vecAdd-kernel.ptx : vecAdd-kernel.cu
	nvcc -arch=sm_20 -ptx $^ -o $@

# Embed the PTX as a char array in a header for cuModuleLoadData.
vecAdd-kernel.ptx.h : vecAdd-kernel.ptx
	bin2c -t "char" $^ > $@
我使用的 GPU 设备是 Tesla M2090。检测到 CUDA API 错误:cuModuleLoadData 返回 0xd1(即 209)。其余错误都是模块加载失败导致的。
cuParamSet* 和 cuFuncSetBlockShape 函数自 CUDA 4.0 起已被弃用。修复模块加载问题后,我建议您查看 cuLaunchKernel 函数。我了解问题出在 cuModuleLoadData。编译时我使用了 -arch=sm_20(所有编译命令都已更新到问题中),但仍然遇到这个问题。我是否遗漏了其他设备配置信息?如何解决错误 CUDA_ERROR_NO_BINARY_FOR_GPU = 209?我怀疑您的错误与 imageBytes 有关。您是否尝试过以 cubin 形式加载模块,或者(不经过 bin2c)直接把 PTX 作为字符串传入?我建议您将问题精简为最小可复现示例(删除所有无关内容),并说明您使用的 bin2c 版本,或给出 imageBytes 的实际定义。
# Header and library search paths for the CUDA toolkit.  The last line must
# not end in a backslash, otherwise the following rule line is absorbed into
# the variable.
LDFLAGS = -I/usr/local/cuda/include \
          -L/usr/local/cuda/lib64

# Host program.  timer.c is compiled directly by nvcc in the recipe, so it is
# listed as the prerequisite.  Recipe lines must start with a TAB.
vecAdd-dummy: vecAdd-dummy.cu timer.c vecAdd-kernel.ptx.h
	nvcc -o vecAdd-dummy -arch=sm_20 vecAdd-dummy.cu timer.c ${LDFLAGS} -lcuda -g -G

# Compile the kernel to PTX for the sm_20 target.
vecAdd-kernel.ptx : vecAdd-kernel.cu
	nvcc -arch=sm_20 -ptx $^ -o $@

# Embed the PTX as a char array in a header for cuModuleLoadData.
vecAdd-kernel.ptx.h : vecAdd-kernel.ptx
	bin2c -t "char" $^ > $@
/**
* This indicates that there is no kernel image available that is suitable
* for the device. This can occur when a user specifies code generation
* options for a particular CUDA source file that do not include the
* corresponding device configuration.
*/
CUDA_ERROR_NO_BINARY_FOR_GPU = 209,