CUDA:cudaMemcpy仅在仿真模式下工作
标签:cuda, emulation, nvcc
我刚刚开始学习如何使用CUDA。我尝试运行一些简单的示例代码:
float *ah, *bh, *ad, *bd;
ah = (float *)malloc(sizeof(float)*4);
bh = (float *)malloc(sizeof(float)*4);
cudaMalloc((void **) &ad, sizeof(float)*4);
cudaMalloc((void **) &bd, sizeof(float)*4);
... initialize ah ...
/* copy array on device */
cudaMemcpy(ad,ah,sizeof(float)*N,cudaMemcpyHostToDevice);
cudaMemcpy(bd,ad,sizeof(float)*N,cudaMemcpyDeviceToDevice);
cudaMemcpy(bh,bd,sizeof(float)*N,cudaMemcpyDeviceToHost);
当我在仿真模式(nvcc -deviceemu)下运行时,它运行良好(并且确实复制了数组)。
但当我在常规模式下运行它时,它运行时没有报错,但从不复制数据。就好像cudaMemcpy这几行被忽略了一样。
我做错了什么?
多谢各位,
Jason
您应该检查错误,最好是在每个malloc和memcpy之后都检查,但在最后只检查一次也足够了(cudaGetErrorString(cudaGetLastError()))。
只是为了检查显而易见的情况:
- 您确实有一个支持CUDA的GPU,对吗?运行 deviceQuery SDK示例,检查设备是否正常工作,以及所有驱动程序是否已安装并正常工作。
- N(在memcpy中)等于4(在malloc中),对吗?
#include <cstdio>
/*
 * Prints a report of every CUDA device the runtime can see.
 *
 * For each device index in [0, count) the program queries cudaDeviceProp
 * and prints three sections to stdout: general information, memory
 * information, and multiprocessor/launch limits.
 *
 * Returns 0 on success. If cudaGetDeviceCount or cudaGetDeviceProperties
 * fails, the CUDA error string is written to stderr and 1 is returned,
 * instead of printing the contents of an unfilled structure.
 */
int main( void ) {
    int count = 0;

    /* Check the status: on failure `count` would otherwise be meaningless. */
    cudaError_t err = cudaGetDeviceCount( &count );
    if (err != cudaSuccess) {
        fprintf( stderr, "cudaGetDeviceCount failed: %s\n",
                 cudaGetErrorString( err ) );
        return 1;
    }

    for (int i = 0; i < count; i++) {
        cudaDeviceProp prop;

        err = cudaGetDeviceProperties( &prop, i );
        if (err != cudaSuccess) {
            fprintf( stderr, "cudaGetDeviceProperties(%d) failed: %s\n",
                     i, cudaGetErrorString( err ) );
            return 1;
        }

        printf( " --- General Information for device %d ---\n", i );
        printf( "Name: %s\n", prop.name );
        printf( "Compute capability: %d.%d\n", prop.major, prop.minor );
        printf( "Clock rate: %d\n", prop.clockRate );
        printf( "Device copy overlap: " );
        if (prop.deviceOverlap)
            printf( "Enabled\n" );
        else
            printf( "Disabled\n" );
        printf( "Kernel execution timeout : " );
        if (prop.kernelExecTimeoutEnabled)
            printf( "Enabled\n" );
        else
            printf( "Disabled\n" );

        /* The byte-count fields below are size_t, so they are printed with
           %zu; %ld mismatches the argument type where long is narrower. */
        printf( " --- Memory Information for device %d ---\n", i );
        printf( "Total global mem: %zu\n", prop.totalGlobalMem );
        printf( "Total constant Mem: %zu\n", prop.totalConstMem );
        printf( "Max mem pitch: %zu\n", prop.memPitch );
        printf( "Texture Alignment: %zu\n", prop.textureAlignment );

        printf( " --- MP Information for device %d ---\n", i );
        printf( "Multiprocessor count: %d\n",
                prop.multiProcessorCount );
        /* NOTE(review): the two labels below say "per mp", but the fields
           printed are sharedMemPerBlock and regsPerBlock. Labels are kept
           unchanged so the output text stays the same. */
        printf( "Shared mem per mp: %zu\n", prop.sharedMemPerBlock );
        printf( "Registers per mp: %d\n", prop.regsPerBlock );
        printf( "Threads in warp: %d\n", prop.warpSize );
        printf( "Max threads per block: %d\n",
                prop.maxThreadsPerBlock );
        printf( "Max thread dimensions: (%d, %d, %d)\n",
                prop.maxThreadsDim[0], prop.maxThreadsDim[1],
                prop.maxThreadsDim[2] );
        printf( "Max grid dimensions: (%d, %d, %d)\n",
                prop.maxGridSize[0], prop.maxGridSize[1],
                prop.maxGridSize[2] );
        printf( "\n" );
    }

    return 0;
}
#include <cstdio>
int main( void ) {
cudaDeviceProp prop;
int count;
cudaGetDeviceCount( &count );
for(int i=0;i
Oops。这似乎是cudaMalloc()的问题。它没有在设备上分配内存。为什么?
您初始化了设备吗?使用cudaGetLastError打印状态。
@aaa:使用运行时API(函数前缀为cuda而不是cu)意味着您不需要显式初始化设备,它会在第一次CUDA调用时连接到第一个兼容设备。
@tom 谢谢,我之前不确定这一点。