程序启动时调用cudaDeviceSynchronize()后出现未指定的启动失败。但使用逐步调试不会出现错误。库达
我花了几个小时与程序启动时调用cudaDeviceSynchronize()后出现未指定的启动失败。但使用逐步调试不会出现错误。库达,cuda,visual-studio-debugging,Cuda,Visual Studio Debugging,我花了几个小时与未指明的启动失败进行斗争。 为了了解共享内存是如何工作的,我为自己想出了一个小任务 任务是将数组[1,2,3,…,N]分成K组(N/K)个元素,并求每组的和。(数组的当前元素和上一个元素之间的差值等于1) 我计划在K个块之间划分的网格中使用N个线程。因此,每个threadblock都包含(N/K)个线程。因此,一个threadblock可以用来计算一个组的和。我还想动态分配共享内存 启动程序时,在调用cudaDeviceSynchronize()之后,我遇到了未指定的启动失败。但
未指明的启动失败进行斗争
。
为了了解共享内存是如何工作的,我为自己想出了一个小任务
任务是将数组[1,2,3,…,N]分成K组(N/K)个元素,并求每组的和。(数组的当前元素和上一个元素之间的差值等于1)
我计划在K个块之间划分的网格中使用N个线程。因此,每个threadblock都包含(N/K)个线程。因此,一个threadblock可以用来计算一个组的和。我还想动态分配共享内存
启动程序时,在调用cudaDeviceSynchronize()
之后,我遇到了未指定的启动失败。但当我尝试一步一步地调试时,一切都很好
我做错了什么?(Visual Studio 2012专业版,计算能力2.1)我非常感谢您的帮助
#include <stdio.h>
#include "cuda_runtime.h"
#include "device_launch_parameters.h"
#define CUDA_CALL(x) do { if((x) != cudaSuccess) { \
printf("Error at %s:%d\n",__FILE__,__LINE__); \
printf("%s\n",cudaGetErrorString(x)); \
system("pause"); \
return EXIT_FAILURE;}} while(0)
extern __shared__ double shrd[];
__global__ void kernel(double * a){
size_t threadID_block = blockDim.x * threadIdx.y + threadIdx.x;
size_t blockID_global = (gridDim.x * blockIdx.y + blockIdx.x );
size_t threadID_global = blockID_global * blockDim.x * blockDim.y + threadID_block;
double * temp = &shrd[blockID_global * blockDim.x * blockDim.y];
temp[threadID_block] = static_cast<double>(threadID_global);
__syncthreads();
if (threadID_block == 0){
a[blockID_global] = 0.0;
for (size_t index = 0; index < blockDim.x * blockDim.y; index++){
a[blockID_global] += temp[index];
}
}
}
int main(){
int devNum = 0;
CUDA_CALL(cudaGetDevice(&devNum));
CUDA_CALL(cudaSetDevice(devNum));
dim3 gridSize(2,2,1);
dim3 blockSize(4,4,1);
double * dev_a = NULL;
size_t length = gridSize.x * gridSize.y ;
size_t byteSize = length * sizeof(double);
CUDA_CALL(cudaMalloc(&dev_a,byteSize));
size_t shmem_perBlock = blockSize.x * blockSize.y * sizeof(double);
kernel <<< gridSize, blockSize, shmem_perBlock >>> (dev_a);
CUDA_CALL(cudaGetLastError());
CUDA_CALL(cudaDeviceSynchronize());
double * a = new double [length];
CUDA_CALL(cudaMemcpy(a,dev_a,byteSize,cudaMemcpyDeviceToHost));
for (size_t index = 0; index < length; index++){
printf("%.3f\n",a[index]);
}
printf("\n");
CUDA_CALL(cudaFree(dev_a));
CUDA_CALL(cudaDeviceReset());
delete[]a;
system("pause");
return 0;
}
#包括
#包括“cuda_runtime.h”
#包括“设备启动参数.h”
#定义CUDA_调用(x)do{if((x)!=cudaSuccess){\
printf(“在%s处出现错误:%d\n”,\uuuuu文件\uuuuu,\uuuu行\uuuuu)\
printf(“%s\n”,cudaGetErrorString(x))\
系统(“暂停”)\
返回EXIT_FAILURE;}}while(0)
外部共享双shrd[];
__全局无效内核(双*a){
大小\u t threadID\u block=blockDim.x*threadIdx.y+threadIdx.x;
大小\u t块ID\u全局=(gridDim.x*blockIdx.y+blockIdx.x);
size\u t threadID\u global=blockID\u global*blockDim.x*blockDim.y+threadID\u block;
double*temp=&shrd[blockID_global*blockDim.x*blockDim.y];
temp[threadID\u block]=静态线程转换(threadID\u全局);
__同步线程();
如果(线程ID_块==0){
a[blockID_global]=0.0;
对于(大小索引=0;索引(dev_a);
CUDA_调用(cudaGetLastError());
CUDA_调用(cudaDeviceSynchronize());
double*a=新的double[长度];
CUDA_调用(cudaMemcpy(a,dev_a,byteSize,cudaMemcpyDeviceToHost));
对于(大小索引=0;索引<长度;索引++){
printf(“%.3f\n”,一个[索引]);
}
printf(“\n”);
CUDA_CALL(cudaFree(dev_a));
CUDA_调用(cudadeviceset());
删除[]a;
系统(“暂停”);
返回0;
}
如果您使用开普勒或更高版本,请先阅读以下内容:
否则,如果您是开普勒之前的人,请阅读以下内容:
在CUDA编程方面,您缺少一些基础知识。我在下面给了你一个代码模板。这是为了澄清其中一些基本原则。不要期望这会被优化,因为我期望您对并行缩减进行编程。这将使您了解如何使用共享内存
祝你好运
#include <stdio.h>
#include "cuda_runtime.h"
#include "device_launch_parameters.h"
#define N 10000
#define K 100
#define CUDA_CALL(x) do { if((x) != cudaSuccess) { \
printf("Error at %s:%d\n",__FILE__,__LINE__); \
printf("%s\n",cudaGetErrorString(x)); \
system("pause"); \
return EXIT_FAILURE;}} while(0)
__global__ void kernel(double* a, double* results){
extern __shared__ double shared[];
size_t tid, tid_local, stride;
tid = blockDim.x*blockIdx.x+threadIdx.x; //thread id within all blocks
tid_local = threadIdx.x; //thread id within a block
stride = blockDim.x*gridDim.x; //total number of threads
double *start = &a[K*blockIdx.x]; //each block will get K of a block.
shared[tid_local]=start[tid_local]; //copy K elements into shared memory
__syncthreads();
//Perform Parallel reduction, you will have to implement this
//After parallel reduction, result should be in shared[0]
//for demonstration I made the code serial for each block on thread 0.
//This is for demonstration only.
double sum=0;
if(tid_local==0){
for(int i=0; i<K; i++){
sum+=shared[i];
}
a[blockIdx.x]=sum;
}
}
int main(){
int devNum = 0;
CUDA_CALL(cudaGetDevice(&devNum));
CUDA_CALL(cudaSetDevice(devNum));
double * dev_a = NULL;
double * dev_results=NULL;
CUDA_CALL(cudaMalloc(&dev_a, N*sizeof(double) ));
CUDA_CALL(cudaMalloc(&dev_results, (N/K)*sizeof(double)));
//copy dev_a onto GPU (this is the array you are summing).
dim3 block_size(K, 1, 1);
dim3 grid_size (N/K, 1, 1);
size_t shmem_perBlock = K * sizeof(double);
kernel <<< grid_size, block_size, shmem_perBlock >>> (dev_a, dev_results);
CUDA_CALL(cudaGetLastError());
CUDA_CALL(cudaDeviceSynchronize());
//copy dev_results back to CPU, this is your result.
CUDA_CALL(cudaFree(dev_a));
CUDA_CALL(cudaFree(dev_results));
system("pause");
return 0;
}
#包括
#包括“cuda_runtime.h”
#包括“设备启动参数.h”
#定义N 10000
#定义K100
#定义CUDA_调用(x)do{if((x)!=cudaSuccess){\
printf(“在%s处出现错误:%d\n”,\uuuuu文件\uuuuu,\uuuu行\uuuuu)\
printf(“%s\n”,cudaGetErrorString(x))\
系统(“暂停”)\
返回EXIT_FAILURE;}}while(0)
__全局无效内核(双*a,双*results){
外部共享双共享[];
大小tid、tid本地、步幅;
tid=blockDim.x*blockIdx.x+threadIdx.x;//所有块内的线程id
tid_local=threadIdx.x;//块内的线程id
stride=blockDim.x*gridDim.x;//线程总数
double*start=&a[K*blockIdx.x];//每个块将获得一个块的K。
共享的[tid_local]=开始[tid_local];//将K个元素复制到共享内存中
__同步线程();
//如果要执行并行缩减,则必须执行此操作
//并行归约后,结果应为共享[0]
//为了演示,我对线程0上的每个块进行了代码串行。
//这仅用于演示。
双和=0;
如果(tid_local==0){
for(int i=0;i(dev_a,dev_results);
CUDA_调用(cudaGetLastError());
CUDA_调用(cudaDeviceSynchronize());
//将开发结果复制回CPU,这是您的结果。
CUDA_CALL(cudaFree(dev_a));
CUDA_呼叫(cudaFree(开发结果));
系统(“暂停”);
返回0;
}
你试过cudaMemchk吗?cuda memcheck
报告了大量的越界共享内存写入。简单看一下代码就知道这是由于temp
@talonmies,@Park Young Bae,我试过cuda memcheck中的越界指针造成的。结果是48个错误,比如无效的共享内存写入大小为8
,2个错误Progam在CUDA API调用cudaDeviceSynchronize时出错30,在cudaModuleUnload调用cudaModuleUnload时出错719。我猜这是因为异步程序执行:内核启动,尽管外部共享\uuuuuuuu的内存是双shrd[]
未分配。而且我是CUDA的新手,所以我可能错了。有什么方法可以修复它吗?我的意思是让内核执行等待到extern\uuuuu shared\uuuuuu双shrd[]的内存
is allocated?@a.yuzhanin:代码的问题主要是对共享内存的越界内存访问。我强烈建议首先解决这个问题。