cuda错误4未指定的启动失败
以下是GPU内核片段:cuda错误4未指定的启动失败,cuda,Cuda,以下是GPU内核片段: __global_ void POCKernel(int *a) { int i = threadIdx.x; a[i] = a[i] + 1; if (i < 1024 * 1024) { double dblNewMemoryVarA[15]; double dblNewMemoryVarB[15]; double dblNewMemoryVarC[15]; //double* dblNewMemoryVarA = (dou
// POCKernel: proof-of-concept kernel that increments elements of `a` and then
// does throw-away arithmetic on three per-thread local arrays of 15 doubles.
//
// Parameters:
//   a - device pointer to an int buffer; must hold at least blockDim.x ints,
//       because only threadIdx.x is used as the index.
//
// NOTE(review): the index ignores blockIdx, so every block of the launch reads
// and writes the same elements a[0 .. blockDim.x-1]; the increments race
// across blocks and the rest of the buffer is never touched.
__global__ void POCKernel(int *a)
{
    // Index of this thread inside its block only (no blockIdx contribution).
    int i = threadIdx.x;
    a[i] = a[i] + 1;

    // NOTE(review): i is a per-block thread index, so with a 256-thread block
    // this condition is always true and does not guard anything.
    if (i < 1024 * 1024)
    {
        // Per-thread scratch arrays with exactly 15 elements each.
        double dblNewMemoryVarA[15];
        double dblNewMemoryVarB[15];
        double dblNewMemoryVarC[15];
        //double* dblNewMemoryVarA = (double*)malloc(15 * sizeof(double));
        ////memset(dblNewMemoryVarA, 0, 15 * sizeof(double));
        //double* dblNewMemoryVarB = (double*)malloc(15 * sizeof(double));
        ////memset(dblNewMemoryVarB, 0, 15 * sizeof(double));
        //double* dblNewMemoryVarC = (double*)malloc(15 * sizeof(double));
        ////memset(dblNewMemoryVarC, 0, 15 * sizeof(double));

        // Zero-initialise all 15 slots of each scratch array.
        for (int j = 0; j < 15; j++)
        {
            dblNewMemoryVarA[j] = 0;
            dblNewMemoryVarB[j] = 0;
            dblNewMemoryVarC[j] = 0;
        }

        // BUG (kept on purpose: this is the failing code under discussion):
        // the 15-element arrays are indexed with i = threadIdx.x, so every
        // thread with i >= 15 reads and writes out of bounds.
        dblNewMemoryVarC[i] = dblNewMemoryVarA[i] + dblNewMemoryVarB[i];
        dblNewMemoryVarC[i] = dblNewMemoryVarA[i] * dblNewMemoryVarB[i];
        dblNewMemoryVarC[i] = dblNewMemoryVarA[i] - dblNewMemoryVarB[i];
        /*free(dblNewMemoryVarA);
        free(dblNewMemoryVarB);
        free(dblNewMemoryVarC);*/
    }
}
__global__ void POCKernel(int *a)
{
int i = threadIdx.x;
a[i] = a[i] + 1;
if (i < 1024 * 1024)
{
double dblNewMemoryVarA[15];
double dblNewMemoryVarB[15];
double dblNewMemoryVarC[15];
//double* dblNewMemoryVarA = (double*)malloc(15 * sizeof(double));
////memset(dblNewMemoryVarA, 0, 15 * sizeof(double));
//double* dblNewMemoryVarB = (double*)malloc(15 * sizeof(double));
////memset(dblNewMemoryVarB, 0, 15 * sizeof(double));
//double* dblNewMemoryVarC = (double*)malloc(15 * sizeof(double));
////memset(dblNewMemoryVarC, 0, 15 * sizeof(double));
for (int j = 0; j < 15; j++)
{
dblNewMemoryVarA[j] = 0;
dblNewMemoryVarB[j] = 0;
dblNewMemoryVarC[j] = 0;
}
dblNewMemoryVarC[i] = dblNewMemoryVarA[i] + dblNewMemoryVarB[i];
dblNewMemoryVarC[i] = dblNewMemoryVarA[i] * dblNewMemoryVarB[i];
dblNewMemoryVarC[i] = dblNewMemoryVarA[i] - dblNewMemoryVarB[i];
/*free(dblNewMemoryVarA);
free(dblNewMemoryVarB);
free(dblNewMemoryVarC);*/
}
}
此内核的调用函数是:
// Host driver for POCKernel.
//
// Fills a host array of 1024*1024 ints with the value 5, copies it to the
// device, launches POCKernel with 4096 blocks of 256 threads, and waits for
// the kernel to finish. Every CUDA call is checked; on any failure a message
// is written to stderr and control jumps to the shared cleanup at `Error`.
//
// Returns 0 when every step succeeded, 1 otherwise.
int main()
{
    // All locals are declared before the first `goto` so that no jump to
    // `Error` crosses an initialisation.
    const int arraySize = 1024 * 1024;
    int exitCode = 0;
    int *a = new int[arraySize];
    int *dev_a = 0;
    cudaError_t cudaStatus;

    for (int i = 0; i < arraySize; i++)
    {
        a[i] = 5;
    }

    // Choose which GPU to run on, change this on a multi-GPU system.
    cudaStatus = cudaSetDevice(0);
    if (cudaStatus != cudaSuccess) {
        fprintf(stderr, "CUDA failed!");
        exitCode = 1;
        goto Error;
    }

    // Allocate the single device buffer used by the kernel.
    cudaStatus = cudaMalloc((void**)&dev_a, arraySize * sizeof(int));
    if (cudaStatus != cudaSuccess) {
        fprintf(stderr, "cudaMalloc failed!");
        exitCode = 1;
        goto Error;
    }

    // Copy the input array from host memory to the device buffer.
    cudaStatus = cudaMemcpy(dev_a, a, arraySize * sizeof(int), cudaMemcpyHostToDevice);
    if (cudaStatus != cudaSuccess) {
        fprintf(stderr, "cudaMemcpy failed!");
        exitCode = 1;
        goto Error;
    }

    // Launch the kernel: 4096 blocks x 256 threads = one thread per element.
    POCKernel<<<4096, 256>>>(dev_a);

    // Check for any errors launching the kernel.
    cudaStatus = cudaGetLastError();
    if (cudaStatus != cudaSuccess) {
        fprintf(stderr, "addKernel launch failed: %s\n", cudaGetErrorString(cudaStatus));
        exitCode = 1;
        goto Error;
    }

    // cudaDeviceSynchronize waits for the kernel to finish, and returns
    // any errors encountered while the kernel was executing.
    cudaStatus = cudaDeviceSynchronize();
    if (cudaStatus != cudaSuccess) {
        fprintf(stderr, "cudaDeviceSynchronize returned error code %d after launching addKernel!\n", cudaStatus);
        exitCode = 1;
        goto Error;
    }

Error:
    // Shared cleanup for both the success and the failure paths.
    // cudaFree on a null pointer is a no-op, so this is safe even when the
    // allocation never happened.
    cudaFree(dev_a);
    delete[] a;

    // cudaDeviceReset must be called before exiting in order for profiling and
    // tracing tools such as Nsight and Visual Profiler to show complete traces.
    // It runs after cudaFree because the reset destroys all device allocations.
    cudaStatus = cudaDeviceReset();
    if (cudaStatus != cudaSuccess) {
        fprintf(stderr, "cudaDeviceReset failed!");
        exitCode = 1;
    }

    return exitCode;
}
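int main()
{
const int arraySize=1024*1024;
int*a=new int[arraySize];
int*dev_a=0;
for(int i=0;i<arraySize;i++)
{
a[i]=5;
}
cudaError_t cudaStatus;
cudaStatus=cudaSetDevice(0);
if(cudaStatus!=cudaSuccess){
fprintf(stderr,"CUDA failed!");
return 1;
}
cudaStatus=cudaMalloc((void**)&dev_a,arraySize*sizeof(int));
if(cudaStatus!=cudaSuccess){
fprintf(stderr,"cudaMalloc failed!");
goto Error;
}
cudaStatus=cudaMemcpy(dev_a,a,arraySize*sizeof(int),cudaMemcpyHostToDevice);
if(cudaStatus!=cudaSuccess){
fprintf(stderr,"cudaMemcpy failed!");
goto Error;
}
POCKernel<<<4096,256>>>(dev_a);
//检查启动内核时是否有任何错误
cudaStatus=cudaGetLastError();
if(cudaStatus!=cudaSuccess){
fprintf(stderr,"addKernel launch failed: %s\n",cudaGetErrorString(cudaStatus));
goto Error;
}
//cudaDeviceSynchronize等待内核完成,然后返回
//在启动过程中遇到的任何错误。
cudaStatus=cudaDeviceSynchronize();
if(cudaStatus!=cudaSuccess){
fprintf(stderr,"cudaDeviceSynchronize returned error code %d after launching addKernel!\n",cudaStatus);
goto Error;
//必须在退出之前调用cudaDeviceReset,以便
//Nsight和Visual Profiler等分析与跟踪工具能够显示完整的跟踪。
cudaStatus=cudaDeviceReset();
if(cudaStatus!=cudaSuccess){
fprintf(stderr,"cudaDeviceReset failed!");
return 1;
}
Error:
cudaFree(dev_a);
return 0;
}
}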
在cudaDeviceSynchronize上,错误代码为4-未指定的启动失败。
有人能告诉我为什么我要面对这个问题吗?这段代码在很多方面都很奇怪,但让我们开门见山吧。以下几行内核代码中存在一个明确的问题:
int i = threadIdx.x;
...
if (i < 1024 * 1024)
{
double dblNewMemoryVarA[15];
double dblNewMemoryVarB[15];
double dblNewMemoryVarC[15];
...
dblNewMemoryVarC[i] = dblNewMemoryVarA[i] + dblNewMemoryVarB[i];
在局部变量中,已为15个量分配了空间:
double dblNewMemoryVarA[15];
但您随后尝试使用i
索引这些数组,如前所述,其范围将高达255:
dblNewMemoryVarC[i] = dblNewMemoryVarA[i] + dblNewMemoryVarB[i];
因此,这将生成越界索引,这很可能导致内核启动失败
因为您既没有提供完整的代码,也没有说明如何编译或在什么环境中运行,所以无法确定。但从代码正确性的角度来看,上述行为无疑是非法的
我猜您是在调试模式下编译的(使用了 -G 选项)。如果不是,我预计编译器会把 if 判断之后的所有代码都优化掉,因为这些代码都不会影响任何全局状态。
正如评论中所指出的,如果您是在 Windows 上运行,也可能只是触发了 Windows WDDM 超时(TDR)。这段代码在许多方面都很奇怪,但让我们直截了当。以下几行内核代码中存在一个明确的问题:
int i = threadIdx.x;
...
if (i < 1024 * 1024)
{
double dblNewMemoryVarA[15];
double dblNewMemoryVarB[15];
double dblNewMemoryVarC[15];
...
dblNewMemoryVarC[i] = dblNewMemoryVarA[i] + dblNewMemoryVarB[i];
在局部变量中,已为15个量分配了空间:
double dblNewMemoryVarA[15];
但您随后尝试使用i
索引这些数组,如前所述,其范围将高达255:
dblNewMemoryVarC[i] = dblNewMemoryVarA[i] + dblNewMemoryVarB[i];
因此,这将生成越界索引,这很可能导致内核启动失败
因为您既没有提供完整的代码,也没有说明如何编译或在什么环境中运行,所以无法确定。但从代码正确性的角度来看,上述行为无疑是非法的
我猜您是在调试模式下编译的(使用了 -G 选项)。如果不是,我预计编译器会把 if 判断之后的所有代码都优化掉,因为这些代码都不会影响任何全局状态。
正如评论中所指出的,如果您运行的是这一个windows,可能只是因为您正在运行windows WDDM超时。可能4096个块太多了。您的设备限制是什么?如果您在Windows上运行,您应该检查NSIGHT的WDDM TDR设置。这可能是因为你的内核失败了。可能4096块太多了。您的设备限制是什么?如果您在Windows上运行,您应该检查NSIGHT的WDDM TDR设置。你的内核可能因此而失败。谢谢罗伯特。我理解这个问题。我想了解CUDA内核中可以分配的最大内存量是多少。你能帮我解决这个问题吗?这似乎是另一个问题。如果您有新问题,我建议您发布新问题。如果在kernel
malloc
中使用动态分配,则限制堆大小。这是一篇感谢罗伯特的文章。我理解这个问题。我想知道是什么