Matrix 为什么我会在CUDA程序中将2个矩阵相乘时得到“未指明的发射失败”

Matrix 为什么我会在CUDA程序中将2个矩阵相乘时得到“未指明的发射失败”,matrix,cuda,nsight,Matrix,Cuda,Nsight,我是CUDA的新手。当我乘以1024x1024矩阵并启动内核时: multiplyKernel << <dim3(32,32, 1), dim3(32, 32, 1) >> >(dev_c, dev_a, dev_b, size); 通过修改代码,我认为错误就在这条语句中 result += a[row * size + ind] * b[col + size * ind]; 部分 b[col+size*ind] 如果我去掉它,我就不会得到内核启动错误(

我是CUDA的新手。当我乘以1024x1024矩阵并启动内核时:

multiplyKernel << <dim3(32,32, 1), dim3(32, 32, 1) >> >(dev_c, dev_a, dev_b, size);
通过修改代码,我认为错误就在这条语句中

result += a[row * size + ind] * b[col + size * ind];
部分

b[col+size*ind]
如果我去掉它,我就不会得到内核启动错误(很明显,答案是错误的)。我想不出怎么了。如有任何建议,将不胜感激。 我正在使用Visual Studio 2013。我正在使用调试器,但这无助于我找到错误

这似乎是一个类似的问题:

非常感谢,以下是代码:

cudaError_t multiplyWithCuda(int *c, const int *a, const int *b, unsigned int size); 
// Computes one element of c = a * b for size x size row-major int matrices.
// Expected launch: 2-D grid of 2-D blocks covering at least size x size threads
// (row from y dimension, col from x dimension); excess threads exit via the guard.
__global__ void multiplyKernel(int *c, const int *a, const int *b, unsigned int size)
{
    int row = blockIdx.y * blockDim.y + threadIdx.y;
    int col = blockIdx.x * blockDim.x + threadIdx.x;

    // Bounds guard. Valid indices are 0..size-1, so the test must be >=.
    // The original "row > size || col > size" let row == size / col == size
    // threads through, reading and writing out of bounds — one cause of an
    // "unspecified launch failure".
    if (row >= size || col >= size) return;

    // Flattened 1-D index of the target element in c.
    int z = row * size + col;

    // Dot product of row `row` of a with column `col` of b.
    int result = 0;
    for (int ind = 0; ind < size; ++ind) {
        result += a[row * size + ind] * b[col + size * ind];
    }
    c[z] = result;
}

int main(){

    // Host-side square matrices (sizeMatrix x sizeMatrix), row-major, heap-allocated.
    const int sizeMatrix = 2048;
    int* a = new int[sizeMatrix * sizeMatrix];
    int* b = new int[sizeMatrix * sizeMatrix];
    int* c = new int[sizeMatrix * sizeMatrix];

    // Fill the inputs with 0/1 so products stay small and easy to sanity-check.
    for (int i = 0; i < sizeMatrix * sizeMatrix; i++) {
        a[i] = rand() % 2;
        b[i] = rand() % 2;
    }

    // Track the exit code instead of returning early, so the host buffers
    // are always released (the original leaked a/b/c on every path).
    int rc = 0;

    cudaError_t cudaStatus = multiplyWithCuda(c, a, b, sizeMatrix);
    if (cudaStatus != cudaSuccess) {
        fprintf(stderr, "addWithCuda failed!");
        rc = 1;
    }

    if (rc == 0) {
        // Reset the device so profiling/trace tools see a clean exit.
        cudaStatus = cudaDeviceReset();
        if (cudaStatus != cudaSuccess) {
            fprintf(stderr, "cudaDeviceReset failed!");
            rc = 1;
        }
    }

    delete[] a;
    delete[] b;
    delete[] c;

    return rc;
}


// Host wrapper: copies a and b (size x size ints, row-major) to the device,
// launches multiplyKernel with one thread per output element, and copies the
// product back into c. Returns the first CUDA error encountered, or cudaSuccess.
cudaError_t multiplyWithCuda(int *c, const int *a, const int *b, unsigned int size)
{
    // Initialize to nullptr so the Error cleanup path can safely call
    // cudaFree() on buffers that were never allocated. The original left
    // these uninitialized, so an early `goto Error` freed garbage pointers
    // (undefined behavior); cudaFree(nullptr) is a harmless no-op.
    int *dev_a = nullptr;
    int *dev_b = nullptr;
    int *dev_c = nullptr;
    cudaError_t cudaStatus;

    // Byte count per matrix; size_t avoids 32-bit overflow of
    // size * size * sizeof(int) for large sizes.
    size_t bytes = (size_t)size * size * sizeof(int);

    // Launch configuration: 32x32 threads per block, ceil-div grid that
    // covers any size (the original hard-coded dim3(64, 64, 1), which only
    // matched size == 2048). Declared before the first goto so no jump
    // bypasses an initialization.
    const unsigned int tile = 32;
    dim3 block(tile, tile, 1);
    dim3 grid((size + tile - 1) / tile, (size + tile - 1) / tile, 1);

    // Choose which GPU to run on, change this on a multi-GPU system.
    cudaStatus = cudaSetDevice(0);
    fprintf(stdout, "device set");
    if (cudaStatus != cudaSuccess) {
        fprintf(stderr, "cudaSetDevice failed!  Do you have a CUDA-capable GPU installed?");
        goto Error;
    }

    // Allocate GPU buffers for the two inputs and the output.
    cudaStatus = cudaMalloc((void**)&dev_c, bytes);
    if (cudaStatus != cudaSuccess) {
        fprintf(stderr, "cudaMalloc failed!");
        goto Error;
    }
    fprintf(stdout, "buffer for c allocated \n");

    cudaStatus = cudaMalloc((void**)&dev_a, bytes);
    if (cudaStatus != cudaSuccess) {
        fprintf(stderr, "cudaMalloc failed!");
        goto Error;
    }
    fprintf(stdout, "buffer for a allocated \n");

    cudaStatus = cudaMalloc((void**)&dev_b, bytes);
    if (cudaStatus != cudaSuccess) {
        fprintf(stderr, "cudaMalloc failed!");
        goto Error;
    }
    fprintf(stdout, "buffer for b allocated \n");

    // Copy input matrices from host memory to GPU buffers.
    cudaStatus = cudaMemcpy(dev_a, a, bytes, cudaMemcpyHostToDevice);
    if (cudaStatus != cudaSuccess) {
        fprintf(stderr, "cudaMemcpy failed!");
        goto Error;
    }
    fprintf(stdout, "cudaMemcpy a done \n");

    cudaStatus = cudaMemcpy(dev_b, b, bytes, cudaMemcpyHostToDevice);
    if (cudaStatus != cudaSuccess) {
        fprintf(stderr, "cudaMemcpy failed!");
        goto Error;
    }
    fprintf(stdout, "cudaMemcpy b done\n");

    fprintf(stdout, "about to launch kernel \n");

    // Launch the kernel with one thread per output element.
    // NOTE(review): on Windows/WDDM a long-running kernel can hit the TDR
    // watchdog and abort with "unspecified launch failure"; raise the TDR
    // delay (Nsight Monitor) for slow debug builds.
    multiplyKernel << <grid, block >> >(dev_c, dev_a, dev_b, size);

    fprintf(stdout, "kernel launched\n");

    // Catch launch-configuration errors immediately.
    cudaStatus = cudaGetLastError();
    if (cudaStatus != cudaSuccess) {
        fprintf(stderr, "multiplyKernel launch failed: %s\n", cudaGetErrorString(cudaStatus));
        goto Error;
    }

    // cudaDeviceSynchronize waits for the kernel to finish, and returns
    // any errors encountered during execution.
    cudaStatus = cudaDeviceSynchronize();
    if (cudaStatus != cudaSuccess) {
        fprintf(stderr, "cudaDeviceSynchronize returned error code %d after launching multiplyKernel!\n", cudaStatus);
        fprintf(stderr, " %s\n", cudaGetErrorString(cudaStatus));
        goto Error;
    }

    // Copy the result from the GPU buffer back to host memory.
    cudaStatus = cudaMemcpy(c, dev_c, bytes, cudaMemcpyDeviceToHost);
    if (cudaStatus != cudaSuccess) {
        fprintf(stderr, "cudaMemcpy failed!");
        goto Error;
    }

Error:
    // Reached on both success and failure; frees are no-ops for null pointers.
    cudaFree(dev_c);
    cudaFree(dev_a);
    cudaFree(dev_b);

    return cudaStatus;
}
cudaError_t multiplyWithCuda(int *c, const int *a, const int *b, unsigned int size);
__global__ void multiplyKernel(int *c, const int *a, const int *b, unsigned int size)
{
    int row = blockIdx.y * blockDim.y + threadIdx.y;
    int col = blockIdx.x * blockDim.x + threadIdx.x;
    if (row > size || col > size) return;
    // target field in 1-D
    int z = row * size + col;
    int result = 0;
    for (int ind = 0; ind < size; ++ind) {
        result += a[row * size + ind] * b[col + size * ind];
    }
    c[z] = result;
}
/* ... allocation and host-to-device copies as above ... */
multiplyKernel << <dim3(64, 64, 1), dim3(32, 32, 1) >> >(dev_c, dev_a, dev_b, size);
fprintf(stdout, "kernel launched\n");
// Check for any errors launching the kernel
cudaStatus = cudaGetLastError();
if (cudaStatus != cudaSuccess) {
    fprintf(stderr, "addKernel launch failed: %s\n", cudaGetErrorString(cudaStatus));
    goto Error;
}
// cudaDeviceSynchronize waits for the kernel to finish, and returns
// any errors encountered during the launch.
cudaStatus = cudaDeviceSynchronize();
if (cudaStatus != cudaSuccess) {
    fprintf(stderr, "cudaDeviceSynchronize returned error code %d after launching addKernel!\n", cudaStatus);
    fprintf(stderr, " %s\n", cudaGetErrorString(cudaStatus));
    goto Error;
}
// Copy output vector from GPU buffer to host memory.
cudaStatus = cudaMemcpy(c, dev_c, size * size * sizeof(int), cudaMemcpyDeviceToHost);
if (cudaStatus != cudaSuccess) {
    fprintf(stderr, "cudaMemcpy failed!");
    goto Error;
}
Error:
    cudaFree(dev_c);
    cudaFree(dev_a);
    cudaFree(dev_b);
    return cudaStatus;
}

在Windows上,我右键单击了系统托盘中的NSight monitor图标。在那里我选择了选项>常规。我们看到WDDM TDR延迟。它是2,我把它增加到10。然后,我再次运行我的程序,它运行得很好。 这是根据罗伯特的链接(见上文)

那么,您是否尝试过类似问题的解决方案?您可能在windows上遇到WDDM。您的代码为我正确运行(即没有运行时错误)。如果您将其构建为调试项目(很可能,因为您是在调试中运行它),那么内核将花费更长的时间。是的,就是这样。我将nsight监视器中的WDDM TDR延迟更新为10秒,现在运行正常。非常感谢,我永远也找不到它。你为什么不提供一个答案,说明你做了什么。稍后你可以回来接受你自己的答案。这样,这个问题将更有可能被未来的读者保留和理解。谢谢,我已经做到了。对不起,我对堆栈交换很不在行。
cudaError_t multiplyWithCuda(int *c, const int *a, const int *b, unsigned int size); 
// Computes one element of c = a * b for size x size row-major int matrices.
// Expected launch: 2-D grid of 2-D blocks covering at least size x size threads
// (row from y dimension, col from x dimension); excess threads exit via the guard.
__global__ void multiplyKernel(int *c, const int *a, const int *b, unsigned int size)
{
    int row = blockIdx.y * blockDim.y + threadIdx.y;
    int col = blockIdx.x * blockDim.x + threadIdx.x;

    // Bounds guard. Valid indices are 0..size-1, so the test must be >=.
    // The original "row > size || col > size" let row == size / col == size
    // threads through, reading and writing out of bounds — one cause of an
    // "unspecified launch failure".
    if (row >= size || col >= size) return;

    // Flattened 1-D index of the target element in c.
    int z = row * size + col;

    // Dot product of row `row` of a with column `col` of b.
    int result = 0;
    for (int ind = 0; ind < size; ++ind) {
        result += a[row * size + ind] * b[col + size * ind];
    }
    c[z] = result;
}

int main(){

    // Host-side square matrices (sizeMatrix x sizeMatrix), row-major, heap-allocated.
    const int sizeMatrix = 2048;
    int* a = new int[sizeMatrix * sizeMatrix];
    int* b = new int[sizeMatrix * sizeMatrix];
    int* c = new int[sizeMatrix * sizeMatrix];

    // Fill the inputs with 0/1 so products stay small and easy to sanity-check.
    for (int i = 0; i < sizeMatrix * sizeMatrix; i++) {
        a[i] = rand() % 2;
        b[i] = rand() % 2;
    }

    // Track the exit code instead of returning early, so the host buffers
    // are always released (the original leaked a/b/c on every path).
    int rc = 0;

    cudaError_t cudaStatus = multiplyWithCuda(c, a, b, sizeMatrix);
    if (cudaStatus != cudaSuccess) {
        fprintf(stderr, "addWithCuda failed!");
        rc = 1;
    }

    if (rc == 0) {
        // Reset the device so profiling/trace tools see a clean exit.
        cudaStatus = cudaDeviceReset();
        if (cudaStatus != cudaSuccess) {
            fprintf(stderr, "cudaDeviceReset failed!");
            rc = 1;
        }
    }

    delete[] a;
    delete[] b;
    delete[] c;

    return rc;
}


// Host wrapper: copies a and b (size x size ints, row-major) to the device,
// launches multiplyKernel with one thread per output element, and copies the
// product back into c. Returns the first CUDA error encountered, or cudaSuccess.
cudaError_t multiplyWithCuda(int *c, const int *a, const int *b, unsigned int size)
{
    // Initialize to nullptr so the Error cleanup path can safely call
    // cudaFree() on buffers that were never allocated. The original left
    // these uninitialized, so an early `goto Error` freed garbage pointers
    // (undefined behavior); cudaFree(nullptr) is a harmless no-op.
    int *dev_a = nullptr;
    int *dev_b = nullptr;
    int *dev_c = nullptr;
    cudaError_t cudaStatus;

    // Byte count per matrix; size_t avoids 32-bit overflow of
    // size * size * sizeof(int) for large sizes.
    size_t bytes = (size_t)size * size * sizeof(int);

    // Launch configuration: 32x32 threads per block, ceil-div grid that
    // covers any size (the original hard-coded dim3(64, 64, 1), which only
    // matched size == 2048). Declared before the first goto so no jump
    // bypasses an initialization.
    const unsigned int tile = 32;
    dim3 block(tile, tile, 1);
    dim3 grid((size + tile - 1) / tile, (size + tile - 1) / tile, 1);

    // Choose which GPU to run on, change this on a multi-GPU system.
    cudaStatus = cudaSetDevice(0);
    fprintf(stdout, "device set");
    if (cudaStatus != cudaSuccess) {
        fprintf(stderr, "cudaSetDevice failed!  Do you have a CUDA-capable GPU installed?");
        goto Error;
    }

    // Allocate GPU buffers for the two inputs and the output.
    cudaStatus = cudaMalloc((void**)&dev_c, bytes);
    if (cudaStatus != cudaSuccess) {
        fprintf(stderr, "cudaMalloc failed!");
        goto Error;
    }
    fprintf(stdout, "buffer for c allocated \n");

    cudaStatus = cudaMalloc((void**)&dev_a, bytes);
    if (cudaStatus != cudaSuccess) {
        fprintf(stderr, "cudaMalloc failed!");
        goto Error;
    }
    fprintf(stdout, "buffer for a allocated \n");

    cudaStatus = cudaMalloc((void**)&dev_b, bytes);
    if (cudaStatus != cudaSuccess) {
        fprintf(stderr, "cudaMalloc failed!");
        goto Error;
    }
    fprintf(stdout, "buffer for b allocated \n");

    // Copy input matrices from host memory to GPU buffers.
    cudaStatus = cudaMemcpy(dev_a, a, bytes, cudaMemcpyHostToDevice);
    if (cudaStatus != cudaSuccess) {
        fprintf(stderr, "cudaMemcpy failed!");
        goto Error;
    }
    fprintf(stdout, "cudaMemcpy a done \n");

    cudaStatus = cudaMemcpy(dev_b, b, bytes, cudaMemcpyHostToDevice);
    if (cudaStatus != cudaSuccess) {
        fprintf(stderr, "cudaMemcpy failed!");
        goto Error;
    }
    fprintf(stdout, "cudaMemcpy b done\n");

    fprintf(stdout, "about to launch kernel \n");

    // Launch the kernel with one thread per output element.
    // NOTE(review): on Windows/WDDM a long-running kernel can hit the TDR
    // watchdog and abort with "unspecified launch failure"; raise the TDR
    // delay (Nsight Monitor) for slow debug builds.
    multiplyKernel << <grid, block >> >(dev_c, dev_a, dev_b, size);

    fprintf(stdout, "kernel launched\n");

    // Catch launch-configuration errors immediately.
    cudaStatus = cudaGetLastError();
    if (cudaStatus != cudaSuccess) {
        fprintf(stderr, "multiplyKernel launch failed: %s\n", cudaGetErrorString(cudaStatus));
        goto Error;
    }

    // cudaDeviceSynchronize waits for the kernel to finish, and returns
    // any errors encountered during execution.
    cudaStatus = cudaDeviceSynchronize();
    if (cudaStatus != cudaSuccess) {
        fprintf(stderr, "cudaDeviceSynchronize returned error code %d after launching multiplyKernel!\n", cudaStatus);
        fprintf(stderr, " %s\n", cudaGetErrorString(cudaStatus));
        goto Error;
    }

    // Copy the result from the GPU buffer back to host memory.
    cudaStatus = cudaMemcpy(c, dev_c, bytes, cudaMemcpyDeviceToHost);
    if (cudaStatus != cudaSuccess) {
        fprintf(stderr, "cudaMemcpy failed!");
        goto Error;
    }

Error:
    // Reached on both success and failure; frees are no-ops for null pointers.
    cudaFree(dev_c);
    cudaFree(dev_a);
    cudaFree(dev_b);

    return cudaStatus;
}