Debugging CUDA内存检查器的粒度是多少?

标签：debugging、cuda。问题摘要：我有一个内核，可以在调试版本中工作，但在发布版本中失败。我怀疑我正在读或写越界，但 CUDA 内存检查器没有显示任何错误。因此我做了一个测试，使用下面的内核进行越界写入和读取（完整代码见下文）。

我有一个内核,可以在调试版本中工作,但在发布版本中失败。我怀疑我正在读或写越界,但CUDA内存检查器没有显示任何错误。因此,我做了一个测试,在测试中,我使用以下内核进行了越界写入和读取:

// Deliberately buggy test kernel used to probe cuda-memcheck's granularity:
// x has room for exactly ONE int, but both loops index x[0..99], so every
// iteration with i >= 1 writes/reads past the end of the local array.
// NOTE: parameters a and b are unused in this repro.
__global__ void addKernel(int *c, const int *a, const int *b)
{
  int x[1];  // only x[0] is valid storage
  for (int i(0); i < 100; ++i) {
    x[i] = i;  // intentional out-of-bounds write for i >= 1
  }
  int t(0);
  for (int i(0); i < 100; ++i) {
    t += x[i];  // intentional out-of-bounds read for i >= 1
  }
  c[0] = t;  // sum is written out so the computation is observable
}
__global__ void addKernel(int *c, const int *a, const int *b)
{
  int x[1];
  for (int i(0); i < 100; ++i) {
    x[i] = i;
  }
  int t(0);
  for (int i(0); i < 100; ++i) {
    t += x[i];
  }
  c[0] = t;
}
CUDA 内存检查器未捕获这些越界写入和读取。把循环次数增加到 1000000 之后，我才检测到越界写入：内核在 i=502586（越界约 2MB）处停住。


这是 CUDA 内存检查器工作的预期粒度吗？我能做些什么来让它检测小的越界写入（大约几个字节）？

我想在这个示例中,您只是被优化绊倒了,正如已经建议的那样

下面是我的测试用例:

$ cat t1130.cu
#include <stdio.h>

// Same deliberately out-of-bounds kernel, plus an optional printf.
// Without -DFORCE the compiler can fold the loops away (the sum is a
// compile-time constant), so cuda-memcheck reports nothing — see the
// transcript below. With -DFORCE, the printf's side effect forces the
// compiler to keep the real loop, and the out-of-bounds __local__ write
// is then detected and reported.
__global__ void addKernel(int *c)
{
  int x[1];  // only x[0] is valid storage
  for (int i(0); i < 100; ++i) {
    x[i] = i;  // intentional out-of-bounds write for i >= 1
#ifdef FORCE
  printf("%d ", i);  // side effect that prevents loop elimination
#endif
  }
  int t(0);
  for (int i(0); i < 100; ++i) {
    t += x[i];  // intentional out-of-bounds read for i >= 1
  }
  c[0] = t;
}

// Minimal host driver for the repro: allocate one int on the device,
// launch a single thread, then synchronize so any kernel fault surfaces
// before the process exits. NOTE(review): the return codes of cudaMalloc,
// the launch, and cudaDeviceSynchronize are deliberately unchecked —
// this is a cuda-memcheck repro, not production code.
int main(){

  int *d_c;
  cudaMalloc(&d_c, sizeof(int));
  addKernel<<<1,1>>>(d_c);
  cudaDeviceSynchronize();
}
$ nvcc -o t1130 t1130.cu
$ cuda-memcheck ./t1130
========= CUDA-MEMCHECK
========= ERROR SUMMARY: 0 errors
$ nvcc -DFORCE -o t1130 t1130.cu
$ cuda-memcheck ./t1130
========= CUDA-MEMCHECK
========= Invalid __local__ write of size 4
=========     at 0x00000168 in addKernel(int*)
=========     by thread (0,0,0) in block (0,0,0)
=========     Address 0x00fffd10 is out of bounds
=========     Saved host backtrace up to driver entry point at kernel launch time
=========     Host Frame:/lib64/libcuda.so.1 (cuLaunchKernel + 0x2cd) [0x15865d]
=========     Host Frame:./t1130 [0x16ca1]
=========     Host Frame:./t1130 [0x314b3]
=========     Host Frame:./t1130 [0x27a1]
=========     Host Frame:./t1130 [0x269c]
=========     Host Frame:./t1130 [0x26b6]
=========     Host Frame:./t1130 [0x2600]
=========     Host Frame:/lib64/libc.so.6 (__libc_start_main + 0xf5) [0x21d65]
=========     Host Frame:./t1130 [0x2489]
=========
0 1 2 3 ========= Program hit cudaErrorLaunchFailure (error 4) due to "unspecified launch failure" on CUDA API call to cudaDeviceSynchronize.
=========     Saved host backtrace up to driver entry point at error
=========     Host Frame:/lib64/libcuda.so.1 [0x2f31b3]
=========     Host Frame:./t1130 [0x354a6]
=========     Host Frame:./t1130 [0x2605]
=========     Host Frame:/lib64/libc.so.6 (__libc_start_main + 0xf5) [0x21d65]
=========     Host Frame:./t1130 [0x2489]
=========
========= ERROR SUMMARY: 2 errors
$
$ cat t1130.cu
#include <stdio.h>

__global__ void addKernel(int *c)
{
  int x[1];
  for (int i(0); i < 100; ++i) {
    x[i] = i;
#ifdef FORCE
  printf("%d ", i);
#endif
  }
  int t(0);
  for (int i(0); i < 100; ++i) {
    t += x[i];
  }
  c[0] = t;
}

int main(){

  int *d_c;
  cudaMalloc(&d_c, sizeof(int));
  addKernel<<<1,1>>>(d_c);
  cudaDeviceSynchronize();
}
$ nvcc -o t1130 t1130.cu
$ cuda-memcheck ./t1130
========= CUDA-MEMCHECK
========= ERROR SUMMARY: 0 errors
$ nvcc -DFORCE -o t1130 t1130.cu
$ cuda-memcheck ./t1130
========= CUDA-MEMCHECK
========= Invalid __local__ write of size 4
=========     at 0x00000168 in addKernel(int*)
=========     by thread (0,0,0) in block (0,0,0)
=========     Address 0x00fffd10 is out of bounds
=========     Saved host backtrace up to driver entry point at kernel launch time
=========     Host Frame:/lib64/libcuda.so.1 (cuLaunchKernel + 0x2cd) [0x15865d]
=========     Host Frame:./t1130 [0x16ca1]
=========     Host Frame:./t1130 [0x314b3]
=========     Host Frame:./t1130 [0x27a1]
=========     Host Frame:./t1130 [0x269c]
=========     Host Frame:./t1130 [0x26b6]
=========     Host Frame:./t1130 [0x2600]
=========     Host Frame:/lib64/libc.so.6 (__libc_start_main + 0xf5) [0x21d65]
=========     Host Frame:./t1130 [0x2489]
=========
0 1 2 3 ========= Program hit cudaErrorLaunchFailure (error 4) due to "unspecified launch failure" on CUDA API call to cudaDeviceSynchronize.
=========     Saved host backtrace up to driver entry point at error
=========     Host Frame:/lib64/libcuda.so.1 [0x2f31b3]
=========     Host Frame:./t1130 [0x354a6]
=========     Host Frame:./t1130 [0x2605]
=========     Host Frame:/lib64/libc.so.6 (__libc_start_main + 0xf5) [0x21d65]
=========     Host Frame:./t1130 [0x2489]
=========
========= ERROR SUMMARY: 2 errors
$
因此，如果我们按原样编译代码，它将在 cuda-memcheck 下无误运行。但是，如果我们强制编译器生成实际的循环（-DFORCE），无效访问就会被检测并报告。

作为对您评论的回应：如果在上述情况下使用 -G（禁用设备代码优化）而不是 -DFORCE，cuda-memcheck 同样会检测并报告该错误，因为优化已被禁用。

我认为您在本例中只是被优化绊倒了,正如已经建议的那样

下面是我的测试用例:

$ cat t1130.cu
#include <stdio.h>

// Same deliberately out-of-bounds kernel, plus an optional printf.
// Without -DFORCE the compiler can fold the loops away (the sum is a
// compile-time constant), so cuda-memcheck reports nothing — see the
// transcript below. With -DFORCE, the printf's side effect forces the
// compiler to keep the real loop, and the out-of-bounds __local__ write
// is then detected and reported.
__global__ void addKernel(int *c)
{
  int x[1];  // only x[0] is valid storage
  for (int i(0); i < 100; ++i) {
    x[i] = i;  // intentional out-of-bounds write for i >= 1
#ifdef FORCE
  printf("%d ", i);  // side effect that prevents loop elimination
#endif
  }
  int t(0);
  for (int i(0); i < 100; ++i) {
    t += x[i];  // intentional out-of-bounds read for i >= 1
  }
  c[0] = t;
}

// Minimal host driver for the repro: allocate one int on the device,
// launch a single thread, then synchronize so any kernel fault surfaces
// before the process exits. NOTE(review): the return codes of cudaMalloc,
// the launch, and cudaDeviceSynchronize are deliberately unchecked —
// this is a cuda-memcheck repro, not production code.
int main(){

  int *d_c;
  cudaMalloc(&d_c, sizeof(int));
  addKernel<<<1,1>>>(d_c);
  cudaDeviceSynchronize();
}
$ nvcc -o t1130 t1130.cu
$ cuda-memcheck ./t1130
========= CUDA-MEMCHECK
========= ERROR SUMMARY: 0 errors
$ nvcc -DFORCE -o t1130 t1130.cu
$ cuda-memcheck ./t1130
========= CUDA-MEMCHECK
========= Invalid __local__ write of size 4
=========     at 0x00000168 in addKernel(int*)
=========     by thread (0,0,0) in block (0,0,0)
=========     Address 0x00fffd10 is out of bounds
=========     Saved host backtrace up to driver entry point at kernel launch time
=========     Host Frame:/lib64/libcuda.so.1 (cuLaunchKernel + 0x2cd) [0x15865d]
=========     Host Frame:./t1130 [0x16ca1]
=========     Host Frame:./t1130 [0x314b3]
=========     Host Frame:./t1130 [0x27a1]
=========     Host Frame:./t1130 [0x269c]
=========     Host Frame:./t1130 [0x26b6]
=========     Host Frame:./t1130 [0x2600]
=========     Host Frame:/lib64/libc.so.6 (__libc_start_main + 0xf5) [0x21d65]
=========     Host Frame:./t1130 [0x2489]
=========
0 1 2 3 ========= Program hit cudaErrorLaunchFailure (error 4) due to "unspecified launch failure" on CUDA API call to cudaDeviceSynchronize.
=========     Saved host backtrace up to driver entry point at error
=========     Host Frame:/lib64/libcuda.so.1 [0x2f31b3]
=========     Host Frame:./t1130 [0x354a6]
=========     Host Frame:./t1130 [0x2605]
=========     Host Frame:/lib64/libc.so.6 (__libc_start_main + 0xf5) [0x21d65]
=========     Host Frame:./t1130 [0x2489]
=========
========= ERROR SUMMARY: 2 errors
$
$ cat t1130.cu
#include <stdio.h>

__global__ void addKernel(int *c)
{
  int x[1];
  for (int i(0); i < 100; ++i) {
    x[i] = i;
#ifdef FORCE
  printf("%d ", i);
#endif
  }
  int t(0);
  for (int i(0); i < 100; ++i) {
    t += x[i];
  }
  c[0] = t;
}

int main(){

  int *d_c;
  cudaMalloc(&d_c, sizeof(int));
  addKernel<<<1,1>>>(d_c);
  cudaDeviceSynchronize();
}
$ nvcc -o t1130 t1130.cu
$ cuda-memcheck ./t1130
========= CUDA-MEMCHECK
========= ERROR SUMMARY: 0 errors
$ nvcc -DFORCE -o t1130 t1130.cu
$ cuda-memcheck ./t1130
========= CUDA-MEMCHECK
========= Invalid __local__ write of size 4
=========     at 0x00000168 in addKernel(int*)
=========     by thread (0,0,0) in block (0,0,0)
=========     Address 0x00fffd10 is out of bounds
=========     Saved host backtrace up to driver entry point at kernel launch time
=========     Host Frame:/lib64/libcuda.so.1 (cuLaunchKernel + 0x2cd) [0x15865d]
=========     Host Frame:./t1130 [0x16ca1]
=========     Host Frame:./t1130 [0x314b3]
=========     Host Frame:./t1130 [0x27a1]
=========     Host Frame:./t1130 [0x269c]
=========     Host Frame:./t1130 [0x26b6]
=========     Host Frame:./t1130 [0x2600]
=========     Host Frame:/lib64/libc.so.6 (__libc_start_main + 0xf5) [0x21d65]
=========     Host Frame:./t1130 [0x2489]
=========
0 1 2 3 ========= Program hit cudaErrorLaunchFailure (error 4) due to "unspecified launch failure" on CUDA API call to cudaDeviceSynchronize.
=========     Saved host backtrace up to driver entry point at error
=========     Host Frame:/lib64/libcuda.so.1 [0x2f31b3]
=========     Host Frame:./t1130 [0x354a6]
=========     Host Frame:./t1130 [0x2605]
=========     Host Frame:/lib64/libc.so.6 (__libc_start_main + 0xf5) [0x21d65]
=========     Host Frame:./t1130 [0x2489]
=========
========= ERROR SUMMARY: 2 errors
$
因此，如果我们按原样编译代码，它将在 cuda-memcheck 下无误运行。但是，如果我们强制编译器生成实际的循环（-DFORCE），无效访问就会被检测并报告。

作为对您评论的回应：如果在上述情况下使用 -G（禁用设备代码优化）而不是 -DFORCE，cuda-memcheck 同样会检测并报告该错误，因为优化已被禁用。

也许编译器正在进行一次惊人的优化?结果是i从0到100的总和。发生的事情是将100作为内核参数传递(编译时未知)?似乎不太可能。不管怎么说,它还是试过了,但没有改变任何东西:)也许编译器正在执行它的一项任务