Debugging CUDA内存检查器的粒度是多少?

标签：debugging、cuda。问题摘要：我有一个内核，可以在调试版本中工作，但在发布版本中失败。我怀疑我正在读或写越界，但 CUDA 内存检查器没有显示任何错误。因此我做了一个测试，使用下面的内核进行越界写入和读取（完整代码见下文）。

我有一个内核,可以在调试版本中工作,但在发布版本中失败。我怀疑我正在读或写越界,但CUDA内存检查器没有显示任何错误。因此,我做了一个测试,在测试中,我使用以下内核进行了越界写入和读取:

// Deliberately buggy test kernel used to probe cuda-memcheck's granularity:
// x has room for exactly ONE int, but both loops index x[0..99], so every
// iteration with i >= 1 writes/reads past the end of the local array.
// NOTE: parameters a and b are unused in this repro.
__global__ void addKernel(int *c, const int *a, const int *b)
{
  int x[1];  // only x[0] is valid storage
  for (int i(0); i < 100; ++i) {
    x[i] = i;  // intentional out-of-bounds write for i >= 1
  }
  int t(0);
  for (int i(0); i < 100; ++i) {
    t += x[i];  // intentional out-of-bounds read for i >= 1
  }
  c[0] = t;  // sum is written out so the computation is observable
}
__global__ void addKernel(int *c, const int *a, const int *b)
{
  int x[1];
  for (int i(0); i < 100; ++i) {
    x[i] = i;
  }
  int t(0);
  for (int i(0); i < 100; ++i) {
    t += x[i];
  }
  c[0] = t;
}
CUDA 内存检查器未捕获这些越界写入和读取。把循环次数增加到 1000000 之后，我才检测到越界写入：内核在 i=502586（越界约 2MB）处停住。


这是 CUDA 内存检查器工作的预期粒度吗？我能做些什么来让它检测小的越界写入（大约几个字节）？

我想在这个示例中,您只是被优化绊倒了,正如已经建议的那样

下面是我的测试用例:

$ cat t1130.cu
#include <stdio.h>

// Same deliberately out-of-bounds kernel, plus an optional printf.
// Without -DFORCE the compiler can fold the loops away (the sum is a
// compile-time constant), so cuda-memcheck reports nothing — see the
// transcript below. With -DFORCE, the printf's side effect forces the
// compiler to keep the real loop, and the out-of-bounds __local__ write
// is then detected and reported.
__global__ void addKernel(int *c)
{
  int x[1];  // only x[0] is valid storage
  for (int i(0); i < 100; ++i) {
    x[i] = i;  // intentional out-of-bounds write for i >= 1
#ifdef FORCE
  printf("%d ", i);  // side effect that prevents loop elimination
#endif
  }
  int t(0);
  for (int i(0); i < 100; ++i) {
    t += x[i];  // intentional out-of-bounds read for i >= 1
  }
  c[0] = t;
}

// Minimal host driver for the repro: allocate one int on the device,
// launch a single thread, then synchronize so any kernel fault surfaces
// before the process exits. NOTE(review): the return codes of cudaMalloc,
// the launch, and cudaDeviceSynchronize are deliberately unchecked —
// this is a cuda-memcheck repro, not production code.
int main(){

  int *d_c;
  cudaMalloc(&d_c, sizeof(int));
  addKernel<<<1,1>>>(d_c);
  cudaDeviceSynchronize();
}
$ nvcc -o t1130 t1130.cu
$ cuda-memcheck ./t1130
========= CUDA-MEMCHECK
========= ERROR SUMMARY: 0 errors
$ nvcc -DFORCE -o t1130 t1130.cu
$ cuda-memcheck ./t1130
========= CUDA-MEMCHECK
========= Invalid __local__ write of size 4
=========     at 0x00000168 in addKernel(int*)
=========     by thread (0,0,0) in block (0,0,0)
=========     Address 0x00fffd10 is out of bounds
=========     Saved host backtrace up to driver entry point at kernel launch time
=========     Host Frame:/lib64/libcuda.so.1 (cuLaunchKernel + 0x2cd) [0x15865d]
=========     Host Frame:./t1130 [0x16ca1]
=========     Host Frame:./t1130 [0x314b3]
=========     Host Frame:./t1130 [0x27a1]
=========     Host Frame:./t1130 [0x269c]
=========     Host Frame:./t1130 [0x26b6]
=========     Host Frame:./t1130 [0x2600]
=========     Host Frame:/lib64/libc.so.6 (__libc_start_main + 0xf5) [0x21d65]
=========     Host Frame:./t1130 [0x2489]
=========
0 1 2 3 ========= Program hit cudaErrorLaunchFailure (error 4) due to "unspecified launch failure" on CUDA API call to cudaDeviceSynchronize.
=========     Saved host backtrace up to driver entry point at error
=========     Host Frame:/lib64/libcuda.so.1 [0x2f31b3]
=========     Host Frame:./t1130 [0x354a6]
=========     Host Frame:./t1130 [0x2605]
=========     Host Frame:/lib64/libc.so.6 (__libc_start_main + 0xf5) [0x21d65]
=========     Host Frame:./t1130 [0x2489]
=========
========= ERROR SUMMARY: 2 errors
$
$ cat t1130.cu
#include <stdio.h>

__global__ void addKernel(int *c)
{
  int x[1];
  for (int i(0); i < 100; ++i) {
    x[i] = i;
#ifdef FORCE
  printf("%d ", i);
#endif
  }
  int t(0);
  for (int i(0); i < 100; ++i) {
    t += x[i];
  }
  c[0] = t;
}

int main(){

  int *d_c;
  cudaMalloc(&d_c, sizeof(int));
  addKernel<<<1,1>>>(d_c);
  cudaDeviceSynchronize();
}
$ nvcc -o t1130 t1130.cu
$ cuda-memcheck ./t1130
========= CUDA-MEMCHECK
========= ERROR SUMMARY: 0 errors
$ nvcc -DFORCE -o t1130 t1130.cu
$ cuda-memcheck ./t1130
========= CUDA-MEMCHECK
========= Invalid __local__ write of size 4
=========     at 0x00000168 in addKernel(int*)
=========     by thread (0,0,0) in block (0,0,0)
=========     Address 0x00fffd10 is out of bounds
=========     Saved host backtrace up to driver entry point at kernel launch time
=========     Host Frame:/lib64/libcuda.so.1 (cuLaunchKernel + 0x2cd) [0x15865d]
=========     Host Frame:./t1130 [0x16ca1]
=========     Host Frame:./t1130 [0x314b3]
=========     Host Frame:./t1130 [0x27a1]
=========     Host Frame:./t1130 [0x269c]
=========     Host Frame:./t1130 [0x26b6]
=========     Host Frame:./t1130 [0x2600]
=========     Host Frame:/lib64/libc.so.6 (__libc_start_main + 0xf5) [0x21d65]
=========     Host Frame:./t1130 [0x2489]
=========
0 1 2 3 ========= Program hit cudaErrorLaunchFailure (error 4) due to "unspecified launch failure" on CUDA API call to cudaDeviceSynchronize.
=========     Saved host backtrace up to driver entry point at error
=========     Host Frame:/lib64/libcuda.so.1 [0x2f31b3]
=========     Host Frame:./t1130 [0x354a6]
=========     Host Frame:./t1130 [0x2605]
=========     Host Frame:/lib64/libc.so.6 (__libc_start_main + 0xf5) [0x21d65]
=========     Host Frame:./t1130 [0x2489]
=========
========= ERROR SUMMARY: 2 errors
$
因此，如果我们按原样编译代码，它将在 cuda-memcheck 下无误运行。但是，如果我们强制编译器生成实际的循环（-DFORCE），无效访问就会被检测并报告。

作为对您评论的回应：如果在上述情况下使用 -G（禁用设备代码优化）而不是 -DFORCE，cuda-memcheck 同样会检测并报告该错误，因为优化已被禁用。

我认为您在本例中只是被优化绊倒了,正如已经建议的那样

下面是我的测试用例:

$ cat t1130.cu
#include <stdio.h>

// Same deliberately out-of-bounds kernel, plus an optional printf.
// Without -DFORCE the compiler can fold the loops away (the sum is a
// compile-time constant), so cuda-memcheck reports nothing — see the
// transcript below. With -DFORCE, the printf's side effect forces the
// compiler to keep the real loop, and the out-of-bounds __local__ write
// is then detected and reported.
__global__ void addKernel(int *c)
{
  int x[1];  // only x[0] is valid storage
  for (int i(0); i < 100; ++i) {
    x[i] = i;  // intentional out-of-bounds write for i >= 1
#ifdef FORCE
  printf("%d ", i);  // side effect that prevents loop elimination
#endif
  }
  int t(0);
  for (int i(0); i < 100; ++i) {
    t += x[i];  // intentional out-of-bounds read for i >= 1
  }
  c[0] = t;
}

// Minimal host driver for the repro: allocate one int on the device,
// launch a single thread, then synchronize so any kernel fault surfaces
// before the process exits. NOTE(review): the return codes of cudaMalloc,
// the launch, and cudaDeviceSynchronize are deliberately unchecked —
// this is a cuda-memcheck repro, not production code.
int main(){

  int *d_c;
  cudaMalloc(&d_c, sizeof(int));
  addKernel<<<1,1>>>(d_c);
  cudaDeviceSynchronize();
}
$ nvcc -o t1130 t1130.cu
$ cuda-memcheck ./t1130
========= CUDA-MEMCHECK
========= ERROR SUMMARY: 0 errors
$ nvcc -DFORCE -o t1130 t1130.cu
$ cuda-memcheck ./t1130
========= CUDA-MEMCHECK
========= Invalid __local__ write of size 4
=========     at 0x00000168 in addKernel(int*)
=========     by thread (0,0,0) in block (0,0,0)
=========     Address 0x00fffd10 is out of bounds
=========     Saved host backtrace up to driver entry point at kernel launch time
=========     Host Frame:/lib64/libcuda.so.1 (cuLaunchKernel + 0x2cd) [0x15865d]
=========     Host Frame:./t1130 [0x16ca1]
=========     Host Frame:./t1130 [0x314b3]
=========     Host Frame:./t1130 [0x27a1]
=========     Host Frame:./t1130 [0x269c]
=========     Host Frame:./t1130 [0x26b6]
=========     Host Frame:./t1130 [0x2600]
=========     Host Frame:/lib64/libc.so.6 (__libc_start_main + 0xf5) [0x21d65]
=========     Host Frame:./t1130 [0x2489]
=========
0 1 2 3 ========= Program hit cudaErrorLaunchFailure (error 4) due to "unspecified launch failure" on CUDA API call to cudaDeviceSynchronize.
=========     Saved host backtrace up to driver entry point at error
=========     Host Frame:/lib64/libcuda.so.1 [0x2f31b3]
=========     Host Frame:./t1130 [0x354a6]
=========     Host Frame:./t1130 [0x2605]
=========     Host Frame:/lib64/libc.so.6 (__libc_start_main + 0xf5) [0x21d65]
=========     Host Frame:./t1130 [0x2489]
=========
========= ERROR SUMMARY: 2 errors
$
$ cat t1130.cu
#include <stdio.h>

__global__ void addKernel(int *c)
{
  int x[1];
  for (int i(0); i < 100; ++i) {
    x[i] = i;
#ifdef FORCE
  printf("%d ", i);
#endif
  }
  int t(0);
  for (int i(0); i < 100; ++i) {
    t += x[i];
  }
  c[0] = t;
}

int main(){

  int *d_c;
  cudaMalloc(&d_c, sizeof(int));
  addKernel<<<1,1>>>(d_c);
  cudaDeviceSynchronize();
}
$ nvcc -o t1130 t1130.cu
$ cuda-memcheck ./t1130
========= CUDA-MEMCHECK
========= ERROR SUMMARY: 0 errors
$ nvcc -DFORCE -o t1130 t1130.cu
$ cuda-memcheck ./t1130
========= CUDA-MEMCHECK
========= Invalid __local__ write of size 4
=========     at 0x00000168 in addKernel(int*)
=========     by thread (0,0,0) in block (0,0,0)
=========     Address 0x00fffd10 is out of bounds
=========     Saved host backtrace up to driver entry point at kernel launch time
=========     Host Frame:/lib64/libcuda.so.1 (cuLaunchKernel + 0x2cd) [0x15865d]
=========     Host Frame:./t1130 [0x16ca1]
=========     Host Frame:./t1130 [0x314b3]
=========     Host Frame:./t1130 [0x27a1]
=========     Host Frame:./t1130 [0x269c]
=========     Host Frame:./t1130 [0x26b6]
=========     Host Frame:./t1130 [0x2600]
=========     Host Frame:/lib64/libc.so.6 (__libc_start_main + 0xf5) [0x21d65]
=========     Host Frame:./t1130 [0x2489]
=========
0 1 2 3 ========= Program hit cudaErrorLaunchFailure (error 4) due to "unspecified launch failure" on CUDA API call to cudaDeviceSynchronize.
=========     Saved host backtrace up to driver entry point at error
=========     Host Frame:/lib64/libcuda.so.1 [0x2f31b3]
=========     Host Frame:./t1130 [0x354a6]
=========     Host Frame:./t1130 [0x2605]
=========     Host Frame:/lib64/libc.so.6 (__libc_start_main + 0xf5) [0x21d65]
=========     Host Frame:./t1130 [0x2489]
=========
========= ERROR SUMMARY: 2 errors
$
因此，如果我们按原样编译代码，它将在 cuda-memcheck 下无误运行。但是，如果我们强制编译器生成实际的循环（-DFORCE），无效访问就会被检测并报告。

作为对您评论的回应：如果在上述情况下使用 -G（禁用设备代码优化）而不是 -DFORCE，cuda-memcheck 同样会检测并报告该错误，因为优化已被禁用。

也许编译器正在进行一次惊人的优化?结果是i从0到100的总和。发生的事情是将100作为内核参数传递(编译时未知)?似乎不太可能。不管怎么说,它还是试过了,但没有改变任何东西:)也许编译器正在执行它的一项任务