Memory CUDA:指向中间共享内存位置的指针意外行为
我正在启动一个包含512个线程的线性块的内核。与每个线程相关联的是六个双精度值（两个3元素向量），我希望将它们存储在共享内存中，总共512*6*8=24576字节。我想创建指向共享内存中间元素的指针，将所有向量按如下方式排列：
// Question's original (truncated) kernel sketch: each thread keeps two
// 3-element double vectors (X, Y) in dynamically allocated shared memory.
// NOTE(review): this fragment does not compile as posted — `id` is used
// without a declaration — and the layout below makes thread t's Y pointer
// (&shr[3*(t+1)]) alias thread t+1's X pointer (&shr[3*(t+1)]), so
// neighbouring threads overlap. The edited MWE later in this post switches
// to the non-overlapping id*6 / id*6+3 layout.
__global__ void my_kernel(double *global_data) {
extern __shared__ double shr[];
id = threadIdx.x;
double *X = &shr[id*3];
double *Y = &shr[(id+1)*3];
// Some arithmetic to set X[0:3] and Y[0:3]
// Now I have a small for loop to compute something for each thread
for (int i = 0; i < 3; i++) {
for (int j=0; j < 3; j++) {
// Some computations involving the X and Y vectors
}
}
我认为这是正常的。但是:
(cuda-gdb) p i == 0
$7 = true
(cuda-gdb) p X[i]
Error: Failed to read global memory at address 0x0 on device 0 sm 0 warp 0 lane 0 (error=7).
为什么当i==0时,我可以访问X[0],但不能访问X[i]
编辑:下面是一个完整的工作示例,演示了我的问题:
# Minimal working example: a single thread copies two 3-vectors (X, Y) from
# global memory into dynamic shared memory and sums them. Because the sum is
# never stored anywhere, the compiler eliminates the whole loop — which is
# what makes the cuda-gdb session in the question behave confusingly.
import pycuda.gpuarray as gpuarray
import pycuda.driver as cuda
import pycuda.autoinit
import numpy as np
from pycuda.compiler import SourceModule
from math import pi

# 12 doubles on the device; the kernel only reads the first 6.
mydat = np.arange(12).astype(np.float64)
mydat_gpu = gpuarray.to_gpu(mydat)

mod = SourceModule("""
__global__ void my_kernel(double *mydat) {
extern __shared__ double shr[];
int id = threadIdx.x;
// Per-thread shared-memory layout: 6 contiguous doubles, X[0:3] then Y[0:3].
double *X = &shr[(id * 6)];
double *Y = &shr[(id * 6) + 3];
X[0] = mydat[0];
X[1] = mydat[1];
X[2] = mydat[2];
Y[0] = mydat[3];
Y[1] = mydat[4];
Y[2] = mydat[5];
__syncthreads();
// Fix: initialize the accumulator. The original declared `double result;`
// and immediately did `result += ...`, reading an uninitialized value
// (undefined behaviour). Note `result` is still unused afterwards, so the
// dead-code elimination the answer demonstrates is unaffected.
double result = 0.0;
for (int i = 0; i < 3; i++) {
result += X[i] + Y[i];
}
}
""")

my_kernel = mod.get_function("my_kernel")
# One thread in one block; shared = 6 doubles (one thread's X+Y slice).
blk = (1,1,1)
grd = (1,1,1)
my_kernel(mydat_gpu, grid=grd, block=blk, shared=(8*6))
import pycuda.gpuarray as gpuarray
import pycuda.driver as cuda
import pycuda.autoinit
import numpy as np
from pycuda.compiler import SourceModule
from math import pi
mydat = np.arange(12).astype(np.float64)
mydat_gpu = gpuarray.to_gpu(mydat)
mod = SourceModule("""
__global__ void my_kernel(double *mydat) {
extern __shared__ double shr[];
int id = threadIdx.x;
double *X = &shr[(id * 6)];
double *Y = &shr[(id * 6) + 3];
X[0] = mydat[0];
X[1] = mydat[1];
X[2] = mydat[2];
Y[0] = mydat[3];
Y[1] = mydat[4];
Y[2] = mydat[5];
__syncthreads();
double result;
for (int i = 0; i < 3; i++) {
result += X[i] + Y[i];
}
}
""")
my_kernel = mod.get_function("my_kernel")
blk = (1,1,1)
grd = (1,1,1)
my_kernel(mydat_gpu, grid=grd, block=blk, shared=(8*6))
此时,我启动了一个调试会话:
cuda-gdb --args python -m pycuda.debug minimal_working_example.py
(cuda-gdb) b my_kernel
Function "my_kernel" not defined.
Make breakpoint pending on future shared library load? (y or [n]) y
Breakpoint 1 (my_kernel) pending.
(cuda-gdb) run
[Switching focus to CUDA kernel 0, grid 1, block (0,0,0), thread (0,0,0), device 0, sm 0, warp 0, lane 0]
Breakpoint 1, my_kernel(double * @generic)<<<(1,1,1),(1,1,1)>>> (mydat=0x13034a0000)
at kernel.cu:5
5 int id = threadIdx.x;
(cuda-gdb) n
7 double *X = &shr[(id * 6)];
(cuda-gdb) p id
$1 = 0
(cuda-gdb) p id * 6
$2 = 0
(cuda-gdb) n
8 double *Y = &shr[(id * 6) + 3];
(cuda-gdb) p (id * 6) + 3
$3 = 3
(cuda-gdb) n
10 X[0] = mydat[0];
(cuda-gdb) n
11 X[1] = mydat[1];
(cuda-gdb) n
12 X[2] = mydat[2];
(cuda-gdb) n
13 Y[0] = mydat[3];
(cuda-gdb) n
14 Y[1] = mydat[4];
(cuda-gdb) n
15 Y[2] = mydat[5];
(cuda-gdb) p X
$4 = (@generic double * @register) 0x1000000
(cuda-gdb) p X[0]
$5 = 0
(cuda-gdb) p X[1]
$6 = 1
(cuda-gdb) p Y[0]
$7 = 3
(cuda-gdb) p Y[1]
$8 = 4
(cuda-gdb) n
18 __syncthreads();
(cuda-gdb) n
22 for (int i = 0; i < 3; i++) {
(cuda-gdb) n
23 result += X[i] + Y[i];
(cuda-gdb) p i
$9 = 0
(cuda-gdb) p X[0]
$10 = 0
(cuda-gdb) p X[i]
Error: Failed to read global memory at address 0x0 on device 0 sm 0 warp 0 lane 0 (error=7).
cuda-gdb --args python -m pycuda.debug minimal_working_example.py
(cuda-gdb) b my_kernel
Function "my_kernel" not defined.
Make breakpoint pending on future shared library load? (y or [n]) y
Breakpoint 1 (my_kernel) pending.
(cuda-gdb) run
[Switching focus to CUDA kernel 0, grid 1, block (0,0,0), thread (0,0,0), device 0, sm 0, warp 0, lane 0]
Breakpoint 1, my_kernel(double * @generic)<<<(1,1,1),(1,1,1)>>> (mydat=0x13034a0000)
at kernel.cu:5
5 int id = threadIdx.x;
(cuda-gdb) n
7 double *X = &shr[(id * 6)];
(cuda-gdb) p id
$1 = 0
(cuda-gdb) p id * 6
$2 = 0
(cuda-gdb) n
8 double *Y = &shr[(id * 6) + 3];
(cuda-gdb) p (id * 6) + 3
$3 = 3
(cuda-gdb) n
10 X[0] = mydat[0];
(cuda-gdb) n
11 X[1] = mydat[1];
(cuda-gdb) n
12 X[2] = mydat[2];
(cuda-gdb) n
13 Y[0] = mydat[3];
(cuda-gdb) n
14 Y[1] = mydat[4];
(cuda-gdb) n
15 Y[2] = mydat[5];
(cuda-gdb) p X
$4 = (@generic double * @register) 0x1000000
(cuda-gdb) p X[0]
$5 = 0
(cuda-gdb) p X[1]
$6 = 1
(cuda-gdb) p Y[0]
$7 = 3
(cuda-gdb) p Y[1]
$8 = 4
(cuda-gdb) n
18 __syncthreads();
(cuda-gdb) n
22 for (int i = 0; i < 3; i++) {
(cuda-gdb) n
23 result += X[i] + Y[i];
(cuda-gdb) p i
$9 = 0
(cuda-gdb) p X[0]
$10 = 0
(cuda-gdb) p X[i]
Error: Failed to read global memory at address 0x0 on device 0 sm 0 warp 0 lane 0 (error=7).
这里发生的一切是,您正在单步执行尚未编译到正在运行的内核中的源指令。您试图检查的变量已超出范围,调试器无法再向您显示它们
这是由于设备代码编译器进行了激进的优化。在您的示例中，求和循环不产生任何写入全局或共享内存的输出，因此编译器直接将其消除了。在单步执行经过优化的代码时，源码调试器会尽力展示源代码与执行之间的1:1对应关系，但这并不总是可能的，这就是您看到的有些令人困惑的结果。
您可以通过使用nvcc将内核代码编译到PTX并检查代码来确认这一点:
// .globl _Z9my_kernelPd
.visible .entry _Z9my_kernelPd(
.param .u64 _Z9my_kernelPd_param_0
)
{
.reg .b32 %r<3>;
.reg .f64 %fd<7>;
.reg .b64 %rd<6>;
ld.param.u64 %rd1, [_Z9my_kernelPd_param_0];
cvta.to.global.u64 %rd2, %rd1;
mov.u32 %r1, %tid.x;
mul.lo.s32 %r2, %r1, 6;
mul.wide.s32 %rd3, %r2, 8;
mov.u64 %rd4, shr;
add.s64 %rd5, %rd4, %rd3;
ld.global.nc.f64 %fd1, [%rd2];
ld.global.nc.f64 %fd2, [%rd2+8];
ld.global.nc.f64 %fd3, [%rd2+16];
ld.global.nc.f64 %fd4, [%rd2+24];
ld.global.nc.f64 %fd5, [%rd2+32];
ld.global.nc.f64 %fd6, [%rd2+40];
st.shared.f64 [%rd5], %fd1;
st.shared.f64 [%rd5+8], %fd2;
st.shared.f64 [%rd5+16], %fd3;
st.shared.f64 [%rd5+24], %fd4;
st.shared.f64 [%rd5+32], %fd5;
st.shared.f64 [%rd5+40], %fd6;
bar.sync 0;
ret;
}
因此，如果修改内核，将结果存储到全局内存中，并将其编译为PTX：
.visible .entry _Z10my_kernel2PdS_(
.param .u64 _Z10my_kernel2PdS__param_0,
.param .u64 _Z10my_kernel2PdS__param_1
)
{
.reg .b32 %r<3>;
.reg .f64 %fd<20>;
.reg .b64 %rd<8>;
ld.param.u64 %rd3, [_Z10my_kernel2PdS__param_0];
ld.param.u64 %rd2, [_Z10my_kernel2PdS__param_1];
cvta.to.global.u64 %rd4, %rd3;
mov.u32 %r1, %tid.x;
mul.lo.s32 %r2, %r1, 6;
mul.wide.s32 %rd5, %r2, 8;
mov.u64 %rd6, shr;
add.s64 %rd1, %rd6, %rd5;
ld.global.f64 %fd1, [%rd4];
ld.global.f64 %fd2, [%rd4+8];
ld.global.f64 %fd3, [%rd4+16];
ld.global.f64 %fd4, [%rd4+24];
ld.global.f64 %fd5, [%rd4+32];
ld.global.f64 %fd6, [%rd4+40];
st.shared.f64 [%rd1], %fd1;
st.shared.f64 [%rd1+8], %fd2;
st.shared.f64 [%rd1+16], %fd3;
st.shared.f64 [%rd1+24], %fd4;
st.shared.f64 [%rd1+32], %fd5;
st.shared.f64 [%rd1+40], %fd6;
bar.sync 0;
ld.shared.f64 %fd7, [%rd1];
ld.shared.f64 %fd8, [%rd1+24];
add.f64 %fd9, %fd7, %fd8;
add.f64 %fd10, %fd9, %fd11;
ld.shared.f64 %fd12, [%rd1+8];
ld.shared.f64 %fd13, [%rd1+32];
add.f64 %fd14, %fd12, %fd13;
add.f64 %fd15, %fd10, %fd14;
ld.shared.f64 %fd16, [%rd1+16];
ld.shared.f64 %fd17, [%rd1+40];
add.f64 %fd18, %fd16, %fd17;
add.f64 %fd19, %fd15, %fd18;
cvta.to.global.u64 %rd7, %rd2;
st.global.f64 [%rd7], %fd19;
ret;
}
.visible .entry _Z10my_kernel2PdS_(
.param .u64 _Z10my_kernel2PdS__param_0,
.param .u64 _Z10my_kernel2PdS__param_1
)
{
.reg .b32 %r<3>;
.reg .f64 %fd<20>;
.reg .b64 %rd<8>;
ld.param.u64 %rd3, [_Z10my_kernel2PdS__param_0];
ld.param.u64 %rd2, [_Z10my_kernel2PdS__param_1];
cvta.to.global.u64 %rd4, %rd3;
mov.u32 %r1, %tid.x;
mul.lo.s32 %r2, %r1, 6;
mul.wide.s32 %rd5, %r2, 8;
mov.u64 %rd6, shr;
add.s64 %rd1, %rd6, %rd5;
ld.global.f64 %fd1, [%rd4];
ld.global.f64 %fd2, [%rd4+8];
ld.global.f64 %fd3, [%rd4+16];
ld.global.f64 %fd4, [%rd4+24];
ld.global.f64 %fd5, [%rd4+32];
ld.global.f64 %fd6, [%rd4+40];
st.shared.f64 [%rd1], %fd1;
st.shared.f64 [%rd1+8], %fd2;
st.shared.f64 [%rd1+16], %fd3;
st.shared.f64 [%rd1+24], %fd4;
st.shared.f64 [%rd1+32], %fd5;
st.shared.f64 [%rd1+40], %fd6;
bar.sync 0;
ld.shared.f64 %fd7, [%rd1];
ld.shared.f64 %fd8, [%rd1+24];
add.f64 %fd9, %fd7, %fd8;
add.f64 %fd10, %fd9, %fd11;
ld.shared.f64 %fd12, [%rd1+8];
ld.shared.f64 %fd13, [%rd1+32];
add.f64 %fd14, %fd12, %fd13;
add.f64 %fd15, %fd10, %fd14;
ld.shared.f64 %fd16, [%rd1+16];
ld.shared.f64 %fd17, [%rd1+40];
add.f64 %fd18, %fd16, %fd17;
add.f64 %fd19, %fd15, %fd18;
cvta.to.global.u64 %rd7, %rd2;
st.global.f64 [%rd7], %fd19;
ret;
}
您可以看到（展开后的）循环现在出现在PTX中，如果您尝试它，调试器的行为应该更接近您的预期。
正如评论中所建议的那样，由于编译器优化带来的复杂性，您不应该花时间尝试分析任何不改变共享或全局状态的代码。抱歉@RobertCrovella，我昨晚有点匆忙地键入了这一点——在产生问题的代码中，它被声明为
extern __shared__ double shr[];
我正在更新OP以反映这一点。一般来说,很难解释运行时行为
// Modified kernel from the answer: identical to my_kernel, except the loop's
// sum is written to global memory so the compiler cannot eliminate the loop.
//
// Launch expectations: 1D block; the caller must pass at least
// blockDim.x * 6 * sizeof(double) bytes of dynamic shared memory. Every
// thread loads the same six inputs, so all threads store the same value to
// *out (the concurrent stores write identical data).
__global__ void my_kernel2(double *mydat, double *out) {
    extern __shared__ double shr[];
    int id = threadIdx.x;
    // Per-thread slice of shared memory: X = 3 doubles, then Y = 3 doubles.
    double *X = &shr[(id * 6)];
    double *Y = &shr[(id * 6) + 3];
    X[0] = mydat[0];
    X[1] = mydat[1];
    X[2] = mydat[2];
    Y[0] = mydat[3];
    Y[1] = mydat[4];
    Y[2] = mydat[5];
    __syncthreads();
    // Fix: initialize the accumulator — the original declared `result`
    // uninitialized and immediately did `result += ...` (undefined
    // behaviour; visible in the quoted PTX as a use of the never-written
    // register %fd11).
    double result = 0.0;
    for (int i = 0; i < 3; i++) {
        result += X[i] + Y[i];
    }
    *out = result;
}
.visible .entry _Z10my_kernel2PdS_(
.param .u64 _Z10my_kernel2PdS__param_0,
.param .u64 _Z10my_kernel2PdS__param_1
)
{
.reg .b32 %r<3>;
.reg .f64 %fd<20>;
.reg .b64 %rd<8>;
ld.param.u64 %rd3, [_Z10my_kernel2PdS__param_0];
ld.param.u64 %rd2, [_Z10my_kernel2PdS__param_1];
cvta.to.global.u64 %rd4, %rd3;
mov.u32 %r1, %tid.x;
mul.lo.s32 %r2, %r1, 6;
mul.wide.s32 %rd5, %r2, 8;
mov.u64 %rd6, shr;
add.s64 %rd1, %rd6, %rd5;
ld.global.f64 %fd1, [%rd4];
ld.global.f64 %fd2, [%rd4+8];
ld.global.f64 %fd3, [%rd4+16];
ld.global.f64 %fd4, [%rd4+24];
ld.global.f64 %fd5, [%rd4+32];
ld.global.f64 %fd6, [%rd4+40];
st.shared.f64 [%rd1], %fd1;
st.shared.f64 [%rd1+8], %fd2;
st.shared.f64 [%rd1+16], %fd3;
st.shared.f64 [%rd1+24], %fd4;
st.shared.f64 [%rd1+32], %fd5;
st.shared.f64 [%rd1+40], %fd6;
bar.sync 0;
ld.shared.f64 %fd7, [%rd1];
ld.shared.f64 %fd8, [%rd1+24];
add.f64 %fd9, %fd7, %fd8;
add.f64 %fd10, %fd9, %fd11;
ld.shared.f64 %fd12, [%rd1+8];
ld.shared.f64 %fd13, [%rd1+32];
add.f64 %fd14, %fd12, %fd13;
add.f64 %fd15, %fd10, %fd14;
ld.shared.f64 %fd16, [%rd1+16];
ld.shared.f64 %fd17, [%rd1+40];
add.f64 %fd18, %fd16, %fd17;
add.f64 %fd19, %fd15, %fd18;
cvta.to.global.u64 %rd7, %rd2;
st.global.f64 [%rd7], %fd19;
ret;
}