在共享内存中移动数据时执行CUDA线程

在共享内存中移动数据时执行CUDA线程,cuda,gpu,Cuda,Gpu,我有以下功能: 代码示例1: __global__ void func(const int *input, int N){ extern __shared__ int buffer[]; int temp = 0; for(int i = blockIdx.x*blockDim.x + threadIdx.x; i < N; i += blockDim.x*gridDim.x){ temp += input[i]; } buffer[threadIdx.x] = temp; __syncthreads(); }

我有以下功能:

代码示例1:

// Grid-stride partial-sum kernel: each thread starts at its global index
// (blockIdx.x*blockDim.x + threadIdx.x) and strides by the total thread
// count (blockDim.x*gridDim.x), so every one of the N input elements is
// visited even when the grid is smaller than N.
//
// Launch with dynamic shared memory sized for one int per thread:
//   func<<<grid, block, block * sizeof(int)>>>(d_input, N);
// After the barrier, buffer[t] holds thread t's partial sum; a per-block
// reduction (not shown in this excerpt) would typically follow.
__global__ void func(const int *input, int N){

  // Dynamic shared memory (size supplied at launch). The original had
  // "__shared__int" — missing space, a compile error.
  extern __shared__ int buffer[];
  int temp = 0;

  // Grid-stride loop. The original had a stray ';' before ')', which is
  // a syntax error; the loop header has exactly three clauses.
  for(int i = blockIdx.x*blockDim.x + threadIdx.x; i < N; i += blockDim.x*gridDim.x){
     temp += input[i];
  }

  // Publish this thread's partial sum, then barrier so every thread in
  // the block can safely read all of buffer[] afterwards.
  buffer[threadIdx.x] = temp;
  __syncthreads();

} 
blockIdx.x=0
threadIdx.x=0
for(i=0; i<18; i+= 4*5){ temp= in[0] /* i wrote the sums intuitively */ }
buffer[threadIdx.x] = temp

blockIdx.x=0
threadIdx.x=1
for(i=1; i<18; i+= 4*5){ temp= in[1] /* i wrote the sums intuitively */ }
buffer[threadIdx.x] = temp

blockIdx.x=0
threadIdx.x=2
for(i=2; i<18; i+= 4*5){ temp= in[2] /* i wrote the sums intuitively */ }
buffer[threadIdx.x] = temp

blockIdx.x=0
threadIdx.x=3
for(i=3; i<18; i+= 4*5){ temp= in[3] /* i wrote the sums intuitively */ }
buffer[threadIdx.x] = temp

blockIdx.x=1
threadIdx.x=0
for(i=1*4; i<18; i+= 4*5){ temp= in[4] /* i wrote the sums intuitively */ }
buffer[threadIdx.x] = temp

blockIdx.x=1
threadIdx.x=1
for(i=1*4+1; i<18; i+= 4*5){ temp = in[5] /* i wrote the sums intuitively */ }
buffer[threadIdx.x] = temp

blockIdx.x=1
threadIdx.x=2
for(i=1*4+2; i<18; i+= 4*5){ temp = in[6] /* i wrote the sums intuitively */ }
buffer[threadIdx.x] = temp

blockIdx.x=1
threadIdx.x=3
for(i=1*4+3; i<18; i+= 4*5){ temp = in[7] /* i wrote the sums intuitively */ }
buffer[threadIdx.x] = temp

unsigned int i = blockIdx.x*blockDim.x + threadIdx.x;
buffer[threadIdx.x] = input[i]; 

有人能给出一个直观的例子或解释吗?

在主函数中,索引大于
blockDim.x*(gridDim.x-1)+(blockDim.x-1)
的元素将被考虑到计算中,而在您提供的方法中不会发生这种情况

假设您有
N=1024
,并且您使用一个网格调用您的函数,该网格有8个块,每个块有32个线程。在主函数中,线程
i
将在元素
i
i+8*32
i+2*(8*32)
i+3*(8*32)
处收集并添加属于
*输入的数据。另一方面,代码仅在元素
i
处收集数据。换句话说,它只将
32*8
输入的
*input
的第一个元素相加,而忽略
1024-32*8
其余元素

更详细的信息:

// Grid-stride partial-sum kernel (same routine as code example 1 above):
// each thread starts at its global index and strides by the total thread
// count (blockDim.x*gridDim.x), so all N elements are covered even when
// the grid has fewer threads than there are elements.
//
// Launch with dynamic shared memory sized for one int per thread:
//   func<<<grid, block, block * sizeof(int)>>>(d_input, N);
// After the barrier, buffer[t] holds thread t's partial sum.
__global__ void func(const int *input, int N){

  // Dynamic shared memory (size supplied at launch). Fixed the missing
  // space in "__shared__int", which would not compile.
  extern __shared__ int buffer[];
  int temp = 0;

  // Grid-stride loop; removed the stray ';' before ')' that made the
  // original loop header a syntax error.
  for(int i = blockIdx.x*blockDim.x + threadIdx.x; i < N; i += blockDim.x*gridDim.x){
     temp += input[i];
  }

  // Publish this thread's partial sum and barrier so the whole block can
  // safely read buffer[] in a subsequent reduction step.
  buffer[threadIdx.x] = temp;
  __syncthreads();

} 
blockIdx.x=0
threadIdx.x=0
for(i=0; i<18; i+= 4*5){ temp= in[0] /* i wrote the sums intuitively */ }
buffer[threadIdx.x] = temp

blockIdx.x=0
threadIdx.x=1
for(i=1; i<18; i+= 4*5){ temp= in[1] /* i wrote the sums intuitively */ }
buffer[threadIdx.x] = temp

blockIdx.x=0
threadIdx.x=2
for(i=2; i<18; i+= 4*5){ temp= in[2] /* i wrote the sums intuitively */ }
buffer[threadIdx.x] = temp

blockIdx.x=0
threadIdx.x=3
for(i=3; i<18; i+= 4*5){ temp= in[3] /* i wrote the sums intuitively */ }
buffer[threadIdx.x] = temp

blockIdx.x=1
threadIdx.x=0
for(i=1*4; i<18; i+= 4*5){ temp= in[4] /* i wrote the sums intuitively */ }
buffer[threadIdx.x] = temp

blockIdx.x=1
threadIdx.x=1
for(i=1*4+1; i<18; i+= 4*5){ temp = in[5] /* i wrote the sums intuitively */ }
buffer[threadIdx.x] = temp

blockIdx.x=1
threadIdx.x=2
for(i=1*4+2; i<18; i+= 4*5){ temp = in[6] /* i wrote the sums intuitively */ }
buffer[threadIdx.x] = temp

blockIdx.x=1
threadIdx.x=3
for(i=1*4+3; i<18; i+= 4*5){ temp = in[7] /* i wrote the sums intuitively */ }
buffer[threadIdx.x] = temp

unsigned int i = blockIdx.x*blockDim.x + threadIdx.x;
buffer[threadIdx.x] = input[i]; 
代码示例1的工作原理如下:

blockIdx.x=0
threadIdx.x=0
for ( i = 0; i < 1024; i += 32*8 )
    temp += input[i]; // temp= input[0]+input[256]+input[512]+input[768]
buffer[0] = temp; //=input[0]+input[256]+input[512]+input[768]

blockIdx.x=0
threadIdx.x=1
for ( i = 1; i < 1024; i += 32*8 )
    temp += input[i]; // temp= input[1]+input[257]+input[513]+input[769]
buffer[1] = temp; //=input[1]+input[257]+input[513]+input[769]

blockIdx.x=0
threadIdx.x=2
for ( i = 2; i < 1024; i += 32*8 )
    temp += input[i]; // temp= input[2]+input[258]+input[514]+input[770]
buffer[2] = temp; //=input[2]+input[258]+input[514]+input[770]
blockIdx.x=0
threadIdx.x=0
i = 0*32+0; //=0
buffer[0] = input[0];

blockIdx.x=0
threadIdx.x=1
i = 0*32+1; //=1
buffer[1] = input[1];


正如您所看到的,第一个代码示例检查了
input
数组的所有元素,但第二个代码示例没有。

当您说
blockDim.x*(gridDim.x-1)+(blockDim.x-1)
时,您指的代码的哪一部分?你能给我更详细的解释吗?(这部分代码来自一本书)。你的意思是我给出的示例是错误的还是代码是错误的?在主代码中,在
for
循环的
处每个线程的
I
初始化是
blockIdx.x*blockDim.x+threadIdx.x
blockDim.x
的最大值可以是
gridDim.x-1
threadIdx.x
的最大值可以是
blockDim.x-1
。如果
N
大于
blockDim.x*(gridDim.x-1)+(blockDim.x-1)
,则您提供的最后一段代码无法正常工作。如果您参考代码示例2,说出最后一段代码,我想我理解您的意思:由于线程数量的限制,我们选择
for
循环作为解决方案。我只是觉得使用
for
循环有点奇怪,因为它可能只执行一次。换句话说,你的意思是,如果我们有很多元素,for循环才有意义?只有这样它才会执行多次?抱歉我的无知,我是CUDA领域的新手。是的,当我说最后一段代码时,我指的是代码示例2。对,如果我们有很多元素,并且我们的块数量和块大小都是有限的,使用"for"循环可以是一个解决方案。