CUDA:这是共享内存的正确使用吗?

CUDA:这是共享内存的正确使用吗?,cuda,Cuda,我有以下CUDA内核: __global__ void combine_kernel(const uint64_t* __restrict__ d_D1,unsigned int hashedFrameNumber) { //1D GRID OF 1D BLOCKS int tid = threadIdx.x + blockDim.x * blockIdx.x; // SIZE = 2118760 if (tid <

我有以下CUDA内核:

// combine_kernel: every block copies the first 10 values of d_D1 into a
// block-shared array, thread 0 of the block then permutes that shared copy
// using cuRAND, and the thread whose global index equals SIZE-1 prints debug
// snapshots before and after the permutation.
//
// Parameters:
//   d_D1              - read-only device array; elements 0..9 are read.
//   hashedFrameNumber - seed handed to curand_init by every thread.
//
// Launch layout: 1D grid of 1D blocks (see the index computation below).
//
// NOTE(review): SIZE is not defined in this snippet; the comment below
// suggests it equals 2118760 - confirm.
// NOTE(review): every __syncthreads() in this kernel sits inside the
// `tid < 2118760` branch. If the last block contains threads with
// tid >= 2118760, those threads never reach the barriers the rest of the
// block is waiting on - confirm the launch configuration keeps the branch
// uniform within each block.
__global__ void combine_kernel(const uint64_t* __restrict__ d_D1,unsigned int hashedFrameNumber)
    {
        //1D GRID OF 1D BLOCKS
        int tid = threadIdx.x + blockDim.x * blockIdx.x;
        // SIZE = 2118760
        if (tid < 2118760) 
        {

          // Per-block scratch copy of the 10 input values.
          __shared__ uint64_t d_D11[10];

          // Thread t starts at index t and keeps going up to 9, so threads 0..9
          // overlap and most elements are stored several times (always with the
          // same value d_D1[i]). Threads with threadIdx.x >= 10 skip the loop.
          for(int i = threadIdx.x; i < 10; i++)
          {
           d_D11[i] = d_D1[i];    
          }
          // Wait until the shared copy is complete before any thread reads it.
          __syncthreads();

          //POINT A
          // Debug: dump the global input array before the shuffle.
          if(tid == SIZE-1)
          {
            printf("%llx \n",d_D1[0]);
            printf("%llx \n",d_D1[1]);
            printf("%llx \n",d_D1[2]);
            printf("%llx \n",d_D1[3]);
            printf("%llx \n",d_D1[4]);
            printf("%llx \n",d_D1[5]);
            printf("%llx \n",d_D1[6]);
            printf("%llx \n",d_D1[7]);
            printf("%llx \n",d_D1[8]);
            printf("%llx \n",d_D1[9]);
            printf("\n\n");    
          }

          //POINT B
          // Debug: dump the shared copy, expected to still be in input order.
          // NOTE(review): there is no barrier between this read of d_D11 and the
          // shuffle done by thread 0 below, so thread 0 may already be swapping
          // elements while this thread is still printing.
          if(tid == SIZE-1)
          {
            printf("%llx \n",d_D11[0]);
            printf("%llx \n",d_D11[1]);
            printf("%llx \n",d_D11[2]);
            printf("%llx \n",d_D11[3]);
            printf("%llx \n",d_D11[4]);
            printf("%llx \n",d_D11[5]);
            printf("%llx \n",d_D11[6]);
            printf("%llx \n",d_D11[7]);
            printf("%llx \n",d_D11[8]);
            printf("%llx \n",d_D11[9]);
            printf("\n\n");    
          }

         // Every thread builds its own generator state from identical arguments,
         // although only thread 0 of each block draws numbers from it below.
         curandState randState;
         curand_init(hashedFrameNumber, 0, 0, &randState);

         // Thread 0 alone walks i = 9..1 and swaps d_D11[i] with d_D11[j].
         if(threadIdx.x == 0)
         {       
          for (unsigned int i = 9; i > 0; i--)
          {
           // j is reduced modulo 10, so it can be any slot 0..9 regardless of i.
           size_t j = (unsigned int) (((curand(&randState) / 32768)*(i+1)) % 10);
           uint64_t t0 = d_D11[j];
           d_D11[j] = d_D11[i];
           d_D11[i] = t0;     
          }
         }
         // Wait until thread 0 has finished all swaps.
         __syncthreads();

         //POINT C
         // Debug: dump d_D1 again. d_D1 is const and never written in this
         // kernel, so this prints the same values as POINT A.
         // NOTE(review): the shuffled data lives in d_D11 - presumably that is
         // the array that was meant to be printed here; confirm.
          if(tid == SIZE-1)
          {
            printf("%llx \n",d_D1[0]);
            printf("%llx \n",d_D1[1]);
            printf("%llx \n",d_D1[2]);
            printf("%llx \n",d_D1[3]);
            printf("%llx \n",d_D1[4]);
            printf("%llx \n",d_D1[5]);
            printf("%llx \n",d_D1[6]);
            printf("%llx \n",d_D1[7]);
            printf("%llx \n",d_D1[8]);
            printf("%llx \n",d_D1[9]);
            printf("\n\n");    
          }


         __syncthreads();

        }
    }
\uuuuu全局\uuuuuu无效合并\uu内核(const uint64\u t*\uuuuu restrict\uuuuu d\u D1,无符号整数哈希帧号)
{
//一维块的一维网格
int tid=threadIdx.x+blockDim.x*blockIdx.x;
//尺寸=2118760
如果(tid<2118760)
{
__共享数据集64\u t d\u D11[10];
对于(int i=threadIdx.x;i<10;i++)
{
d_D11[i]=d_D1[i];
}
__同步线程();
//A点
//检查d_D1是否未缓冲
如果(tid==尺寸-1)
{
printf(“%llx\n”,d_D1[0]);
printf(“%llx\n”,d_D1[1]);
printf(“%llx\n”,d_D1[2]);
printf(“%llx\n”,d_D1[3]);
printf(“%llx\n”,d_D1[4]);
printf(“%llx\n”,d_D1[5]);
printf(“%llx\n”,d_D1[6]);
printf(“%llx\n”,d_D1[7]);
printf(“%llx\n”,d_D1[8]);
printf(“%llx\n”,d_D1[9]);
printf(“\n\n”);
}
//B点
//检查共享d_D11是否未缓冲
如果(tid==尺寸-1)
{
printf(“%llx\n”,d_D11[0]);
printf(“%llx\n”,d_D11[1]);
printf(“%llx\n”,d_D11[2]);
printf(“%llx\n”,d_D11[3]);
printf(“%llx\n”,d_D11[4]);
printf(“%llx\n”,d_D11[5]);
printf(“%llx\n”,d_D11[6]);
printf(“%llx\n”,d_D11[7]);
printf(“%llx\n”,d_D11[8]);
printf(“%llx\n”,d_D11[9]);
printf(“\n\n”);
}
库兰州立大学;
curand_init(hashedFrameNumber、0、0和randState);
if(threadIdx.x==0)
{       
对于(无符号整数i=9;i>0;i--)
{
size_t j=(无符号整数)((curand(&randState)/32768)*(i+1))%10);
uint64_t t0=d_D11[j];
d_D11[j]=d_D11[i];
d_D11[i]=t0;
}
}
__同步线程();
//C点
//检查d_D1是否被洗牌
如果(tid==尺寸-1)
{
printf(“%llx\n”,d_D1[0]);
printf(“%llx\n”,d_D1[1]);
printf(“%llx\n”,d_D1[2]);
printf(“%llx\n”,d_D1[3]);
printf(“%llx\n”,d_D1[4]);
printf(“%llx\n”,d_D1[5]);
printf(“%llx\n”,d_D1[6]);
printf(“%llx\n”,d_D1[7]);
printf(“%llx\n”,d_D1[8]);
printf(“%llx\n”,d_D1[9]);
printf(“\n\n”);
}
__同步线程();
}
}
发生的情况是:当我在B点检查d_D11是否尚未被洗牌时,如果tid在0到31之间,它确实还没有被洗牌,否则它已经被洗牌了。那么我做错了什么?这是使用共享内存的正确方法吗?


d_D1包含10个元素。我只想把数组d_D1的这10个元素复制到共享数组d_D11中,然后对共享数组进行洗牌并使用它。

在洗牌代码之前需要一个 __syncthreads() 屏障,以防止前面的 printf() 打印出已洗牌或部分洗牌的值;在C点之前还需要再加一个屏障,以确保打印结果时洗牌已经完成。

除了这个正确性问题之外,您的代码不必要地多次初始化某些元素

像这样初始化共享内存,假设块是一维的,并且每个块至少有10个线程:

      // Block-shared copy of the 10 input values.
      __shared__ uint64_t d_D11[10];
      const unsigned int slot = threadIdx.x;

      // Threads 0..9 each copy exactly one element; all other threads go
      // straight to the barrier.
      if (slot < 10)
          d_D11[slot] = d_D1[slot];

      // Every thread of the block waits here until the copy is complete.
      __syncthreads();
\uuuuuuuuuuuuuuuuuuuuuuuuuuud11[10];
unsigned int tidx=threadIdx.x;
如果(tidx<10){
d_D11[tidx]=d_D1[tidx];
}
__同步线程();
如果共享数组的元素可能多于线程,则可以使用以下习惯用法:

      const int N = ...;  // placeholder: compile-time number of elements
      __shared__ uint64_t d_D11[N];

      // Each thread starts at its own index and advances by blockDim.x, so
      // every element is written exactly once even when N exceeds the number
      // of threads in the block.
      for(int i = threadIdx.x; i < N; i += blockDim.x)
      {
          d_D11[i] = d_D1[i];
      }
      // All threads must reach this barrier before anyone reads d_D11.
      __syncthreads();
const int N=。。。
__共享数据集64\u t d\u D11[N];
对于(int i=threadIdx.x;i
块是一维的,有10个值,所以我想你的第一段代码是对的,但是洗牌部分呢?我用printf没有得到正确的结果。 你好,我已经用__syncthreads()尝试过了,我只是忘了把它写进代码里,但仍然不起作用。用 if(threadIdx.x == 0) 来执行洗牌是否正确? 在B点,数组d_D11本应尚未洗牌,却已经被洗牌了;在C点,它已被洗牌,这是符合预期的。 再次更新:感谢Talonmes通过编辑问题指出了问题所在。