CUDA：这是共享内存的正确使用吗？_Cuda

CUDA：这是共享内存的正确使用吗？

cuda

CUDA：这是共享内存的正确使用吗？,cuda,Cuda,我有以下CUDA内核： __global__ void combine_kernel(const uint64_t* __restrict__ d_D1,unsigned int hashedFrameNumber) { //1D GRID OF 1D BLOCKS int tid = threadIdx.x + blockDim.x * blockIdx.x; // SIZE = 2118760 if (tid <

我有以下CUDA内核：

/*
 * combine_kernel
 *
 * Stages the first 10 words of d_D1 in block-shared memory, has thread 0 of
 * each block shuffle that staged copy in place using a cuRAND stream seeded
 * with hashedFrameNumber, and prints debug snapshots from the single thread
 * whose global index is SIZE-1.
 *
 * Launch layout: 1D grid of 1D blocks (any block size >= 1).
 * Shared memory: 10 * sizeof(uint64_t) bytes, statically allocated.
 *
 * Parameters:
 *   d_D1              - device pointer, read-only; elements [0, 9] are read.
 *   hashedFrameNumber - seed passed to curand_init (subsequence 0, offset 0),
 *                       so every block derives the same permutation.
 *
 * SIZE is not defined in this block; the original comment states
 * SIZE = 2118760. It is used both for the range guard and to select the
 * debug thread.
 *
 * Synchronisation notes:
 *   - Every __syncthreads() is at kernel top level so that ALL threads of a
 *     block reach it, including threads of a partial last block whose
 *     tid >= SIZE. A barrier inside `if (tid < SIZE)` is undefined behaviour
 *     when the condition differs within a block.
 *   - A barrier separates the POINT B read of d_D11 from the shuffle.
 *     Without it thread 0 may already be swapping elements while a thread in
 *     another warp is still printing the "unshuffled" snapshot.
 */
__global__ void combine_kernel(const uint64_t* __restrict__ d_D1,unsigned int hashedFrameNumber)
    {
        // 1D grid of 1D blocks.
        int tid = threadIdx.x + blockDim.x * blockIdx.x;

        // Evaluated once; used only to predicate work, never to skip a barrier.
        const bool inRange       = (tid < SIZE);
        const bool isDebugThread = (tid == SIZE-1);

        __shared__ uint64_t d_D11[10];

        // Block-stride load: each of the 10 elements is written exactly once,
        // whatever the block size. Done by all threads regardless of tid so
        // the staged copy is complete even in the partial last block.
        for (int i = threadIdx.x; i < 10; i += blockDim.x)
        {
            d_D11[i] = d_D1[i];
        }
        __syncthreads();   // staged copy is now visible to the whole block

        // POINT A
        // Snapshot of the global input d_D1 before the shuffle.
        if (isDebugThread)
        {
            for (int k = 0; k < 10; ++k)
            {
                printf("%llx \n", (unsigned long long)d_D1[k]);
            }
            printf("\n\n");
        }

        // POINT B
        // Snapshot of the shared copy d_D11 before the shuffle.
        if (isDebugThread)
        {
            for (int k = 0; k < 10; ++k)
            {
                printf("%llx \n", (unsigned long long)d_D11[k]);
            }
            printf("\n\n");
        }

        // All pre-shuffle reads of d_D11 must finish before thread 0 starts
        // swapping elements.
        __syncthreads();

        if (inRange && threadIdx.x == 0)
        {
            // Only the shuffling thread needs a generator; initialising one
            // per thread was wasted work.
            curandState randState;
            curand_init(hashedFrameNumber, 0, 0, &randState);

            for (unsigned int i = 9; i > 0; i--)
            {
                // NOTE(review): index formula kept exactly as in the original
                // so the seeded permutation is unchanged. It selects j in
                // [0, 9] rather than [0, i], so this is not a uniform
                // Fisher-Yates shuffle - confirm whether that is intended.
                size_t j = (unsigned int) (((curand(&randState) / 32768)*(i+1)) % 10);
                uint64_t t0 = d_D11[j];
                d_D11[j] = d_D11[i];
                d_D11[i] = t0;
            }
        }
        __syncthreads();   // shuffled d_D11 is now visible to the whole block

        // POINT C
        // Snapshot of the global input d_D1 after the shuffle, as in the
        // original code.
        // NOTE(review): d_D1 is const and never written here, so this always
        // matches POINT A; print d_D11 instead to inspect the shuffled copy.
        if (isDebugThread)
        {
            for (int k = 0; k < 10; ++k)
            {
                printf("%llx \n", (unsigned long long)d_D1[k]);
            }
            printf("\n\n");
        }
    }

__global__ void combine_kernel(const uint64_t* __restrict__ d_D1, unsigned int hashedFrameNumber)
{
//1D GRID OF 1D BLOCKS
int tid = threadIdx.x + blockDim.x * blockIdx.x;
// SIZE = 2118760
if (tid < 2118760)
{
__shared__ uint64_t d_D11[10];
for(int i = threadIdx.x; i < 10; i++)
{
d_D11[i] = d_D1[i];
}
__syncthreads();
//POINT A
//check if d_D1 is unshuffled
if(tid == SIZE-1)
{
printf("%llx \n",d_D1[0]);
printf("%llx \n",d_D1[1]);
printf("%llx \n",d_D1[2]);
printf("%llx \n",d_D1[3]);
printf("%llx \n",d_D1[4]);
printf("%llx \n",d_D1[5]);
printf("%llx \n",d_D1[6]);
printf("%llx \n",d_D1[7]);
printf("%llx \n",d_D1[8]);
printf("%llx \n",d_D1[9]);
printf("\n\n");
}
//POINT B
//check if shared d_D11 is unshuffled
if(tid == SIZE-1)
{
printf("%llx \n",d_D11[0]);
printf("%llx \n",d_D11[1]);
printf("%llx \n",d_D11[2]);
printf("%llx \n",d_D11[3]);
printf("%llx \n",d_D11[4]);
printf("%llx \n",d_D11[5]);
printf("%llx \n",d_D11[6]);
printf("%llx \n",d_D11[7]);
printf("%llx \n",d_D11[8]);
printf("%llx \n",d_D11[9]);
printf("\n\n");
}
curandState randState;
curand_init(hashedFrameNumber, 0, 0, &randState);
if(threadIdx.x == 0)
{
for (unsigned int i = 9; i > 0; i--)
{
size_t j = (unsigned int) (((curand(&randState) / 32768)*(i+1)) % 10);
uint64_t t0 = d_D11[j];
d_D11[j] = d_D11[i];
d_D11[i] = t0;
}
}
__syncthreads();
//POINT C
//check if d_D1 is shuffled
if(tid == SIZE-1)
{
printf("%llx \n",d_D1[0]);
printf("%llx \n",d_D1[1]);
printf("%llx \n",d_D1[2]);
printf("%llx \n",d_D1[3]);
printf("%llx \n",d_D1[4]);
printf("%llx \n",d_D1[5]);
printf("%llx \n",d_D1[6]);
printf("%llx \n",d_D1[7]);
printf("%llx \n",d_D1[8]);
printf("%llx \n",d_D1[9]);
printf("\n\n");
}
__syncthreads();
}
}

发生的情况是：当我在 B 点检查 d_D11 是否尚未被打乱（洗牌）时，如果 tid 在 0 到 31 之间，它是未打乱的；否则它已经被打乱了。那么我做错了什么？这是使用共享内存的正确方法吗？

d_D1 包含 10 个元素。我只想把数组 d_D1 的 10 个元素复制到共享数组 d_D11 中，然后打乱（洗牌）这个共享数组并使用它。

在洗牌代码之前需要一个

__syncthreads()

屏障，以防止前面的 printf() 打印出已被打乱或部分打乱的值；在 C 点之前还需要再加一个，以确保打印结果时洗牌已经完成。

除了这个正确性问题之外，您的代码不必要地多次初始化某些元素

像这样初始化共享内存，假设块是一维的，并且每个块至少有10个线程：

      // Block-shared staging buffer for the first 10 words of d_D1.
      __shared__ uint64_t d_D11[10];
      unsigned int tidx = threadIdx.x;

      // One element per thread: only threads 0..9 of the block copy, so no
      // element is written more than once. Requires a 1D block with at
      // least 10 threads.
      if (tidx < 10) {
          d_D11[tidx] = d_D1[tidx];    
      }
      // Barrier outside the branch: every thread of the block reaches it, and
      // no thread reads d_D11 before the copy has completed.
      __syncthreads();

__shared__ uint64_t d_D11[10];
unsigned int tidx = threadIdx.x;
if (tidx < 10) {
d_D11[tidx] = d_D1[tidx];
}
__syncthreads();

如果共享数组的元素可能多于线程，则可以使用以下习惯用法：

      // N is a placeholder for the compile-time element count of the shared array.
      const int N = ...
      __shared__ uint64_t d_D11[N];

      // Block-stride load: thread t copies elements t, t + blockDim.x, ...,
      // so all N elements are covered even when N exceeds the block size and
      // each element is written exactly once.
      for(int i = threadIdx.x; i < N; i += blockDim.x) {
          d_D11[i] = d_D1[i];    
      }
      // Barrier outside the loop: all threads wait until the copy is complete.
      __syncthreads();

const int N = ...
__shared__ uint64_t d_D11[N];
for (int i = threadIdx.x; i < N; i += blockDim.x) {
d_D11[i] = d_D1[i];
}
__syncthreads();

块是一维的，有 10 个值，所以我想你的第一段代码是对的，但是洗牌呢？我用 printf 没有得到正确的结果。——嗨，我已经用 __syncthreads 试过了，只是忘了把它写进代码里，但仍然不起作用。用 (threadIdx.x == 0) 来执行洗牌是否正确？在 B 点，数组 d_D11 本应尚未被打乱，却已经被洗牌了；在 C 点它是被洗牌的，这符合预期。——再次更新：感谢 talonmies 通过编辑问题指出了问题所在。