Cuda共享内存错误_Cuda_Parallel Processing

Cuda共享内存错误

cuda parallel-processing

Cuda共享内存错误,cuda,parallel-processing,Cuda,Parallel Processing,我有一个简单的基数排序（它只按一位和一个块排序）。我的第一个版本可以工作，但我尝试先对共享内存上的键进行排序，以实现对DRAM的合并内存写入。然而，这个版本产生了不好的结果，它没有排序首先，简单的工作版本： __global__ void dev_radix(unsigned int *in_keys, const unsigned int *histo, unsigned int desp, unsigned int *out_keys){ int tid=threadIdx.x; //G

我有一个简单的基数排序（它只按一位、在单个线程块内排序）。我的第一个版本可以工作，但我尝试先在共享内存中对键进行排序，以实现对 DRAM 的合并内存写入。然而，这个版本产生了错误的结果：输出没有被正确排序。

首先，简单的工作版本：

// One pass of a one-bit radix split: each thread takes the key in_keys[tid],
// tests bit `desp`, and scatters the key to out_keys so that keys whose bit is
// 0 come first (in their original relative order), followed by the keys whose
// bit is 1.
//
// Parameters:
//   in_keys  - input keys; indexed only by threadIdx.x, one key per thread.
//   histo    - histo[0] is used as the first output index of the '1'-bit keys
//              (presumably the number of keys whose bit is 0 - confirm against
//              the host code that fills it).
//   desp     - bit position to test.
//   out_keys - output keys, written at the computed offset.
//
// Only threadIdx.x is used for indexing and s_sum holds 1024 entries, so this
// assumes a launch with a single block of at most 1024 threads.
//
// NOTE(review): the scan loop below calls __syncthreads() a different number
// of times in different threads, and has no barrier between the write at the
// end of one iteration and the reads at the start of the next. See the
// comments on the loop.
__global__ void dev_radix(unsigned int *in_keys, const unsigned int *histo, unsigned int desp, unsigned int *out_keys){
int tid=threadIdx.x;

//Get offset by using prefix sum scan.
__shared__ unsigned int s_sum[1024];
// first is 1 when bit `desp` of this thread's key is 0, otherwise 0.
unsigned int first=((in_keys[tid]>>desp)&1)==0;
s_sum[tid]=first;
__syncthreads();
int pos=tid-1;
// Inclusive prefix sum over s_sum: at each step every thread adds the value
// `off` slots to its left, with off = 1, 2, 4, ...
// NOTE(review): the loop condition pos>=0 depends on tid, so threads leave the
// loop at different iterations (thread 0 never enters it). The __syncthreads()
// inside is therefore not reached by all threads of the block, which CUDA does
// not allow.
for (int off=1; pos>=0; off=off*2, pos=tid-off){
    int a=s_sum[pos];
    int b=s_sum[tid];
    // Intended to separate the reads above from the write below.
    __syncthreads();
    // NOTE(review): no barrier follows this write, so another thread may read
    // s_sum[tid] in its next iteration before or after this update.
    s_sum[tid]=a+b;
}
__syncthreads();

// Subtracting this thread's own flag turns the inclusive sum into the number
// of '0'-bit keys that precede this thread.
int offset=s_sum[tid]-first;
if (first==0){
    //Get offset for '1' bit keys
    // tid-offset is the number of '1'-bit keys that precede this thread;
    // histo[0] shifts them past the '0'-bit group.
    offset=histo[0]+tid-offset;
}

// Scattered write: consecutive threads may write to non-consecutive addresses.
out_keys[offset]=in_keys[tid];

}

第二版：

// Second variant of the one-bit radix split: the destination offset of every
// key is computed as before, but the keys are first placed at that offset in
// shared memory (reusing s_sum) and then copied out with out_keys[tid], so
// that consecutive threads write consecutive global addresses.
//
// Parameters:
//   in_keys  - input keys; indexed only by threadIdx.x, one key per thread.
//   histo    - histo[0] is used as the first output index of the '1'-bit keys
//              (presumably the number of keys whose bit is 0 - confirm against
//              the host code that fills it).
//   desp     - bit position to test.
//   out_keys - output keys, written at index tid.
//
// Only threadIdx.x is used for indexing and s_sum holds 1024 entries, so this
// assumes a launch with a single block of at most 1024 threads.
//
// NOTE(review): the scan loop below calls __syncthreads() a different number
// of times in different threads, and has no barrier between the write at the
// end of one iteration and the reads at the start of the next. The offsets it
// produces are therefore not reliable.
__global__ void dev_radix(unsigned int *in_keys, const unsigned int *histo, unsigned int desp, unsigned int *out_keys){
int tid=threadIdx.x;

//Get offset by using prefix sum scan.
__shared__ unsigned int s_sum[1024];
// first is 1 when bit `desp` of this thread's key is 0, otherwise 0.
unsigned int first=((in_keys[tid]>>desp)&1)==0;
s_sum[tid]=first;
__syncthreads();
int pos=tid-1;
// Inclusive prefix sum over s_sum: at each step every thread adds the value
// `off` slots to its left, with off = 1, 2, 4, ...
// NOTE(review): the loop condition pos>=0 depends on tid, so threads leave the
// loop at different iterations (thread 0 never enters it). The __syncthreads()
// inside is therefore not reached by all threads of the block, which CUDA does
// not allow.
for (int off=1; pos>=0; off=off*2, pos=tid-off){
    int a=s_sum[pos];
    int b=s_sum[tid];
    // Intended to separate the reads above from the write below.
    __syncthreads();
    // NOTE(review): no barrier follows this write, so another thread may read
    // s_sum[tid] in its next iteration before or after this update.
    s_sum[tid]=a+b;
}
__syncthreads();

// Subtracting this thread's own flag turns the inclusive sum into the number
// of '0'-bit keys that precede this thread.
int offset=s_sum[tid]-first;
if (first==0){
    //Get offset for '1' bit keys
    // tid-offset is the number of '1'-bit keys that precede this thread;
    // histo[0] shifts them past the '0'-bit group.
    offset=histo[0]+tid-offset;
}

// Every thread must have read its scan result before s_sum is overwritten
// with keys.
__syncthreads();
// Place the key at its destination slot in shared memory.
s_sum[offset]=in_keys[tid];
// Wait until all keys are in place before reading them back in tid order.
__syncthreads();
out_keys[tid]=s_sum[tid];

}

问题是我在条件代码中调用了 __syncthreads()。只有当块中所有线程对该条件代码具有相同的执行路径时，才允许在其中调用 __syncthreads()。正确版本：

// One pass of a one-bit radix split, staged through shared memory.
//
// Each thread takes the key in_keys[tid] and tests bit `desp`. Keys whose bit
// is 0 are placed first, in their original relative order, followed by the
// keys whose bit is 1. The keys are first stored at their destination slot in
// shared memory and then copied out with out_keys[tid], so consecutive threads
// write consecutive global addresses.
//
// Parameters:
//   in_keys  - input keys; indexed only by threadIdx.x, one key per thread.
//   histo    - histo[0] is used as the first output index of the '1'-bit keys
//              (presumably the number of keys whose bit is 0 - confirm against
//              the host code that fills it). A wrong value can push `offset`
//              outside s_sum.
//   desp     - bit position to test.
//   out_keys - output keys, written at index tid.
//
// Launch requirements: a single block of at most 1024 threads (only
// threadIdx.x is used for indexing and the shared buffer has 1024 entries).
// Uses 4 KB of static shared memory.
//
// Every __syncthreads() below is executed by all threads of the block the same
// number of times: the loop bound depends only on blockDim.x, and the
// per-thread condition guards only the shared-memory read, never a barrier.
__global__ void dev_radix(unsigned int *in_keys, const unsigned int *histo, unsigned int desp, unsigned int *out_keys){
    const int MAX_KEYS = 1024;
    __shared__ unsigned int s_sum[MAX_KEYS];
    const unsigned int tid = threadIdx.x;

    // first is 1 when bit `desp` of this thread's key is 0, otherwise 0.
    const unsigned int v = in_keys[tid];
    const unsigned int first = ((v >> desp) & 1) == 0;
    s_sum[tid] = first;
    __syncthreads();

    // Inclusive prefix sum over s_sum: at each step every thread adds the
    // value `off` slots to its left, with off = 1, 2, 4, ...
    // Steps with off >= blockDim.x would add nothing for any thread, so the
    // loop stops there.
    for (unsigned int off = 1; off < blockDim.x; off *= 2) {
        unsigned int addend = 0;
        if (tid >= off) {
            addend = s_sum[tid - off];
        }
        // All reads of this step finish before any slot is updated.
        __syncthreads();
        s_sum[tid] += addend;
        // All updates of this step finish before the next step reads them.
        __syncthreads();
    }

    // Subtracting this thread's own flag turns the inclusive sum into the
    // number of '0'-bit keys that precede this thread.
    unsigned int offset = s_sum[tid] - first;
    if (first == 0) {
        // tid - offset is the number of '1'-bit keys that precede this
        // thread; histo[0] shifts them past the '0'-bit group.
        offset = histo[0] + tid - offset;
    }

    // Every thread must have read its scan result before s_sum is reused to
    // hold keys.
    __syncthreads();
    s_sum[offset] = v;
    // Wait until all keys are in place before reading them back in tid order.
    __syncthreads();
    out_keys[tid] = s_sum[tid];
}

如果没有一个完整的可复现示例，以及比“不排序”更具体的问题描述，就很难说出您的代码可能有什么问题。

我不是 CUDA 专家，但这只是一个 4 行差异的问题，我认为不会太难。