gpugems3中的前缀扫描CUDA示例代码正确吗？_Cuda_Gpu_Nvidia_Prefix Sum

gpugems3中的前缀扫描CUDA示例代码正确吗？

cuda

gpugems3中的前缀扫描CUDA示例代码正确吗？,cuda,gpu,nvidia,prefix-sum,Cuda,Gpu,Nvidia,Prefix Sum,在《GPU Gems 3》一书第39章中，我编写了一段调用内核的代码然而，我得到的结果是一堆负数，而不是前缀扫描我的内核调用是错误的还是GPU Gems 3书中的代码有问题这是我的密码： #包括 #包括 #包括 __全局无效内核（int*g\u-odata，int*g\u-idata，int-n，int-dim） { extern _ushared _uuint temp[]；//在调用时分配 int thid=threadIdx.x； int offset=1； temp[2*thid]

在《GPU Gems 3》一书第39章中，我编写了一段调用内核的代码

然而，我得到的结果是一堆负数，而不是前缀扫描

我的内核调用是错误的还是GPU Gems 3书中的代码有问题

这是我的代码：

#包括
#包括
#包括
__全局无效内核（int*g\u-odata，int*g\u-idata，int-n，int-dim）
{
extern _ushared _uuint temp[]；//在调用时分配
int thid=threadIdx.x；
int offset=1；
temp[2*thid]=g_idata[2*thid]；//将输入加载到共享内存中
温度[2*thid+1]=g_idata[2*thid+1]；
对于（int d=n>>1；d>0；d>>=1）//在树上就地构建sum
{
__同步线程（）；
if（thid>=1；
__同步线程（）；
if（thid对于（j=0；j而言，在将GPU Gems 3中的代码转录到内核中时，您似乎至少犯了1个错误。此行不正确：
temp[bi] += g_idata[ai];

应该是：
temp[bi] += temp[ai];

当我对您现在发布的代码进行一次更改时，它似乎为我打印出了正确的（独占扫描）前缀和。我还想提及一些其他事项：
即使没有这样的改变，我也会得到一些接近正确的结果。因此，如果你得到的是完全不同的东西（例如负数），你的机器设置或CUDA安装可能会有问题。我建议使用比你现在更严格的CUDA错误检查（尽管机器设置方面的问题本应已在你现有的某项检查中暴露出来）。
所编写的例程有一些限制。它只能在单个线程块（threadblock）中使用，访问共享内存时会产生存储体冲突（bank conflict），并且数据集大小受限于单个线程块所能处理的规模（此例程每个线程生成两个输出元素，因此数据集大小应等于线程数的两倍）。如前所述，动态共享内存的分配需要与数据集一样大（即元素个数为线程数的两倍）。
这对于学习可能很有用，但是如果您想要一个健壮、快速的前缀扫描，建议您使用来自 Thrust 或 CUB 的例程来代替您自己的代码，即使您的代码是从这篇（较旧的）文章中派生出来的。
下面的代码与您的代码类似，但上面的问题已得到修复，我已将内核模板化以用于各种数据类型：
#include <stdio.h>
#include <stdlib.h>
// Number of elements processed by the demo program.
#define DSIZE 512

// cudaCheckErrors(msg): query the CUDA runtime for the most recent error via
// cudaGetLastError(). If one is pending, print `msg`, the runtime's error
// string and the file/line of the macro use to stderr, then terminate the
// process with exit status 1. When no error is pending the macro does nothing.
// The do { ... } while (0) wrapper makes the macro behave as one statement.
#define cudaCheckErrors(msg) \
    do { \
        const cudaError_t lastErr_ = cudaGetLastError(); \
        if (lastErr_ == cudaSuccess) break; \
        fprintf(stderr, "Fatal error: %s (%s at %s:%d)\n", \
                msg, cudaGetErrorString(lastErr_), \
                __FILE__, __LINE__); \
        fprintf(stderr, "*** FAILED - ABORTING\n"); \
        exit(1); \
    } while (0)


// Element type used by the host driver and the kernel instantiation.
typedef int mytype;

// Single-block, work-efficient exclusive prefix sum (up-sweep / down-sweep
// tree) of g_idata[0..n-1] into g_odata[0..n-1].
//
// Launch requirements:
//   * exactly one thread block (only threadIdx.x is used for indexing);
//   * dynamic shared memory of at least n * sizeof(T) bytes (3rd launch arg);
//   * n must be a power of two for the tree indexing to produce a scan.
//
// Parameters:
//   g_odata  device pointer receiving the n exclusive-scan results
//   g_idata  device pointer to the n input elements
//   n        number of elements
//
// Every phase iterates with a block-stride loop, so any block size works;
// the original configuration (blockDim.x == n/2) behaves exactly as before.
// The shared-memory access pattern is unpadded, so bank conflicts are not
// avoided.
template <typename T>
__global__ void prescan(T *g_odata, T *g_idata, int n)
{
  // Declared as raw bytes: `extern __shared__ T temp[]` would be redeclared
  // with a different type for every instantiation of this template and fail
  // to compile as soon as the kernel is used with a second element type.
  // NOTE(review): assumes T needs at most 16-byte alignment.
  extern __shared__ __align__(16) unsigned char prescan_smem[];
  T *temp = reinterpret_cast<T *>(prescan_smem);

  const int thid = threadIdx.x;
  const int nthreads = blockDim.x;

  // Load input into shared memory; the i < n bound keeps every access in range.
  for (int i = thid; i < n; i += nthreads)
  {
    temp[i] = g_idata[i];
  }

  int offset = 1;
  for (int d = n >> 1; d > 0; d >>= 1)   // build sum in place up the tree
  {
    __syncthreads();
    // The d pair-updates of one level touch disjoint elements, so they can
    // be distributed over however many threads the block has.
    for (int t = thid; t < d; t += nthreads)
    {
      int ai = offset*(2*t+1)-1;
      int bi = offset*(2*t+2)-1;
      temp[bi] += temp[ai];
    }
    offset *= 2;
  }

  // Clear the last element. Thread 0 is also the thread that last wrote
  // temp[n-1] (t == 0 at the final level), so no barrier is needed here.
  // The n > 0 test prevents a write to temp[-1] for an empty input.
  if (thid == 0 && n > 0) { temp[n - 1] = 0; }

  for (int d = 1; d < n; d *= 2)   // traverse down tree & build scan
  {
    offset >>= 1;
    __syncthreads();
    for (int t = thid; t < d; t += nthreads)
    {
      int ai = offset*(2*t+1)-1;
      int bi = offset*(2*t+2)-1;
      T left = temp[ai];
      temp[ai] = temp[bi];
      temp[bi] += left;
    }
  }
  __syncthreads();

  // Write results to device memory.
  for (int i = thid; i < n; i += nthreads)
  {
    g_odata[i] = temp[i];
  }
}

// Host driver: fills a DSIZE-element input with 0..DSIZE-1, runs prescan on
// the device in a single block of DSIZE/2 threads, and compares every output
// element against an exclusive prefix sum computed on the host.
// Returns 0 when all elements match, 1 on allocation failure or on the first
// mismatch (which is reported on stdout). CUDA runtime errors abort through
// cudaCheckErrors.
int main(){

  mytype *h_i = NULL, *d_i = NULL, *h_o = NULL, *d_o = NULL;
  int dszp = (DSIZE)*sizeof(mytype);

  h_i = (mytype *)malloc(dszp);
  h_o = (mytype *)malloc(dszp);
  if ((h_i == NULL) || (h_o == NULL)) {
    printf("malloc fail\n");
    free(h_i);   // free(NULL) is a no-op, so both calls are safe
    free(h_o);
    return 1;
  }
  cudaMalloc(&d_i, dszp);
  cudaMalloc(&d_o, dszp);
  cudaCheckErrors("cudaMalloc fail");
  for (int i = 0 ; i < DSIZE; i++){
    h_i[i] = i;
    h_o[i] = 0;}
  cudaMemset(d_o, 0, dszp);
  cudaCheckErrors("cudaMemset fail");
  cudaMemcpy(d_i, h_i, dszp, cudaMemcpyHostToDevice);
  cudaCheckErrors("cudaMemcpy 1 fail");
  // One block, two elements per thread, one shared-memory slot per element.
  prescan<<<1,DSIZE/2, dszp>>>(d_o, d_i, DSIZE);
  cudaDeviceSynchronize();
  cudaCheckErrors("kernel fail");
  cudaMemcpy(h_o, d_o, dszp, cudaMemcpyDeviceToHost);
  cudaCheckErrors("cudaMemcpy 2 fail");

  // Verify against a host-side exclusive scan. The loop starts at index 0 so
  // that the first output element (which must equal 0) is checked as well.
  int status = 0;
  mytype psum = 0;
  for (int i = 0; i < DSIZE; i++){
    if (i > 0) psum += h_i[i-1];
    if (psum != h_o[i]) {
      printf("mismatch at %d, was: %d, should be: %d\n", i, h_o[i], psum);
      status = 1;
      break;
    }
  }

  // Release device and host buffers on both the success and mismatch paths.
  cudaFree(d_i);
  cudaFree(d_o);
  free(h_i);
  free(h_o);
  return status;
}

#包括
#定义DSIZE512
#定义cudaCheckErrors（msg）\
做{\
cudaError\u t\u err=cudaGetLastError（）\
如果（_err！=cudaSuccess）{\
fprintf（标准，“致命错误：%s（%s位于%s:%d）\n”\
msg，cudaGetErrorString（_err）\
__文件（行）\
fprintf（stderr，“***失败-中止\n”）\
出口（1）\
} \
}而（0）
typedef int-mytype；
模板
__全局无效预扫描（T*g\U odata，T*g\U idata，int n）
{
extern _u共享_ut temp[]；//在调用时分配
int thid=threadIdx.x；
int offset=1；
temp[2*thid]=g_idata[2*thid]；//将输入加载到共享内存中
温度[2*thid+1]=g_idata[2*thid+1]；
对于（int d=n>>1；d>0；d>>=1）//在树上就地构建sum
{
__同步线程（）；
if（thid>=1；
__同步线程（）；
if（thid
在将GPU Gems 3中的代码转录到内核中时，您似乎至少犯了1个错误。此行不正确：
temp[bi] += g_idata[ai];

应该是：
temp[bi] += temp[ai];

当我对您现在发布的代码进行一次更改时，它似乎为我打印出了正确的（独占扫描）前缀和。我还想提及一些其他事项：
即使没有这个改变，我也会得到一些接近正确的结果。所以如果你得到的是完全不同的东西（例如否定的）