Vector 如何在CUDA中对大小为64的数组进行向量缩减?

Vector 如何在CUDA中对大小为64的数组进行向量缩减?,vector,cuda,reduction,Vector,Cuda,Reduction,如何在CUDA中对大小为64的数组进行向量缩减 我的代码给出了预期答案的一半 __global__ void Reduce(double* in3,double* r,int size) { int id=blockIdx.x*blockDim.x + threadIdx.x; extern __shared__ double shareddata3[]; int tid=threadIdx.x; if(id<size) { shareddata3[tid]

如何在CUDA中对大小为64的数组进行向量归约(reduction)

我的代码给出了预期答案的一半

// Sums the first `size` elements of `in3` with a shared-memory tree reduction
// and stores the block's total in r[0].
//
// Launch requirements:
//   - dynamic shared memory of at least blockDim.x * sizeof(double) bytes
//     (third launch parameter);
//   - intended for a single block: thread 0 of every block writes r[0], so
//     with several blocks r[0] ends up holding only one block's partial sum.
//
// Parameters:
//   in3  - device pointer to the input values
//   r    - device pointer that receives the sum in r[0]
//   size - number of valid elements in in3
__global__ void Reduce(double* in3,double* r,int size)
{
  extern __shared__ double shareddata3[];

  const unsigned int tid = threadIdx.x;
  const int id = blockIdx.x*blockDim.x + threadIdx.x;

  // Threads past the end of the input contribute 0.0, so the reduction below
  // never reads an uninitialized shared-memory slot.
  shareddata3[tid] = (id < size) ? in3[id] : 0.0;
  __syncthreads();

  // Pairwise tree reduction that also works when blockDim.x is not a power of
  // two: each round folds the upper part [half, active) onto the lower part
  // and keeps `half` live partial sums, where half = ceil(active / 2).
  // The loop bounds depend only on blockDim.x, so every thread in the block
  // reaches each __syncthreads().
  for (unsigned int active = blockDim.x; active > 1; ) {
    const unsigned int half = (active + 1) / 2;
    if (tid + half < active) {
      shareddata3[tid] += shareddata3[tid + half];
    }
    __syncthreads();
    active = half;
  }

  // Slot 0 now holds the block's total.
  if (tid == 0) {
    r[0] = shareddata3[0];
  }
}
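__global__ void Reduce(double* in3,double* r,int size)
{
  int id=blockIdx.x*blockDim.x + threadIdx.x;

  extern __shared__ double shareddata3[];

  int tid=threadIdx.x;

  if(id<size) {
    shareddata3[tid] =in3[id];
  }
  __syncthreads();

  for (unsigned int s3=(blockDim.x/2); s3 >0; s3 = s3 >>1) {
    if (tid < s3) {
      shareddata3[tid] = shareddata3[tid] + shareddata3[tid+s3];
    }
    __syncthreads();
  }

  if(tid==0) {
    r[0]=shareddata3[0];
  }
}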
我的kernel启动方式是:

// Launches Reduce with 1 block of 64 threads; the third launch parameter
// (sharedmem3) is the dynamic shared-memory size in bytes.
// NOTE(review): the value of sharedmem3 is not shown — the kernel indexes one
// double per thread, so it presumably must be >= 64 * sizeof(double); confirm.
Reduce<<<1,64,sharedmem3>>>(d_array,g,64);
Reduce<<<1,64,sharedmem3>>>(d_array,g,64);

错误出在您没有展示给我们的那部分代码中。下面是一个完整的、可编译的代码示例:

#include "cuda_runtime.h"

#include <cstdio>
#include <cstdlib>
#include <iostream>
using namespace std;

// Number of elements in the input vector; main() also uses it as the
// thread-block size for the kernel launch.
const int size = 64;

// Forward declaration; the kernel body is defined after main().
__global__ void Reduce(double* in3, double* r, int size);

// Wraps a CUDA runtime call and aborts with file/line context when it fails.
// The do { ... } while (0) form makes the macro expand to a single statement,
// so `assertCudaSuccess(x);` is safe inside un-braced if/else bodies.
#define assertCudaSuccess(ans) do { _assertCudaSuccess((ans), __FILE__, __LINE__); } while (0)

// Checks a CUDA status code.
//   code - status returned by a CUDA runtime call
//   file - source file of the call site (const: __FILE__ is a string literal)
//   line - source line of the call site
// On any status other than cudaSuccess, prints the CUDA error string and the
// call site to stderr and terminates the process with `code` as exit status.
inline void _assertCudaSuccess(cudaError_t code, const char* file, int line)
{
  if (code != cudaSuccess) {
    std::fprintf(stderr, "CUDA Error: %s %s %d\n", cudaGetErrorString(code), file, line);
    std::exit(code);
  }
}

// Host driver: fills a `size`-element vector with 0, 1, ..., size-1, reduces
// it on the GPU with a single thread block, and prints the expected sum next
// to the sum computed by the kernel.
int main()
{
  const size_t inputBytes = size * sizeof(double);
  const size_t resultBytes = 1 * sizeof(double);

  // Device and pinned-host buffers for the single-value result.
  double* devResult;
  assertCudaSuccess(cudaMalloc(&devResult, resultBytes));

  double* hostResult;
  assertCudaSuccess(cudaMallocHost(&hostResult, resultBytes));

  // Device and pinned-host buffers for the input vector.
  double* devInput;
  assertCudaSuccess(cudaMalloc(&devInput, inputBytes));

  double* hostInput;
  assertCudaSuccess(cudaMallocHost(&hostInput, inputBytes));

  // Fill the input and accumulate the reference sum on the host.
  double referenceSum = 0;
  for (int idx = 0; idx < size; ++idx) {
    const double value = idx;
    hostInput[idx] = value;
    referenceSum += value;
  }
  cout << "Expected result: " << referenceSum << endl;

  assertCudaSuccess(cudaMemcpy(devInput, hostInput, inputBytes, cudaMemcpyHostToDevice));

  // One block of `size` threads, one double of dynamic shared memory per thread.
  Reduce<<<1, size, inputBytes>>>(devInput, devResult, size);

  // Report launch errors first, then errors raised while the kernel ran.
  assertCudaSuccess(cudaPeekAtLastError());
  assertCudaSuccess(cudaDeviceSynchronize());

  assertCudaSuccess(cudaMemcpy(hostResult, devResult, resultBytes, cudaMemcpyDeviceToHost));

  cout << "Actual result: " << hostResult[0] << endl;

  // Release buffers in the same order as the original listing.
  assertCudaSuccess(cudaFree(devResult));
  assertCudaSuccess(cudaFreeHost(hostResult));
  assertCudaSuccess(cudaFree(devInput));
  assertCudaSuccess(cudaFreeHost(hostInput));

  // Wait for a key press before exiting.
  cin.get();
  return 0;
}

// Sums the first `size` elements of `in3` with a shared-memory tree reduction
// and stores the block's total in r[0].
//
// Launch requirements:
//   - dynamic shared memory of at least blockDim.x * sizeof(double) bytes
//     (third launch parameter);
//   - intended for a single block: thread 0 of every block writes r[0], so
//     with several blocks r[0] ends up holding only one block's partial sum.
//
// Parameters:
//   in3  - device pointer to the input values
//   r    - device pointer that receives the sum in r[0]
//   size - number of valid elements in in3
__global__ void Reduce(double* in3, double* r, int size)
{
  extern __shared__ double shareddata3[];

  const unsigned int tid = threadIdx.x;
  const int id = blockIdx.x*blockDim.x + threadIdx.x;

  // Threads past the end of the input contribute 0.0, so the reduction below
  // never reads an uninitialized shared-memory slot.
  shareddata3[tid] = (id < size) ? in3[id] : 0.0;
  __syncthreads();

  // Pairwise tree reduction that also works when blockDim.x is not a power of
  // two: each round folds the upper part [half, active) onto the lower part
  // and keeps `half` live partial sums, where half = ceil(active / 2).
  // The loop bounds depend only on blockDim.x, so every thread in the block
  // reaches each __syncthreads().
  for (unsigned int active = blockDim.x; active > 1; ) {
    const unsigned int half = (active + 1) / 2;
    if (tid + half < active) {
      shareddata3[tid] += shareddata3[tid + half];
    }
    __syncthreads();
    active = half;
  }

  // Slot 0 now holds the block's total.
  if (tid == 0) {
    r[0] = shareddata3[0];
  }
}

如果您以后能花点时间把问题的格式整理好,我们将不胜感激 :) 另外,如果您已经知道要归约的向量大小是64,为什么还要把这个大小作为参数传入?可以把它用作模板参数……(参见相关资料,原链接已缺失)
Expected result: 2016
Actual result: 2016