如何在 CUDA 中对大小为 64 的数组进行向量归约(reduction)?
如何在 CUDA 中对大小为 64 的数组进行向量归约?我的代码给出的结果只有预期答案的一半。(标签:vector, cuda, reduction)
/**
 * Sums the elements of in3 handled by one thread block and stores the
 * total in r[0].
 *
 * Launch requirements:
 *   - 1-D block; dynamic shared memory of at least
 *     blockDim.x * sizeof(double) bytes (third launch argument).
 *   - Every block writes its own partial sum to r[0], so the result is only
 *     the full sum when a single block covers all `size` elements
 *     (gridDim.x == 1 and blockDim.x >= size).
 *
 * @param in3   device pointer to the input values (at least `size` doubles)
 * @param r     device pointer receiving the sum in r[0]
 * @param size  number of valid elements in in3
 */
__global__ void Reduce(double* in3,double* r,int size)
{
    extern __shared__ double shareddata3[];

    const unsigned int tid = threadIdx.x;
    const int id = blockIdx.x*blockDim.x + threadIdx.x;

    // Threads past the end of the input contribute the additive identity;
    // otherwise their shared-memory slot would be read uninitialized below.
    shareddata3[tid] = (id < size) ? in3[id] : 0.0;
    __syncthreads();

    // Tree reduction over the first `active` slots. Rounding the half up keeps
    // the middle element when `active` is odd, so block sizes that are not a
    // power of two are summed completely. For power-of-two block sizes the
    // pairing (tid, tid + active/2) is the classic halving scheme.
    for (unsigned int active = blockDim.x; active > 1; ) {
        const unsigned int half = (active + 1) / 2;
        if (tid + half < active) {
            shareddata3[tid] += shareddata3[tid + half];
        }
        // Barrier is outside the branch so every thread of the block reaches it.
        __syncthreads();
        active = half;
    }

    if (tid == 0) {
        r[0] = shareddata3[0];
    }
}
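__global__ void Reduce(double* in3,double* r,int size)
{
int id=blockIdx.x*blockDim.x + threadIdx.x;
extern __shared__ double shareddata3[];
int tid=threadIdx.x;
if(id<size) {
shareddata3[tid] =in3[id];
}
__syncthreads();
for (unsigned int s3=(blockDim.x/2); s3 >0; s3 = s3 >>1) {
if (tid < s3) {
shareddata3[tid] = shareddata3[tid] + shareddata3[tid+s3];
}
__syncthreads();
}
if(tid==0) {
r[0]=shareddata3[0];
}
}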
我的内核(kernel)启动方式是:
// Launches Reduce with 1 block of 64 threads over 64 elements.
// NOTE(review): sharedmem3 is not shown here. The kernel indexes
// shareddata3[tid] as double for every thread, so sharedmem3 must be at
// least 64 * sizeof(double) bytes -- confirm how it is computed.
Reduce<<<1,64,sharedmem3>>>(d_array,g,64);
Reduce<<<1,64,sharedmem3>>>(d_array,g,64);
错误出在您没有展示给我们的那部分代码中。下面是一个完整的、可编译的代码示例:
#include "cuda_runtime.h"
#include <cstdio>
#include <cstdlib>
#include <iostream>
// Brings the std names used by main (cout, endl, cin) into scope.
using namespace std;
// Number of input elements. main also uses it as the thread count of the
// single block and to size the dynamic shared memory of the launch.
const int size(64);
// Forward declaration of the reduction kernel; the definition follows main.
__global__ void Reduce(double* in3,double* r,int size);
/**
 * Checks the result of a CUDA runtime call and aborts the program on failure.
 *
 * The macro records the call site (__FILE__, __LINE__) and is wrapped in
 * do { } while (0) so that it behaves like a single statement, e.g. inside an
 * un-braced if/else.
 */
#define assertCudaSuccess(ans) do { _assertCudaSuccess((ans), __FILE__, __LINE__); } while (0)

/**
 * Prints the CUDA error string together with the call site to stderr and
 * exits with the error code as process status when `code` is not cudaSuccess.
 *
 * @param code  status returned by a CUDA runtime call
 * @param file  source file of the call site (const: __FILE__ is a string literal)
 * @param line  source line of the call site
 */
inline void _assertCudaSuccess(cudaError_t code, const char *file, int line)
{
    if (code != cudaSuccess) {
        fprintf(stderr,"CUDA Error: %s %s %d\n", cudaGetErrorString(code), file, line);
        exit(code);
    }
}
// Host driver: fills a pinned host buffer with 0, 1, ..., size-1, reduces it
// on the device with a single block of `size` threads, and prints the
// host-computed sum next to the device result.
int main()
{
    const size_t sumBytes = 1 * sizeof(double);
    const size_t inputBytes = size * sizeof(double);

    // Device / pinned-host buffers for the single-value result.
    double* devSum;
    assertCudaSuccess(cudaMalloc(&devSum, sumBytes));
    double* hostSum;
    assertCudaSuccess(cudaMallocHost(&hostSum, sumBytes));

    // Device / pinned-host buffers for the input vector.
    double* devInput;
    assertCudaSuccess(cudaMalloc(&devInput, inputBytes));
    double* hostInput;
    assertCudaSuccess(cudaMallocHost(&hostInput, inputBytes));

    // Build the input and accumulate the reference sum on the host.
    double reference = 0;
    for (int k = 0; k < size; ++k) {
        hostInput[k] = k;
        reference += k;
    }
    cout << "Expected result: " << reference << endl;

    assertCudaSuccess(cudaMemcpy(devInput, hostInput, inputBytes, cudaMemcpyHostToDevice));

    // One block, one thread per element, one shared-memory double per thread.
    Reduce<<<1, size, inputBytes>>>(devInput, devSum, size);
    assertCudaSuccess(cudaPeekAtLastError());   // launch errors
    assertCudaSuccess(cudaDeviceSynchronize()); // errors raised while the kernel ran

    assertCudaSuccess(cudaMemcpy(hostSum, devSum, sumBytes, cudaMemcpyDeviceToHost));
    cout << "Actual result: " << hostSum[0] << endl;

    assertCudaSuccess(cudaFree(devSum));
    assertCudaSuccess(cudaFreeHost(hostSum));
    assertCudaSuccess(cudaFree(devInput));
    assertCudaSuccess(cudaFreeHost(hostInput));

    // Wait for a key press before exiting.
    cin.get();
    return 0;
}
/**
 * Sums the elements of in3 handled by one thread block and stores the
 * total in r[0].
 *
 * Launch requirements:
 *   - 1-D block; dynamic shared memory of at least
 *     blockDim.x * sizeof(double) bytes (third launch argument).
 *   - Every block writes its own partial sum to r[0], so the result is only
 *     the full sum when a single block covers all `size` elements
 *     (gridDim.x == 1 and blockDim.x >= size).
 *
 * @param in3   device pointer to the input values (at least `size` doubles)
 * @param r     device pointer receiving the sum in r[0]
 * @param size  number of valid elements in in3
 */
__global__ void Reduce(double* in3, double* r, int size)
{
    extern __shared__ double shareddata3[];

    const unsigned int tid = threadIdx.x;
    const int id = blockIdx.x*blockDim.x + threadIdx.x;

    // Threads past the end of the input contribute the additive identity;
    // otherwise their shared-memory slot would be read uninitialized below.
    shareddata3[tid] = (id < size) ? in3[id] : 0.0;
    __syncthreads();

    // Tree reduction over the first `active` slots. Rounding the half up keeps
    // the middle element when `active` is odd, so block sizes that are not a
    // power of two are summed completely. For power-of-two block sizes the
    // pairing (tid, tid + active/2) is the classic halving scheme.
    for (unsigned int active = blockDim.x; active > 1; ) {
        const unsigned int half = (active + 1) / 2;
        if (tid + half < active) {
            shareddata3[tid] += shareddata3[tid + half];
        }
        // Barrier is outside the branch so every thread of the block reaches it.
        __syncthreads();
        active = half;
    }

    if (tid == 0) {
        r[0] = shareddata3[0];
    }
}
如果您以后能抽出时间把问题的格式整理好,我们将不胜感激 :)
既然您已经知道要归约的向量大小是 64,为什么还要把这个大小作为参数传入?可以把它作为模板参数……参见(原文此处的链接缺失)。
Expected result: 2016
Actual result: 2016