C++ CUDA:所有矢量元素的级联求和
我已经在我的GPU和CPU上实现了一个用于大向量浮点值的级联加法函数。这仅仅意味着这个向量壳的所有元素都可以被求和为一个结果。CPU算法非常简单,工作正常,但GPU算法总是偏离预期结果35200 算法的最小工作代码以及与CPU的比较如下 输出总是这样的:C++ CUDA:所有矢量元素的级联求和,c++,algorithm,cuda,add,atomic,C++,Algorithm,Cuda,Add,Atomic,我已经在我的GPU和CPU上实现了一个用于大向量浮点值的级联加法函数。这仅仅意味着这个向量壳的所有元素都可以被求和为一个结果。CPU算法非常简单,工作正常,但GPU算法总是偏离预期结果35200 算法的最小工作代码以及与CPU的比较如下 输出总是这样的: CPU Time: 22.760059 ms, bandwidth: 3.514929 GB/s GPU Time (improved): 12.077088 ms, bandwidth: 6.624114 GB/s - CPU result
CPU Time: 22.760059 ms, bandwidth: 3.514929 GB/s
GPU Time (improved): 12.077088 ms, bandwidth: 6.624114 GB/s
- CPU result does not match GPU result in improved atomic add.
CPU: 10000000.000000, GPU: 10035200.000000, diff:-35200.000000
我用cuda memcheck检查了它,但在运行过程中没有出现错误。我尝试过许多不同的方法,但都不奏效。这可能不是因为float数据类型的不准确,因为我将所有float都更改为int,仍然得到了完全相同的结果
这是我的代码:
#include "cuda_runtime.h"
#include "device_launch_parameters.h"
#include <chrono>
#include <time.h>
#include <stdio.h>
#include <stdlib.h>
void reductionWithCudaImproved(float *result, const float *input);
__global__ void reductionKernelImproved(float *result, const float *input);
void reductionCPU(float *result, const float *input);
#define SIZE 10000000
#define TILE 32
#define ILP 8
#define BLOCK_X_IMPR (TILE / ILP)
#define BLOCK_Y_IMPR 32
#define BLOCK_COUNT_X_IMPR 100
int main()
{
int i;
float *input;
float resultCPU, resultGPU;
double cpuTime, cpuBandwidth;
input = (float*)malloc(SIZE * sizeof(float));
resultCPU = 0.0;
resultGPU = 0.0;
srand((int)time(NULL));
auto start = std::chrono::high_resolution_clock::now();
auto end = std::chrono::high_resolution_clock::now();
for (i = 0; i < SIZE; i++)
input[i] = 1.0;
start = std::chrono::high_resolution_clock::now();
reductionCPU(&resultCPU, input);
end = std::chrono::high_resolution_clock::now();
std::chrono::duration<double> diff = end - start;
cpuTime = (diff.count() * 1000);
cpuBandwidth = (sizeof(float) * SIZE * 2) / (cpuTime * 1000000);
printf("CPU Time: %f ms, bandwidth: %f GB/s\n\n", cpuTime, cpuBandwidth);
reductionWithCudaImproved(&resultGPU, input);
if (resultCPU != resultGPU)
printf("- CPU result does not match GPU result in improved atomic add. CPU: %f, GPU: %f, diff:%f\n\n", resultCPU, resultGPU, (resultCPU - resultGPU));
else
printf("+ CPU result matches GPU result in improved atomic add. CPU: %f, GPU: %f\n\n", resultCPU, resultGPU);
return 0;
}
void reductionCPU(float *result, const float *input)
{
for (int i = 0; i < SIZE; i++)
*result += input[i];
}
__global__ void reductionKernelImproved(float *result, const float *input)
{
int i;
int col = (blockDim.x * blockIdx.x + threadIdx.x) * ILP;
int row = blockDim.y * blockIdx.y + threadIdx.y;
int index = row * blockDim.x * BLOCK_COUNT_X_IMPR + col;
__shared__ float interResult;
if (threadIdx.x == 0 && threadIdx.y == 0)
interResult = 0.0;
__syncthreads();
#pragma unroll ILP
for (i = 0; i < ILP; i++)
{
if (index < SIZE)
{
atomicAdd(&interResult, input[index]);
index++;
}
}
__syncthreads();
if (threadIdx.x == 0 && threadIdx.y == 0)
atomicAdd(result, interResult);
}
void reductionWithCudaImproved(float *result, const float *input)
{
dim3 dim_grid, dim_block;
float *dev_input = 0;
float *dev_result = 0;
cudaEvent_t start, stop;
float elapsed = 0;
double gpuBandwidth;
dim_block.x = BLOCK_X_IMPR;
dim_block.y = BLOCK_Y_IMPR;
dim_block.z = 1;
dim_grid.x = BLOCK_COUNT_X_IMPR;
dim_grid.y = (int)ceil((float)SIZE / (float)(TILE * dim_block.y* BLOCK_COUNT_X_IMPR));
dim_grid.z = 1;
cudaSetDevice(0);
cudaMalloc((void**)&dev_input, SIZE * sizeof(float));
cudaMalloc((void**)&dev_result, sizeof(float));
cudaMemcpy(dev_input, input, SIZE * sizeof(float), cudaMemcpyHostToDevice);
cudaMemcpy(dev_result, result, sizeof(float), cudaMemcpyHostToDevice);
cudaEventCreate(&start);
cudaEventCreate(&stop);
cudaEventRecord(start);
reductionKernelImproved << <dim_grid, dim_block >> >(dev_result, dev_input);
cudaEventRecord(stop);
cudaEventSynchronize(stop);
cudaEventElapsedTime(&elapsed, start, stop);
gpuBandwidth = (sizeof(float) * SIZE * 2) / (elapsed * 1000000);
printf("GPU Time (improved): %f ms, bandwidth: %f GB/s\n", elapsed, gpuBandwidth);
cudaDeviceSynchronize();
cudaMemcpy(result, dev_result, sizeof(float), cudaMemcpyDeviceToHost);
cudaFree(dev_input);
cudaFree(dev_result);
return;
}
#包括“cuda_runtime.h”
#包括“设备启动参数.h”
#包括
#包括
#包括
#包括
通过CUDAIMPROVED(浮点*结果,常量浮点*输入)减少无效值;
__全局无效还原内核改进(浮点*结果,常量浮点*输入);
无效还原CPU(浮点*结果,常量浮点*输入);
#定义大小10000000
#定义磁贴32
#定义ILP 8
#定义块×改进(平铺/ILP)
#定义块_Y_IMPR 32
#定义块计数提升100
int main()
{
int i;
浮点*输入;
浮动结果pu,结果pu;
双cpuTime、cpuBandwidth;
输入=(float*)malloc(SIZE*sizeof(float));
结果pu=0.0;
结果pu=0.0;
srand((int)time(NULL));
自动启动=标准::时钟::高分辨率时钟::现在();
自动结束=标准::时钟::高分辨率时钟::现在();
对于(i=0;i(开发结果、开发输入);
cudaEventRecord(停止);
CUDAEVENTS同步(停止);
CUDAEventReleasedTime(已用、开始、停止(&E));
gpuBandwidth=(sizeof(float)*SIZE*2)/(已用时间*1000000);
printf(“GPU时间(改进):%f毫秒,带宽:%f GB/s\n”,已用时间,GPU带宽和宽度);
cudaDeviceSynchronize();
cudaMemcpy(结果、开发结果、sizeof(float)、cudaMemcpyDeviceToHost);
cudaFree(dev_输入);
cudaFree(开发结果);
返回;
}
我认为内核调用中存在重叠索引:
int col = (blockDim.x * blockIdx.x + threadIdx.x) * ILP;
int row = blockDim.y * blockIdx.y + threadIdx.y;
int index = row * blockDim.x * BLOCK_COUNT_X_IMPR + col;
如果我没有弄错的话,您的blockDim.x=4,BLOCK_COUNT_x_IMPR=100,因此每行将跳转400个索引。
但是,您的col可以高达400*8
考虑:
blockIdx = (12, 0)
threadIdx = (3, 0)
=> col = (12*4 + 3) * 8 = 408
row = 0
index = 408
blockIdx = (0, 0)
threadIdx = (1, 1)
=> col = (0*4 + 1) * 8 = 8
row = 1
index = 1 * 400 + 8 = 408
所以我想你应该重写你的索引
// gridDim.x = BLOCK_COUNT_X_IMPR
int index = row * blockDim.x * gridDim.x * ILP + col;
我认为内核调用中存在重叠索引:
int col = (blockDim.x * blockIdx.x + threadIdx.x) * ILP;
int row = blockDim.y * blockIdx.y + threadIdx.y;
int index = row * blockDim.x * BLOCK_COUNT_X_IMPR + col;
如果我没有弄错的话,您的blockDim.x=4,BLOCK_COUNT_x_IMPR=100,因此每行将跳转400个索引。
但是,您的col可以高达400*8
考虑:
blockIdx = (12, 0)
threadIdx = (3, 0)
=> col = (12*4 + 3) * 8 = 408
row = 0
index = 408
blockIdx = (0, 0)
threadIdx = (1, 1)
=> col = (0*4 + 1) * 8 = 8
row = 1
index = 1 * 400 + 8 = 408
所以我想你应该重写你的索引
// gridDim.x = BLOCK_COUNT_X_IMPR
int index = row * blockDim.x * gridDim.x * ILP + col;