C++ CUDA：所有矢量元素的级联求和_C++_Algorithm_Cuda_Add_Atomic

C++ CUDA：所有矢量元素的级联求和

c++ algorithm cuda

C++ CUDA：所有矢量元素的级联求和,c++,algorithm,cuda,add,atomic,C++,Algorithm,Cuda,Add,Atomic,我已经在我的GPU和CPU上实现了一个用于大向量浮点值的级联加法函数。这仅仅意味着这个向量壳的所有元素都可以被求和为一个结果。CPU算法非常简单，工作正常，但GPU算法总是偏离预期结果35200 算法的最小工作代码以及与CPU的比较如下输出总是这样的： CPU Time: 22.760059 ms, bandwidth: 3.514929 GB/s GPU Time (improved): 12.077088 ms, bandwidth: 6.624114 GB/s - CPU result

我已经在我的GPU和CPU上实现了一个用于大向量浮点值的级联加法函数。这仅仅意味着这个向量壳的所有元素都可以被求和为一个结果。CPU算法非常简单，工作正常，但GPU算法总是偏离预期结果35200

算法的最小工作代码以及与CPU的比较如下

输出总是这样的：

CPU Time: 22.760059 ms, bandwidth: 3.514929 GB/s

GPU Time (improved): 12.077088 ms, bandwidth: 6.624114 GB/s
- CPU result does not match GPU result in improved atomic add.
   CPU: 10000000.000000, GPU: 10035200.000000, diff:-35200.000000

我用cuda memcheck检查了它，但在运行过程中没有出现错误。我尝试过许多不同的方法，但都不奏效。这可能不是因为float数据类型的不准确，因为我将所有float都更改为int，仍然得到了完全相同的结果

这是我的代码：

#include "cuda_runtime.h"
#include "device_launch_parameters.h"

#include <chrono>
#include <time.h>
#include <stdio.h>
#include <stdlib.h>

void reductionWithCudaImproved(float *result, const float *input);
__global__ void reductionKernelImproved(float *result, const float *input);
void reductionCPU(float *result, const float *input);

#define SIZE 10000000

#define TILE 32

#define ILP 8
#define BLOCK_X_IMPR (TILE / ILP)
#define BLOCK_Y_IMPR 32
#define BLOCK_COUNT_X_IMPR 100

int main()
{
    int i;
    float *input;
    float resultCPU, resultGPU;
    double cpuTime, cpuBandwidth;

    input = (float*)malloc(SIZE * sizeof(float));
    resultCPU = 0.0;
    resultGPU = 0.0;

    srand((int)time(NULL));

    auto start = std::chrono::high_resolution_clock::now();
    auto end = std::chrono::high_resolution_clock::now();

    for (i = 0; i < SIZE; i++)
        input[i] = 1.0;

    start = std::chrono::high_resolution_clock::now();
    reductionCPU(&resultCPU, input);
    end = std::chrono::high_resolution_clock::now();

    std::chrono::duration<double> diff = end - start;
    cpuTime = (diff.count() * 1000);
    cpuBandwidth = (sizeof(float) * SIZE * 2) / (cpuTime * 1000000);
    printf("CPU Time: %f ms, bandwidth: %f GB/s\n\n", cpuTime, cpuBandwidth);

    reductionWithCudaImproved(&resultGPU, input);

    if (resultCPU != resultGPU)
        printf("- CPU result does not match GPU result in improved atomic add. CPU: %f, GPU: %f, diff:%f\n\n", resultCPU, resultGPU, (resultCPU - resultGPU));
    else
        printf("+ CPU result matches GPU result in improved atomic add. CPU: %f, GPU: %f\n\n", resultCPU, resultGPU);

    return 0;
}

void reductionCPU(float *result, const float *input)
{
    for (int i = 0; i < SIZE; i++)
        *result += input[i];
}

__global__ void reductionKernelImproved(float *result, const float *input)
{
    int i;
    int col = (blockDim.x * blockIdx.x + threadIdx.x) * ILP;
    int row = blockDim.y * blockIdx.y + threadIdx.y;
    int index = row * blockDim.x * BLOCK_COUNT_X_IMPR + col;
    __shared__ float interResult;

    if (threadIdx.x == 0 && threadIdx.y == 0)
        interResult = 0.0;

    __syncthreads();

#pragma unroll ILP
    for (i = 0; i < ILP; i++)
    {
        if (index < SIZE)
        {
            atomicAdd(&interResult, input[index]);
            index++;
        }
    }

    __syncthreads();

    if (threadIdx.x == 0 && threadIdx.y == 0)
        atomicAdd(result, interResult);
}

void reductionWithCudaImproved(float *result, const float *input)
{
    dim3 dim_grid, dim_block;

    float *dev_input = 0;
    float *dev_result = 0;
    cudaEvent_t start, stop;
    float elapsed = 0;
    double gpuBandwidth;

    dim_block.x = BLOCK_X_IMPR;
    dim_block.y = BLOCK_Y_IMPR;
    dim_block.z = 1;

    dim_grid.x = BLOCK_COUNT_X_IMPR;
    dim_grid.y = (int)ceil((float)SIZE / (float)(TILE * dim_block.y* BLOCK_COUNT_X_IMPR));
    dim_grid.z = 1;

    cudaSetDevice(0);

    cudaMalloc((void**)&dev_input, SIZE * sizeof(float));
    cudaMalloc((void**)&dev_result, sizeof(float));
    cudaMemcpy(dev_input, input, SIZE * sizeof(float), cudaMemcpyHostToDevice);
    cudaMemcpy(dev_result, result, sizeof(float), cudaMemcpyHostToDevice);

    cudaEventCreate(&start);
    cudaEventCreate(&stop);

    cudaEventRecord(start);
    reductionKernelImproved << <dim_grid, dim_block >> >(dev_result, dev_input);

    cudaEventRecord(stop);
    cudaEventSynchronize(stop);

    cudaEventElapsedTime(&elapsed, start, stop);

    gpuBandwidth = (sizeof(float) * SIZE * 2) / (elapsed * 1000000);
    printf("GPU Time (improved): %f ms, bandwidth: %f GB/s\n", elapsed, gpuBandwidth);

    cudaDeviceSynchronize();

    cudaMemcpy(result, dev_result, sizeof(float), cudaMemcpyDeviceToHost);

    cudaFree(dev_input);
    cudaFree(dev_result);

    return;
}

#包括“cuda_runtime.h”
#包括“设备启动参数.h”
#包括
#包括
#包括
#包括
通过CUDAIMPROVED（浮点*结果，常量浮点*输入）减少无效值；
__全局无效还原内核改进（浮点*结果，常量浮点*输入）；
无效还原CPU（浮点*结果，常量浮点*输入）；
#定义大小10000000
#定义磁贴32
#定义ILP 8
#定义块×改进（平铺/ILP）
#定义块_Y_IMPR 32
#定义块计数提升100
int main（）
{
int i；
浮点*输入；
浮动结果pu，结果pu；
双cpuTime、cpuBandwidth；
输入=（float*）malloc（SIZE*sizeof（float））；
结果pu=0.0；
结果pu=0.0；
srand（（int）time（NULL））；
自动启动=标准：：时钟：：高分辨率时钟：：现在（）；
自动结束=标准：：时钟：：高分辨率时钟：：现在（）；
对于（i=0；i（开发结果、开发输入）；
cudaEventRecord（停止）；
CUDAEVENTS同步（停止）；
CUDAEventReleasedTime（已用、开始、停止（&E））；
gpuBandwidth=（sizeof（float）*SIZE*2）/（已用时间*1000000）；
printf（“GPU时间（改进）：%f毫秒，带宽：%f GB/s\n”，已用时间，GPU带宽和宽度）；
cudaDeviceSynchronize（）；
cudaMemcpy（结果、开发结果、sizeof（float）、cudaMemcpyDeviceToHost）；
cudaFree（dev_输入）；
cudaFree（开发结果）；
返回；
}

我认为内核调用中存在重叠索引：

int col = (blockDim.x * blockIdx.x + threadIdx.x) * ILP;
int row = blockDim.y * blockIdx.y + threadIdx.y;
int index = row * blockDim.x * BLOCK_COUNT_X_IMPR + col;

如果我没有弄错的话，您的blockDim.x=4，BLOCK_COUNT_x_IMPR=100，因此每行将跳转400个索引。但是，您的col可以高达400*8

考虑：

blockIdx = (12, 0)
threadIdx = (3, 0)
=> col = (12*4 + 3) * 8 = 408
   row = 0
   index = 408

blockIdx = (0, 0)
threadIdx = (1, 1)
=> col = (0*4 + 1) * 8 = 8
   row = 1
   index = 1 * 400 + 8 = 408

所以我想你应该重写你的索引

// gridDim.x = BLOCK_COUNT_X_IMPR
int index = row * blockDim.x * gridDim.x * ILP + col;

我认为内核调用中存在重叠索引：

int col = (blockDim.x * blockIdx.x + threadIdx.x) * ILP;
int row = blockDim.y * blockIdx.y + threadIdx.y;
int index = row * blockDim.x * BLOCK_COUNT_X_IMPR + col;

如果我没有弄错的话，您的blockDim.x=4，BLOCK_COUNT_x_IMPR=100，因此每行将跳转400个索引。但是，您的col可以高达400*8

考虑：

blockIdx = (12, 0)
threadIdx = (3, 0)
=> col = (12*4 + 3) * 8 = 408
   row = 0
   index = 408

blockIdx = (0, 0)
threadIdx = (1, 1)
=> col = (0*4 + 1) * 8 = 8
   row = 1
   index = 1 * 400 + 8 = 408

所以我想你应该重写你的索引

// gridDim.x = BLOCK_COUNT_X_IMPR
int index = row * blockDim.x * gridDim.x * ILP + col;