Cuda 为什么我的包容性扫描代码在CPU上比在GPU上快2倍？_Cuda_Sum_Cumulative Sum_Cub

Cuda 为什么我的包容性扫描代码在CPU上比在GPU上快2倍？

cuda

Cuda 为什么我的包容性扫描代码在CPU上比在GPU上快2倍？,cuda,sum,cumulative-sum,cub,Cuda,Sum,Cumulative Sum,Cub,我编写了一个简短的CUDA程序，它使用来演示一个旧的四核Intel Q6600处理器的一个内核（所有四个处理器都应该能够达到~30gflops/sec）可以比Nvidia 750 Ti（应该能够达到1306gflops/sec的单精度）更快地对100000个元素进行包容性扫描（或者累积/前缀和）. 为什么会这样源代码是： #include "cuda_runtime.h" #include "device_launch_parameters.h" #include <cub/cub.c

我编写了一个简短的CUDA程序，它使用来演示一个旧的四核Intel Q6600处理器的一个内核（所有四个处理器都应该能够达到~30gflops/sec）可以比Nvidia 750 Ti（应该能够达到1306gflops/sec的单精度）更快地对100000个元素进行包容性扫描（或者累积/前缀和）. 为什么会这样

源代码是：

#include "cuda_runtime.h"
#include "device_launch_parameters.h"
#include <cub/cub.cuh>

#include <stdio.h>
#include <time.h>
#include <algorithm>


#define gpuErrchk(ans) { gpuAssert((ans), __FILE__, __LINE__); }
inline void gpuAssert(cudaError_t code, const char *file, int line, bool abort = true)
{
    if (code != cudaSuccess)
    {
        fprintf(stderr, "GPUassert: %s %s %d\n", cudaGetErrorString(code), file, line);
        if (abort) exit(code);
    }
}

void fillArrayWithRandom(float* inputArray, int inputN)
{
    for (int i = 0; i < inputN; i++)
    {
        inputArray[i] = (float)rand() / float(RAND_MAX);
    }
}

void inclusiveSum_CPU(float *inputArray, float *inputSummedArray, int inputN)
{
    for (int i = 0; i < inputN; i++)
    {
        if (i > 0)
        {
            inputSummedArray[i] = inputSummedArray[i - 1] + inputArray[i];
        }
        else
        {
            inputSummedArray[i] = inputArray[i];
        }
    }
}

int main()
{
    int N = 100000; //1 hundred thousand elements
    float numSimulations = 10000;

    //Make Host Arrays
    float* testArray_CPU = (float *)malloc(sizeof(float)*N);
    fillArrayWithRandom(testArray_CPU, N);
    float* testArrayOutput_CPU = (float *)malloc(sizeof(float)*N);

    //Make GPU Arrays
    float* testArray_GPU;
    gpuErrchk(cudaMalloc(&testArray_GPU, N*sizeof(float)));
    gpuErrchk(cudaMemcpy(testArray_GPU, testArray_CPU, N*sizeof(float), cudaMemcpyHostToDevice));
    float* testArrayOutput_GPU;
    gpuErrchk(cudaMalloc(&testArrayOutput_GPU, N*sizeof(float)));

    //Initiate the benchmark variables
    clock_t begin_CPU, end_CPU;
    float time_spent_GPU, time_spent_CPU;

    //GPU prep
    void     *d_temp_storage = NULL;
    size_t   temp_storage_bytes = 0;
    cub::DeviceScan::InclusiveSum(d_temp_storage, temp_storage_bytes, testArray_GPU, testArrayOutput_GPU, N);
    gpuErrchk(cudaMalloc(&d_temp_storage, temp_storage_bytes));

    //GPU Timing

    cudaEvent_t start, stop;
    gpuErrchk(cudaEventCreate(&start));
    gpuErrchk(cudaEventCreate(&stop));
    gpuErrchk(cudaEventRecord(start, 0));
    for (int i = 0; i < numSimulations; i++)
    {
        cub::DeviceScan::InclusiveSum(d_temp_storage, temp_storage_bytes, testArray_GPU, testArrayOutput_GPU, N);
    }
    gpuErrchk(cudaDeviceSynchronize());
    gpuErrchk(cudaEventRecord(stop, 0));
    gpuErrchk(cudaEventSynchronize(stop));
    gpuErrchk(cudaEventElapsedTime(&time_spent_GPU, start, stop));

    cudaError_t error = cudaGetLastError();
    if (error != cudaSuccess)
    {
        printf("CUDA error: %s\n", cudaGetErrorString(error));
        exit(-1);
    }

    time_spent_GPU = (float)(time_spent_GPU / 1000);
    float avg_GPU = time_spent_GPU / numSimulations;
    printf("Avg. GPU Simulation Time: %.17g [sim/sec]\n", avg_GPU);

    //CPU Timing
    begin_CPU = clock();
    for (int i = 0; i < numSimulations; i++)
    {
        inclusiveSum_CPU(testArray_CPU, testArrayOutput_CPU, N);
    }
    end_CPU = clock();
    time_spent_CPU = (float)(end_CPU - begin_CPU) / CLOCKS_PER_SEC;
    float avg_CPU = time_spent_CPU / numSimulations;
    printf("Avg. CPU Simulation Time: %.17g [sim/sec]\n", avg_CPU);

    printf("GPU/CPU Timing:%.17gx \n", avg_GPU / avg_CPU);

    return 0;
}

多亏了Robert Crovella，我才发现我使用的是“调试”模式，这是出了名的慢，而不是“发布”模式。

你好！它实际上一直都是10万条——我只是把我发布的最初链接中的评论搞乱了，但几分钟后我就把它修好了。是的，精确的2倍加速是令人惊讶的，但这确实是饼干破碎的原因。我觉得clock（）这个东西有点让人毛骨悚然，但我还是用CUDA的计时例程更新了代码。我省略了错误检查，因为示例的目的是为了简单，而不是为了生产质量代码。根据要求，我增加了模拟计数（尽管它没有实质性的影响）并加入了代码。新运行的输出和CUDA计时是：平均GPU模拟时间：0.0010889752302318811[sim/sec]平均CPU模拟时间：0.0005685999640263617[sim/sec]GPU/CPU计时：1.9151870012283325x，你用的是什么CUDA卡？是的，就是它。不要构建调试配置。构建发布配置。设备代码的调试开关（

-G

）会抑制大多数编译器优化，这会显著提高执行速度。作为补充说明，windows在这里可能有轻微的影响。如果您想要比较perf，您可以通过将迭代计数减少到100，但将元素从100000增加到1000000来减轻这种影响。这将减少内核调用，但增加每个内核的工作，以分摊windows开销。但是，

-G

开关是主要的东西，有人为这场闹剧添加了答案。要么删除它，要么删除它。你总是让我发笑@Talonmes我很高兴你没有“离开大楼”。有趣的是，“调试模式”会让GPU的执行速度慢于CPU的执行速度。我不太记得NVIDIA的医生在任何地方说过这样的话。

1>------ Build started: Project: speedTest, Configuration: Debug Win32 ------
1>  Compiling CUDA source file kernel.cu...
1>  
1>  C:\Users\Owner\Documents\Visual Studio 2013\Projects\speedTest\speedTest>"C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v6.5\bin\nvcc.exe" -gencode=arch=compute_50,code=\"sm_50,compute_50\" --use-local-env --cl-version 2013 -ccbin "C:\Program Files (x86)\Microsoft Visual Studio 12.0\VC\bin" -rdc=true -I"C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v6.5\include" -I"C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v6.5\include"  -G   --keep-dir Debug -maxrregcount=0  --machine 32 --compile -cudart static  -g   -DWIN32 -D_DEBUG -D_CONSOLE -D_MBCS -Xcompiler "/EHsc /W3 /nologo /Od /Zi /RTC1 /MDd  " -o Debug\kernel.cu.obj "C:\Users\Owner\Documents\Visual Studio 2013\Projects\speedTest\speedTest\kernel.cu" 
1>  kernel.cu
1>  
1>  C:\Users\Owner\Documents\Visual Studio 2013\Projects\speedTest\speedTest>"C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v6.5\bin\nvcc.exe" -dlink -o Debug\speedTest.device-link.obj -Xcompiler "/EHsc /W3 /nologo /Od /Zi /RTC1 /MDd  " -L"C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v6.5\lib\Win32" cudart.lib kernel32.lib user32.lib gdi32.lib winspool.lib comdlg32.lib advapi32.lib shell32.lib ole32.lib oleaut32.lib uuid.lib odbc32.lib odbccp32.lib  -gencode=arch=compute_50,code=sm_50 -G --machine 32 Debug\kernel.cu.obj 
1>  cudart.lib
1>  kernel32.lib
1>  user32.lib
1>  gdi32.lib
1>  winspool.lib
1>  comdlg32.lib
1>  advapi32.lib
1>  shell32.lib
1>  ole32.lib
1>  oleaut32.lib
1>  uuid.lib
1>  odbc32.lib
1>  odbccp32.lib
1>  kernel.cu.obj
1>  speedTest.vcxproj -> C:\Users\Owner\Documents\Visual Studio 2013\Projects\speedTest\Debug\speedTest.exe
1>  copy "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v6.5\bin\cudart*.dll" "C:\Users\Owner\Documents\Visual Studio 2013\Projects\speedTest\Debug\"
1>  C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v6.5\bin\cudart32_65.dll
1>  C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v6.5\bin\cudart64_65.dll
1>          2 file(s) copied.
========== Build: 1 succeeded, 0 failed, 0 up-to-date, 0 skipped ==========