CUDA的示例程序GPU版本运行速度较慢或几乎与CPU版本相同_Cuda

CUDA的示例程序GPU版本运行速度较慢或几乎与CPU版本相同

cuda

CUDA的示例程序GPU版本运行速度较慢或几乎与CPU版本相同,cuda,Cuda,我正在读杰森·桑德斯和爱德华·坎德罗特写的《CUDA的榜样》。到目前为止，每一个程序都在我的电脑上顺利、正确地运行，直到我坚持使用第九章的程序这两个方案如下：第一个程序仅在CPU上运行，hist\u CPU.cu： /* * Copyright 1993-2010 NVIDIA Corporation. All rights reserved. * * NVIDIA Corporation and its licensors retain all intellectual prop

我正在读Jason Sanders和Edward Kandrot写的《CUDA by Example》。到目前为止，每一个程序都在我的电脑上顺利、正确地运行，直到我卡在了第九章的程序上

这两个程序如下：

第一个程序仅在CPU上运行，hist_cpu.cu：

/*
 * Copyright 1993-2010 NVIDIA Corporation.  All rights reserved.
 *
 * NVIDIA Corporation and its licensors retain all intellectual property and 
 * proprietary rights in and to this software and related documentation. 
 * Any use, reproduction, disclosure, or distribution of this software 
 * and related documentation without an express license agreement from
 * NVIDIA Corporation is strictly prohibited.
 *
 * Please refer to the applicable NVIDIA end user license agreement (EULA) 
 * associated with this source code for terms and conditions that govern 
 * your use of this NVIDIA software.
 * 
 */


#include "../common/book.h"

#define SIZE    (100*1024*1024)

int main( void ) {
    // SIZE random bytes that will be histogrammed
    unsigned char *data = (unsigned char*)big_random_block( SIZE );

    // the timed region starts here (clock() ticks)
    const clock_t   t_begin = clock();

    // one counter per possible byte value, all starting at zero
    unsigned int    bins[256] = { 0 };

    // walk the buffer once and bump the counter of every byte seen
    const unsigned char *cursor = data;
    const unsigned char *const finish = data + SIZE;
    while (cursor != finish) {
        ++bins[*cursor];
        ++cursor;
    }

    // end of the timed region; convert clock ticks to milliseconds
    const clock_t   t_end = clock();
    float   elapsedMs = (float)(t_end - t_begin) /
                        (float)CLOCKS_PER_SEC * 1000.0f;
    printf( "Time to generate:  %3.1f ms\n", elapsedMs );

    // sanity check: the counters together must account for every byte
    long total = 0;
    for (int bin = 255; bin >= 0; --bin)
        total += bins[bin];
    printf( "Histogram Sum:  %ld\n", total );

    free( data );
    return 0;
}

/*
 * Copyright 1993-2010 NVIDIA Corporation.  All rights reserved.
 *
 * NVIDIA Corporation and its licensors retain all intellectual property and 
 * proprietary rights in and to this software and related documentation. 
 * Any use, reproduction, disclosure, or distribution of this software 
 * and related documentation without an express license agreement from
 * NVIDIA Corporation is strictly prohibited.
 *
 * Please refer to the applicable NVIDIA end user license agreement (EULA) 
 * associated with this source code for terms and conditions that govern 
 * your use of this NVIDIA software.
 * 
 */


#include "../common/book.h"

#define SIZE    (100*1024*1024)


/*
 * histo_kernel: counts how often each byte value occurs in
 * buffer[0 .. size) and adds the counts into histo[0 .. 256).
 *
 * Every block accumulates into a private 256-bin histogram held in
 * static shared memory (256 * sizeof(unsigned int) bytes), then merges
 * it into the global histogram with one global atomicAdd per bin.
 *
 * Launch layout: 1D grid of 1D blocks.  Any block size works; the
 * original version was only correct with exactly 256 threads per block
 * (fewer threads left bins uninitialised and unmerged, more threads
 * indexed past the end of temp and histo).
 *
 * Preconditions: histo must already hold the values to accumulate into
 * (normally all zeros).  Uses atomicAdd on shared memory, which is not
 * available on the very earliest compute capabilities.
 */
__global__ void histo_kernel( unsigned char *buffer,
                              long size,
                              unsigned int *histo ) {

    // clear out the accumulation buffer called temp; the block-stride
    // loop covers all 256 bins whatever the block size is
    __shared__  unsigned int temp[256];
    for (unsigned int bin = threadIdx.x; bin < 256; bin += blockDim.x)
        temp[bin] = 0;
    __syncthreads();

    // calculate the starting index and the offset to the next
    // element that each thread will be processing; 64-bit arithmetic
    // keeps the index from overflowing when size approaches 2^31
    long long i = threadIdx.x + (long long)blockIdx.x * blockDim.x;
    long long stride = (long long)blockDim.x * gridDim.x;
    while (i < size) {
        atomicAdd( &temp[buffer[i]], 1u );
        i += stride;
    }

    // wait until every thread of the block has finished updating the
    // shared histogram, then add the shared memory values to the
    // values from the other thread blocks using global memory
    // atomic adds
    __syncthreads();
    for (unsigned int bin = threadIdx.x; bin < 256; bin += blockDim.x)
        atomicAdd( &(histo[bin]), temp[bin] );
}

/*
 * GPU histogram driver.
 *
 * Generates SIZE random bytes on the host, copies them to the device,
 * runs histo_kernel, copies the 256-bin result back, prints the elapsed
 * time (measured with CUDA events, including allocation and both
 * copies) and the histogram sum, then verifies the result against a
 * CPU recount.  Returns 0; CUDA failures are reported by HANDLE_ERROR.
 */
int main( void ) {
    unsigned char *buffer =
                     (unsigned char*)big_random_block( SIZE );

    // capture the start time
    // starting the timer here so that we include the cost of
    // all of the operations on the GPU.  if the data were
    // already on the GPU and we just timed the kernel
    // the timing would drop from 74 ms to 15 ms.  Very fast.
    cudaEvent_t     start, stop;
    HANDLE_ERROR( cudaEventCreate( &start ) );
    HANDLE_ERROR( cudaEventCreate( &stop ) );
    HANDLE_ERROR( cudaEventRecord( start, 0 ) );

    // allocate memory on the GPU for the file's data
    unsigned char *dev_buffer;
    unsigned int *dev_histo;
    HANDLE_ERROR( cudaMalloc( (void**)&dev_buffer, SIZE ) );
    HANDLE_ERROR( cudaMemcpy( dev_buffer, buffer, SIZE,
                              cudaMemcpyHostToDevice ) );

    // the device histogram must start at zero because the kernel
    // only ever adds to it
    HANDLE_ERROR( cudaMalloc( (void**)&dev_histo,
                              256 * sizeof( int ) ) );
    HANDLE_ERROR( cudaMemset( dev_histo, 0,
                              256 * sizeof( int ) ) );

    // kernel launch - 2x the number of mps gave best timing
    cudaDeviceProp  prop;
    HANDLE_ERROR( cudaGetDeviceProperties( &prop, 0 ) );
    int blocks = prop.multiProcessorCount;
    histo_kernel<<<blocks*2,256>>>( dev_buffer,
                                    SIZE, dev_histo );
    // a kernel launch returns no status of its own; a rejected launch
    // is only visible through cudaGetLastError().  Without this check
    // a failed launch would just produce an all-zero histogram.
    HANDLE_ERROR( cudaGetLastError() );

    // blocking copy: also waits for the kernel to finish
    unsigned int    histo[256];
    HANDLE_ERROR( cudaMemcpy( histo, dev_histo,
                              256 * sizeof( int ),
                              cudaMemcpyDeviceToHost ) );

    // get stop time, and display the timing results
    HANDLE_ERROR( cudaEventRecord( stop, 0 ) );
    HANDLE_ERROR( cudaEventSynchronize( stop ) );
    float   elapsedTime;
    HANDLE_ERROR( cudaEventElapsedTime( &elapsedTime,
                                        start, stop ) );
    printf( "Time to generate:  %3.1f ms\n", elapsedTime );

    long histoCount = 0;
    for (int i=0; i<256; i++) {
        histoCount += histo[i];
    }
    printf( "Histogram Sum:  %ld\n", histoCount );

    // verify that we have the same counts via CPU: subtracting one
    // per input byte must bring every bin back to exactly zero
    for (int i=0; i<SIZE; i++)
        histo[buffer[i]]--;
    for (int i=0; i<256; i++) {
        if (histo[i] != 0)
            printf( "Failure at %d!\n", i );
    }

    // release events and device memory, checking these calls like
    // every other runtime call in this function
    HANDLE_ERROR( cudaEventDestroy( start ) );
    HANDLE_ERROR( cudaEventDestroy( stop ) );
    HANDLE_ERROR( cudaFree( dev_histo ) );
    HANDLE_ERROR( cudaFree( dev_buffer ) );
    free( buffer );
    return 0;
}

2）Ubuntu 10.04（Lucid LTS）+GeForce GT 630

安装CUDA时显示文件“cudatoolkit_3.2.16_linux_64_ubuntu10.04.run”和“nvcc--version”

这两个平台都产生了CPU程序和GPU程序运行数百毫秒的结果。GPU有时甚至比CPU还长

有人能看出是什么原因吗？

这是我在visual studio 2010中看到的：

CUDA C/C++命令行：

# (Approximate command-line.  Settings inherited from host are not visible below.)
# (Please see the output window after a build for the full command-line)

# Driver API (NVCC Compilation Type is .cubin, .gpu, or .ptx)
set CUDAFE_FLAGS=--sdk_dir "C:\Program Files (x86)\Microsoft SDKs\Windows\v7.0A\"
"D:\NVIDIA\CUDA\CUDAToolkit\bin\nvcc.exe" --use-local-env --cl-version 2010 -ccbin "D:\Microsoft Visual Studio 10.0\VC\bin\x86_amd64"        --keep-dir x64\Release -maxrregcount=0  --machine 64 --compile -cudart static  -o x64\Release\%(Filename)%(Extension).obj "%(FullPath)"

# Runtime API (NVCC Compilation Type is hybrid object or .c file)
set CUDAFE_FLAGS=--sdk_dir "C:\Program Files (x86)\Microsoft SDKs\Windows\v7.0A\" "D:\NVIDIA\CUDA\CUDAToolkit\bin\nvcc.exe" --use-local-env --cl-version 2010 -ccbin "D:\Microsoft Visual Studio 10.0\VC\bin\x86_amd64"        --keep-dir x64\Release -maxrregcount=0  --machine 64 --compile -cudart static      -Xcompiler "/EHsc  /nologo  /Zi    " -o x64\Release\%(Filename)%(Extension).obj "%(FullPath)"

/OUT:"E:\learn_cuda_by_example_exercises\Chapter9HistGpuShareMemory\x64\Release\Chapter9HistGpuShareMemory.exe" /NOLOGO 

/LIBPATH:"D:\NVIDIA\CUDA\CUDAToolkit\lib\x64" "glut64.lib" "cudart.lib" "kernel32.lib" "user32.lib" "gdi32.lib" "winspool.lib" "comdlg32.lib" "advapi32.lib" "shell32.lib" "ole32.lib" "oleaut32.lib" "uuid.lib" "odbc32.lib" "odbccp32.lib" /MANIFEST /ManifestFile:"x64\Release\Chapter9HistGpuShareMemory.exe.intermediate.manifest" /ALLOWISOLATION 

/MANIFESTUAC:"level='asInvoker' uiAccess='false'" /DEBUG /PDB:"E:\learn_cuda_by_example_exercises\Chapter9HistGpuShareMemory\x64\Release\Chapter9HistGpuShareMemory.pdb" /OPT:REF /OPT:ICF /PGD:"E:\learn_cuda_by_example_exercises\Chapter9HistGpuShareMemory\x64\Release\Chapter9HistGpuShareMemory.pgd" /LTCG /TLBID:1 

/DYNAMICBASE /NXCOMPAT /MACHINE:X64 /ERRORREPORT:QUEUE

# (Approximate command-line.  Settings inherited from host are not visible below.)
# (Please see the output window after a build for the full command-line)

"D:\NVIDIA\CUDA\CUDAToolkit\bin\nvcc.exe" -dlink -o x64\Release\Chapter9HistGpuShareMemory.device-link.obj -Xcompiler "/EHsc  /nologo  /Zi    "

链接器命令行：

# (Approximate command-line.  Settings inherited from host are not visible below.)
# (Please see the output window after a build for the full command-line)

# Driver API (NVCC Compilation Type is .cubin, .gpu, or .ptx)
set CUDAFE_FLAGS=--sdk_dir "C:\Program Files (x86)\Microsoft SDKs\Windows\v7.0A\"
"D:\NVIDIA\CUDA\CUDAToolkit\bin\nvcc.exe" --use-local-env --cl-version 2010 -ccbin "D:\Microsoft Visual Studio 10.0\VC\bin\x86_amd64"        --keep-dir x64\Release -maxrregcount=0  --machine 64 --compile -cudart static  -o x64\Release\%(Filename)%(Extension).obj "%(FullPath)"

# Runtime API (NVCC Compilation Type is hybrid object or .c file)
set CUDAFE_FLAGS=--sdk_dir "C:\Program Files (x86)\Microsoft SDKs\Windows\v7.0A\" "D:\NVIDIA\CUDA\CUDAToolkit\bin\nvcc.exe" --use-local-env --cl-version 2010 -ccbin "D:\Microsoft Visual Studio 10.0\VC\bin\x86_amd64"        --keep-dir x64\Release -maxrregcount=0  --machine 64 --compile -cudart static      -Xcompiler "/EHsc  /nologo  /Zi    " -o x64\Release\%(Filename)%(Extension).obj "%(FullPath)"

/OUT:"E:\learn_cuda_by_example_exercises\Chapter9HistGpuShareMemory\x64\Release\Chapter9HistGpuShareMemory.exe" /NOLOGO 

/LIBPATH:"D:\NVIDIA\CUDA\CUDAToolkit\lib\x64" "glut64.lib" "cudart.lib" "kernel32.lib" "user32.lib" "gdi32.lib" "winspool.lib" "comdlg32.lib" "advapi32.lib" "shell32.lib" "ole32.lib" "oleaut32.lib" "uuid.lib" "odbc32.lib" "odbccp32.lib" /MANIFEST /ManifestFile:"x64\Release\Chapter9HistGpuShareMemory.exe.intermediate.manifest" /ALLOWISOLATION 

/MANIFESTUAC:"level='asInvoker' uiAccess='false'" /DEBUG /PDB:"E:\learn_cuda_by_example_exercises\Chapter9HistGpuShareMemory\x64\Release\Chapter9HistGpuShareMemory.pdb" /OPT:REF /OPT:ICF /PGD:"E:\learn_cuda_by_example_exercises\Chapter9HistGpuShareMemory\x64\Release\Chapter9HistGpuShareMemory.pgd" /LTCG /TLBID:1 

/DYNAMICBASE /NXCOMPAT /MACHINE:X64 /ERRORREPORT:QUEUE

# (Approximate command-line.  Settings inherited from host are not visible below.)
# (Please see the output window after a build for the full command-line)

"D:\NVIDIA\CUDA\CUDAToolkit\bin\nvcc.exe" -dlink -o x64\Release\Chapter9HistGpuShareMemory.device-link.obj -Xcompiler "/EHsc  /nologo  /Zi    "

CUDA链接器命令行：

# (Approximate command-line.  Settings inherited from host are not visible below.)
# (Please see the output window after a build for the full command-line)

# Driver API (NVCC Compilation Type is .cubin, .gpu, or .ptx)
set CUDAFE_FLAGS=--sdk_dir "C:\Program Files (x86)\Microsoft SDKs\Windows\v7.0A\"
"D:\NVIDIA\CUDA\CUDAToolkit\bin\nvcc.exe" --use-local-env --cl-version 2010 -ccbin "D:\Microsoft Visual Studio 10.0\VC\bin\x86_amd64"        --keep-dir x64\Release -maxrregcount=0  --machine 64 --compile -cudart static  -o x64\Release\%(Filename)%(Extension).obj "%(FullPath)"

# Runtime API (NVCC Compilation Type is hybrid object or .c file)
set CUDAFE_FLAGS=--sdk_dir "C:\Program Files (x86)\Microsoft SDKs\Windows\v7.0A\" "D:\NVIDIA\CUDA\CUDAToolkit\bin\nvcc.exe" --use-local-env --cl-version 2010 -ccbin "D:\Microsoft Visual Studio 10.0\VC\bin\x86_amd64"        --keep-dir x64\Release -maxrregcount=0  --machine 64 --compile -cudart static      -Xcompiler "/EHsc  /nologo  /Zi    " -o x64\Release\%(Filename)%(Extension).obj "%(FullPath)"

/OUT:"E:\learn_cuda_by_example_exercises\Chapter9HistGpuShareMemory\x64\Release\Chapter9HistGpuShareMemory.exe" /NOLOGO 

/LIBPATH:"D:\NVIDIA\CUDA\CUDAToolkit\lib\x64" "glut64.lib" "cudart.lib" "kernel32.lib" "user32.lib" "gdi32.lib" "winspool.lib" "comdlg32.lib" "advapi32.lib" "shell32.lib" "ole32.lib" "oleaut32.lib" "uuid.lib" "odbc32.lib" "odbccp32.lib" /MANIFEST /ManifestFile:"x64\Release\Chapter9HistGpuShareMemory.exe.intermediate.manifest" /ALLOWISOLATION 

/MANIFESTUAC:"level='asInvoker' uiAccess='false'" /DEBUG /PDB:"E:\learn_cuda_by_example_exercises\Chapter9HistGpuShareMemory\x64\Release\Chapter9HistGpuShareMemory.pdb" /OPT:REF /OPT:ICF /PGD:"E:\learn_cuda_by_example_exercises\Chapter9HistGpuShareMemory\x64\Release\Chapter9HistGpuShareMemory.pgd" /LTCG /TLBID:1 

/DYNAMICBASE /NXCOMPAT /MACHINE:X64 /ERRORREPORT:QUEUE

# (Approximate command-line.  Settings inherited from host are not visible below.)
# (Please see the output window after a build for the full command-line)

"D:\NVIDIA\CUDA\CUDAToolkit\bin\nvcc.exe" -dlink -o x64\Release\Chapter9HistGpuShareMemory.device-link.obj -Xcompiler "/EHsc  /nologo  /Zi    "

我认为这里的主要问题是硬件

GeForce 705M只有48个CUDA内核。GT 630有96或384个，具体取决于版本。K2200的这个数字是640。顶级卡则具有3072个CUDA内核

将705M与K2200进行比较，您会发现性能上的巨大差异并不奇怪

看起来你的笔记本电脑里有Optimus，你真的在专用GPU上运行这个程序吗？这是官方的源代码还是你做了什么改变？你到底是怎么编译的？你使用的是哪种编译器版本（例如gcc版本）？我的笔记本电脑上没有安装Optimus，不确定它是否在专用GPU上运行，该如何查明，或者需要做什么配置来确认这一点？在ubuntu上，我的gcc版本是gcc（ubuntu 4.4.3-4ubuntu5.1）4.4.3。你使用的编译器标志是什么？这个解释似乎是合理的。据我所知，Geforce 705M是一款GF117芯片，基本时钟为738 MHz，内存时钟为900 MHz。这是一个费米级部件，在热时钟（=2x基时钟）下运行CUDA内核，因此该部件具有2 x 738e6（1/s）*48（内核）*2（浮点/FMA）=141.7个单精度GFLOPs。内存接口仅为64位宽，因此带宽为900e6（1/s）*8（字节）*2（对于DDR）=14.4 GB/秒。四核CPU使用两个64位通道时，可以提供类似的计算吞吐量和几乎肯定更多的内存带宽（可能>20 GB/秒）。但是GeForce GTX 285卡呢，它似乎是一个非常旧的卡，但根据书中所述，仍然可以产生有希望的结果。书中是否给出了GTX 285的时间？GTX285（GT200芯片）仍然是一个相当强大的GPU，至少与这里讨论的GPU相比。它（相对地）有很多SP，也有很多内存带宽。这是一款高端GPU。GeForce 705M的功能要小得多（尽管它是新一代），而且它当时还是一款低端GPU。这就像问为什么法拉利GT308比雪佛兰Celebrity更快。法拉利比较老，但一开始它就是一辆快车。雪佛兰Celebrity从来都不是一辆快车。是的，的确如此。书中确实说hist_gpu_shmem_atomics.cu程序相比hist_cpu.cu程序，把运行时间从416毫秒缩短到了57毫秒，快了将近7倍。天哪。