CUDA device memory copy: cudaMemcpyDeviceToDevice vs. a copy kernel


I am writing a CUDA kernel to copy one array to another; both of them are in GPU memory. I don't want to use cudaMemcpyDeviceToDevice because of its poor performance.

The naive kernel:

__global__ void GpuCopy( float* des , float* __restrict__ sour ,const int M , const int N )
{
    int tx=blockIdx.x*blockDim.x+threadIdx.x;
    if(tx<N*M)
        des[tx]=sour[tx];
}   
The code above copies global memory straight into des[]. What I have in mind instead is staging the copy through __shared__ memory, i.e. reading global memory into __shared__ and then writing from __shared__ into des[]; I would expect that staged version to be faster than the naive one.

So, how should a __shared__ version of the copy be written? A second question: if I want to use __constant__ memory and the array (which is already on the GPU) is larger than constant memory, how can I copy it to another piece of GPU memory using __constant__?
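For illustration, here is a minimal sketch of the kind of __shared__-staged copy the question is asking about (the kernel name GpuCopyShared and the assumption of a 256-thread block are mine, not from the question); the answers below explain why this staging is not expected to help:

#define TILE 256   // assumed launch block size for this sketch

// Hypothetical __shared__-staged copy: each thread stages its own element
// through shared memory. Because every thread reads back only the slot it
// wrote, no __syncthreads() is required. The extra hop adds work without
// improving coalescing, so it is not expected to beat the naive kernel.
__global__ void GpuCopyShared(float* des, const float* __restrict__ sour,
                              const int M, const int N)
{
    __shared__ float tile[TILE];

    int tx = blockIdx.x * blockDim.x + threadIdx.x;
    if (tx < N * M) {
        tile[threadIdx.x] = sour[tx];   // global -> shared
        des[tx] = tile[threadIdx.x];    // shared -> global
    }
}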

For an ordinary linear-to-linear memory copy, shared memory won't buy you anything; your naive kernel should be fine. There may be some small optimizations available, such as running with a reduced number of thread blocks, but tuning that depends to some degree on the specific GPU.

Shared memory can be put to good use in kernels that perform some kind of modified copy, such as a transpose. In those cases the cost of the trip through shared memory is offset by improved coalescing. But in your naive kernel the reads and writes should already be coalesced.
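As an illustration of the kind of modified copy where shared memory does pay off, a tiled transpose might look roughly like the sketch below (the kernel name, the 32x32 tile, and the launch assumption of a 32x32 thread block per tile are illustrative, not code from the answer):

#define TILE_DIM 32   // assumes a 32x32 thread block per tile

// Sketch of a tiled transpose: a 32x32 tile is read from global memory with
// coalesced accesses, held in shared memory, and written back transposed,
// again with coalesced accesses. The +1 padding avoids shared-memory bank
// conflicts when the tile is read column-wise.
__global__ void transposeTiled(float* __restrict__ out,
                               const float* __restrict__ in,
                               const int width, const int height)
{
    __shared__ float tile[TILE_DIM][TILE_DIM + 1];

    int x = blockIdx.x * TILE_DIM + threadIdx.x;   // column in the input
    int y = blockIdx.y * TILE_DIM + threadIdx.y;   // row in the input
    if (x < width && y < height)
        tile[threadIdx.y][threadIdx.x] = in[y * width + x];

    __syncthreads();

    // Swap the block's x/y roles so the output write is coalesced as well.
    x = blockIdx.y * TILE_DIM + threadIdx.x;       // column in the output (output width = input height)
    y = blockIdx.x * TILE_DIM + threadIdx.y;       // row in the output
    if (x < height && y < width)
        out[y * height + x] = tile[threadIdx.x][threadIdx.y];
}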

For a single large copy operation, cudaMemcpyDeviceToDevice should give very good performance, because the overhead of the single call is amortized over the whole data movement. You should probably time the two approaches; it is easy to do with nvprof. The discussion referenced in the comments concerns a specific use case in which matrix quadrants are swapped. In that case an NxN matrix needs roughly 1.5N cudaMemcpy operations, as compared to a single kernel call, so the setup overhead of the API calls starts to become a significant factor. But when a single cudaMemcpy operation is compared with a single equivalent kernel call, the cudaMemcpy operation should be fast.


__constant__ memory cannot be modified by device code, so you will have to use host code based on cudaMemcpyToSymbol / cudaMemcpyFromSymbol.
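A minimal sketch of that host-side path is given below, assuming a small illustrative array; the symbol name c_data, its size, and the readConst kernel are invented for the example. Note that __constant__ memory totals only 64 KB, which is precisely why an array larger than that cannot simply be placed in it:

#include <cstdio>
#include <cuda_runtime.h>

#define CONST_ELEMS 1024   // illustrative size; __constant__ memory totals only 64 KB

__constant__ float c_data[CONST_ELEMS];

// Device code may read c_data but can never write it.
__global__ void readConst(float* out, const int n)
{
    int tid = blockIdx.x * blockDim.x + threadIdx.x;
    if (tid < n) out[tid] = c_data[tid];
}

int main()
{
    float h_src[CONST_ELEMS];
    for (int i = 0; i < CONST_ELEMS; ++i) h_src[i] = (float)i;

    // Filling constant memory is a host-side operation. The source may also be
    // a device pointer if cudaMemcpyDeviceToDevice is passed as the copy kind.
    cudaMemcpyToSymbol(c_data, h_src, CONST_ELEMS * sizeof(float));

    float* d_out = nullptr;
    cudaMalloc(&d_out, CONST_ELEMS * sizeof(float));
    readConst<<<(CONST_ELEMS + 255) / 256, 256>>>(d_out, CONST_ELEMS);
    cudaDeviceSynchronize();

    cudaFree(d_out);
    return 0;
}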
Robert Crovella has already answered the question above. Here I am only providing sample code to compare the two approaches to device-to-device memory copies in CUDA:

  • using cudaMemcpyDeviceToDevice
  • using a copy kernel

    The test code is shown below:

    #include <stdio.h>
    
    #include "Utilities.cuh"
    #include "TimingGPU.cuh"
    
    #define BLOCKSIZE   512
    
    /***************/
    /* COPY KERNEL */
    /***************/
    __global__ void copyKernel(const double * __restrict__ d_in, double * __restrict__ d_out, const int N) {
    
        const int tid = threadIdx.x + blockIdx.x * blockDim.x;
    
        if (tid >= N) return;
    
        d_out[tid] = d_in[tid];
    
    }
    
    /********/
    /* MAIN */
    /********/
    int main() {
    
        const int N = 1000000;
    
        TimingGPU timerGPU;
    
        double *h_test = (double *)malloc(N * sizeof(double));
    
        for (int k = 0; k < N; k++) h_test[k] = 1.;
    
        double *d_in;   gpuErrchk(cudaMalloc(&d_in, N * sizeof(double)));
        gpuErrchk(cudaMemcpy(d_in, h_test, N * sizeof(double), cudaMemcpyHostToDevice));
    
        double *d_out; gpuErrchk(cudaMalloc(&d_out, N * sizeof(double)));
    
        timerGPU.StartCounter();
        gpuErrchk(cudaMemcpy(d_out, d_in, N * sizeof(double), cudaMemcpyDeviceToDevice));
        printf("cudaMemcpy timing = %f [ms]\n", timerGPU.GetCounter());
    
        timerGPU.StartCounter();
        copyKernel<<<iDivUp(N, BLOCKSIZE), BLOCKSIZE>>>(d_in, d_out, N);
        gpuErrchk(cudaPeekAtLastError());
        gpuErrchk(cudaDeviceSynchronize());
        printf("Copy kernel timing = %f [ms]\n", timerGPU.GetCounter());
    
        return 0;
    }
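    The snippet above relies on the author's Utilities.cuh and TimingGPU.cuh, which are not reproduced here. A rough stand-in for the helpers it uses (gpuErrchk, iDivUp, and a cudaEvent-based TimingGPU) might look like the following sketch; these definitions are assumptions, not the original headers:

    #include <cstdio>
    #include <cstdlib>
    #include <cuda_runtime.h>

    // Error-checking macro in the spirit of gpuErrchk.
    #define gpuErrchk(ans) { gpuAssert((ans), __FILE__, __LINE__); }
    inline void gpuAssert(cudaError_t code, const char* file, int line)
    {
        if (code != cudaSuccess) {
            fprintf(stderr, "GPUassert: %s %s %d\n", cudaGetErrorString(code), file, line);
            exit(code);
        }
    }

    // Ceiling division used to size the launch grid.
    inline int iDivUp(int a, int b) { return (a + b - 1) / b; }

    // Minimal cudaEvent-based timer with the interface used above.
    struct TimingGPU {
        cudaEvent_t start, stop;
        TimingGPU()  { cudaEventCreate(&start); cudaEventCreate(&stop); }
        ~TimingGPU() { cudaEventDestroy(start); cudaEventDestroy(stop); }
        void StartCounter() { cudaEventRecord(start, 0); }
        float GetCounter() {
            float ms = 0.0f;
            cudaEventRecord(stop, 0);
            cudaEventSynchronize(stop);
            cudaEventElapsedTime(&ms, start, stop);
            return ms;
        }
    };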
    
    The results confirm Robert Crovella's prediction: for a single large transfer, cudaMemcpyDeviceToDevice is generally preferable to a copy kernel.

    The following, independent benchmark times both approaches with CUDA events, using a grid-stride copy kernel whose launch configuration is chosen by cudaOccupancyMaxPotentialBlockSize:

    #include <iostream>
    #include <vector>
    #include <iomanip>
    #include <cuda_runtime.h>
    
    #define CHECK_CUDA(cond) check_cuda(cond, __LINE__)
    
    void check_cuda(cudaError_t status, std::size_t line)
    {
        if(status != cudaSuccess)
        {
            std::cout << cudaGetErrorString(status) << '\n';
            std::cout << "Line: " << line << '\n';
            throw 0;
        }
    }
    
    __global__ void copy_kernel(float* __restrict__ output, const float* __restrict__ input, int N)
    {
        for (int i = blockIdx.x * blockDim.x + threadIdx.x;  i < N; i += blockDim.x * gridDim.x) 
            output[i] = input[i];
    }
    
    int main()
    {
        constexpr int num_trials = 100;
        std::vector<int> test_sizes = { 100'000, 1'000'000, 10'000'000, 100'000'000, 250'000'000, 1'000'000'000 };  // the largest size needs ~8 GB of device memory
    
        int grid_size = 0, block_size = 0;
        CHECK_CUDA(cudaOccupancyMaxPotentialBlockSize(&grid_size, &block_size, copy_kernel, 0));
    
        std::cout << std::fixed << std::setprecision(4) << std::endl;
    
        for (auto sz : test_sizes)
        {
            std::cout << "Test Size: " << sz << '\n';
    
            float *d_vector_src = nullptr, *d_vector_dest = nullptr;
            CHECK_CUDA(cudaMalloc(&d_vector_src, sz * sizeof(float)));
            CHECK_CUDA(cudaMalloc(&d_vector_dest, sz * sizeof(float)));
    
            cudaEvent_t start, stop;
            CHECK_CUDA(cudaEventCreate(&start));
            CHECK_CUDA(cudaEventCreate(&stop));
    
            float accumulate = 0.0;
            for (int i = 0; i < num_trials; i++)
            {
                CHECK_CUDA(cudaEventRecord(start));
                copy_kernel<<<grid_size, block_size>>>(d_vector_dest, d_vector_src, sz);
                CHECK_CUDA(cudaEventRecord(stop));
                CHECK_CUDA(cudaEventSynchronize(stop));
    
                float current_time = 0;
                CHECK_CUDA(cudaEventElapsedTime(&current_time, start, stop));
                accumulate += current_time;
            }
            std::cout << "\tKernel Copy Time: " << accumulate / num_trials << "ms\n";
    
            accumulate = 0.0;
            for (int i = 0; i < num_trials; i++)
            {
                CHECK_CUDA(cudaEventRecord(start));
                CHECK_CUDA(cudaMemcpy(d_vector_dest, d_vector_src, sz * sizeof(float), cudaMemcpyDeviceToDevice));
                CHECK_CUDA(cudaEventRecord(stop));
                CHECK_CUDA(cudaEventSynchronize(stop));
    
                float current_time = 0;
                CHECK_CUDA(cudaEventElapsedTime(&current_time, start, stop));
                accumulate += current_time;
            }
            std::cout << "\tMemcpy Time: " << accumulate / num_trials << "ms\n";
    
            CHECK_CUDA(cudaFree(d_vector_src));
            CHECK_CUDA(cudaFree(d_vector_dest));
        }
    
        return 0;
    }
    
    GTX 1080 Ti

    Test Size: 100000
        Kernel Copy Time: 0.0166ms
        Memcpy Time: 0.0188ms
    Test Size: 1000000
        Kernel Copy Time: 0.0580ms
        Memcpy Time: 0.0727ms
    Test Size: 10000000
        Kernel Copy Time: 0.4674ms
        Memcpy Time: 0.5047ms
    Test Size: 100000000
        Kernel Copy Time: 4.7992ms
        Memcpy Time: 3.7722ms
    Test Size: 250000000
        Kernel Copy Time: 7.2485ms
        Memcpy Time: 5.5863ms
    Test Size: 1000000000
        Kernel Copy Time: 31.5570ms
        Memcpy Time: 22.3184ms
    
    RTX 2080 Ti

    Test Size: 100000
        Kernel Copy Time: 0.0048ms
        Memcpy Time: 0.0054ms
    Test Size: 1000000
        Kernel Copy Time: 0.0193ms
        Memcpy Time: 0.0220ms
    Test Size: 10000000
        Kernel Copy Time: 0.1578ms
        Memcpy Time: 0.1537ms
    Test Size: 100000000
        Kernel Copy Time: 2.1156ms
        Memcpy Time: 1.5006ms
    Test Size: 250000000
        Kernel Copy Time: 5.5195ms
        Memcpy Time: 3.7424ms
    Test Size: 1000000000
        Kernel Copy Time: 23.2106ms
        Memcpy Time: 14.9483ms
    

    Comments:

    • Why do you think cudaMemcpyDeviceToDevice performs poorly?
    • I don't think you need shared memory here. The arrays are copied element by element and there is no cooperation between threads in the first kernel, so the first kernel should be sufficient.
    • This is a surprising answer; I upvoted it. I saw from your network profile that you have a lot of reputation in physics and chemistry, and I wonder whether you would support a proposal I have been putting a lot of work into; we still need more committed contributors.