Cuda NVIDIA GPU上的指令级并行(ILP)和无序执行

Cuda NVIDIA GPU上的指令级并行(ILP)和无序执行,cuda,nvidia,Cuda,Nvidia,NVIDIA GPU是否支持无序执行 我的第一个猜测是,它们不包含如此昂贵的硬件。但是,在阅读《CUDA编程指南》时,该指南建议使用指令级并行(ILP)来提高性能 ILP不是支持无序执行的硬件可以利用的功能吗?或者NVIDIA的ILP仅仅意味着编译器级指令的重新排序,因此其顺序在运行时仍然是固定的。换句话说,只是编译器和/或程序员必须安排指令的顺序,以便在运行时通过顺序执行实现ILP?流水线是一种常见的ILP技术,并且肯定是在NVidia的GPU上实现的。我想你也同意流水线不依赖于无序执行。 此

NVIDIA GPU是否支持无序执行

我的第一个猜测是,它们不包含如此昂贵的硬件。但是,在阅读《CUDA编程指南》时,该指南建议使用指令级并行(ILP)来提高性能


ILP不是支持无序执行的硬件可以利用的功能吗?或者NVIDIA的ILP仅仅意味着编译器级指令的重新排序,因此其顺序在运行时仍然是固定的。换句话说,只是编译器和/或程序员必须安排指令的顺序,以便在运行时通过顺序执行实现ILP?

流水线是一种常见的ILP技术,并且肯定是在NVidia的GPU上实现的。我想你也同意流水线不依赖于无序执行。 此外,NVidia GPU有多个compute capability 2.0及更高版本(2或4)的warp调度程序。如果您的代码在线程中有2条(或更多)连续且独立的指令(或编译器以某种方式对其重新排序),那么您也可以利用调度程序中的ILP

下面这个问题详细解释了 2-wide warp scheduler 与流水线是如何协同工作的。

还可以查看Vasily Volkov在GTC 2010上的演示。他通过实验发现ILP将如何提高CUDA代码的性能


就GPU上的无序执行而言,我不这么认为。正如您所知,硬件指令重新排序、推测性执行所有这些东西对于每个SM来说都太昂贵了。线程级并行可以弥补缺少无序执行的缺陷。当遇到真正的依赖关系时,其他一些扭曲可能会进入并填满管道。

下面的代码报告了指令级并行(ILP)的示例

示例中的
ILPKernel
函数只是在两个数组之间执行赋值。对于
ILP=1
的情况,我们的线程数量与数组元素
N
的数量相同,因此每个线程执行一个赋值。与此相反,对于
ILP=2
的情况,我们有许多
N/2
线程,每个线程处理
2
元素。通常,对于
ILP=k
的情况,我们有许多
N/k
线程,每个线程处理
k
元素

除了代码,下面我还报告了在
NVIDIA GT920M
(开普勒体系结构)上针对
N
ILP
的不同值执行的计时。可以看出:

  • 对于较大的
    N
    ,达到接近
    GT920M
    卡的最大存储带宽,即
    14.4GB/s
  • 对于任何固定的
    N
    ,更改
    ILP
    的值不会改变性能
  • 关于第2点,我也在Maxwell上测试了相同的代码,并观察到了相同的行为(与
    ILP
    相比,性能没有变化)。有关针对
    ILP
    的性能变化,请参见报告费米体系结构测试的答案

    存储器速度通过以下公式计算:

    (2.f * 4.f * N * numITER) / (1e9 * timeTotal * 1e-3)
    
    其中

    N * numITER
    
    是读取或写入的次数

    2.f * 4.f * N * numITER
    
    是读取和写入的总字节数（每个 int 为 4 字节，每个元素读一次、写一次）

    timeTotal * 1e-3
    
    时间单位为秒(
    timeTotal
    单位为毫秒)

    代码

    // --- GT920m - 14.4 GB/s
    //     http://gpuboss.com/gpus/GeForce-GTX-280M-vs-GeForce-920M
    
    #include<stdio.h>
    #include<iostream>
    
    #include "Utilities.cuh"
    #include "TimingGPU.cuh"
    
    #define BLOCKSIZE    32
    
    #define DEBUG
    
    /****************************************/
    /* INSTRUCTION LEVEL PARALLELISM KERNEL */
    /****************************************/
    // Copies d_a into d_b, each thread handling ILP elements.
    //
    // Layout: each block covers blockDim.x * ILP consecutive elements; a thread's
    // j-th element sits at base + j * blockDim.x, so every loop iteration issues a
    // fully coalesced warp access while exposing ILP independent load/store pairs.
    //
    // d_a / d_b: device pointers to at least N ints.
    // ILP:       number of elements per thread (grid must be sized as N / ILP threads).
    // N:         total element count.
    __global__ void ILPKernel(const int * __restrict__ d_a, int * __restrict__ d_b, const int ILP, const int N) {
    
        const int tid = threadIdx.x + blockIdx.x * blockDim.x * ILP;
    
        if (tid >= N) return;
    
        // Guard every access, not just the base index: when N is not a multiple of
        // ILP * blockDim.x, tid + j * blockDim.x can exceed N even though tid < N.
        for (int j = 0; j < ILP; j++) {
            const int idx = tid + j * blockDim.x;
            if (idx < N) d_b[idx] = d_a[idx];
        }
    
    }
    
    /********/
    /* MAIN */
    /********/
    // Benchmark driver: times ILPKernel over numITER launches and reports the
    // effective memory bandwidth, then verifies d_b == d_a on the host.
    int main() {
    
        //const int N = 8192;
        const int N = 524288 * 32;      // total elements (kept a multiple of ILP * BLOCKSIZE)
        //const int N = 1048576;
        //const int N = 262144;
        //const int N = 2048;
    
        const int numITER = 100;        // timing iterations
    
        const int ILP = 16;             // elements processed per thread
    
        TimingGPU timerGPU;
    
        int *h_a = (int *)malloc(N * sizeof(int));
        int *h_b = (int *)malloc(N * sizeof(int));
        if (h_a == NULL || h_b == NULL) { fprintf(stderr, "Host allocation failed\n"); return 1; }
    
        for (int i = 0; i<N; i++) {
            h_a[i] = 2;
            h_b[i] = 1;
        }
    
        int *d_a; gpuErrchk(cudaMalloc(&d_a, N * sizeof(int)));
        int *d_b; gpuErrchk(cudaMalloc(&d_b, N * sizeof(int)));
    
        gpuErrchk(cudaMemcpy(d_a, h_a, N * sizeof(int), cudaMemcpyHostToDevice));
        gpuErrchk(cudaMemcpy(d_b, h_b, N * sizeof(int), cudaMemcpyHostToDevice));
    
        /**************/
        /* ILP KERNEL */
        /**************/
        float timeTotal = 0.f;
        for (int k = 0; k < numITER; k++) {
            timerGPU.StartCounter();
            ILPKernel << <iDivUp(N / ILP, BLOCKSIZE), BLOCKSIZE >> >(d_a, d_b, ILP, N);
    #ifdef DEBUG
            gpuErrchk(cudaPeekAtLastError());
            gpuErrchk(cudaDeviceSynchronize());
    #endif
            // NOTE(review): without DEBUG there is no sync before reading the timer;
            // this assumes TimingGPU::GetCounter synchronizes internally — confirm.
            timeTotal = timeTotal + timerGPU.GetCounter();
        }
    
        // Bandwidth in GB/s: 2 accesses (read + write) * 4 bytes per int * N * numITER,
        // divided by timeTotal in ms (1e6 combines the 1e9 GB scale and the 1e-3 ms->s).
        printf("Bandwidth = %f GB / s; Num blocks = %d\n", (2.f * 4.f * N * numITER) / (1e6 * timeTotal), iDivUp(N / ILP, BLOCKSIZE));
        gpuErrchk(cudaMemcpy(h_b, d_b, N * sizeof(int), cudaMemcpyDeviceToHost));
        for (int i = 0; i < N; i++) if (h_a[i] != h_b[i]) {
            printf("Error at i = %i for kernel0! Host = %i; Device = %i\n", i, h_a[i], h_b[i]);
            gpuErrchk(cudaFree(d_a)); gpuErrchk(cudaFree(d_b));
            free(h_a); free(h_b);
            return 1;
        }
    
        // Release device and host allocations (the original leaked all four).
        gpuErrchk(cudaFree(d_a));
        gpuErrchk(cudaFree(d_b));
        free(h_a);
        free(h_b);
    
        return 0;
    
    }
    
    GT 920M
    N = 512  - ILP = 1  - BLOCKSIZE = 512 (1 block  - each block processes 512 elements)  - Bandwidth = 0.092 GB / s
    
    N = 1024 - ILP = 1  - BLOCKSIZE = 512 (2 blocks - each block processes 512 elements)  - Bandwidth = 0.15  GB / s
    
    N = 2048 - ILP = 1  - BLOCKSIZE = 512 (4 blocks - each block processes 512 elements)  - Bandwidth = 0.37  GB / s
    N = 2048 - ILP = 2  - BLOCKSIZE = 256 (4 blocks - each block processes 512 elements)  - Bandwidth = 0.36  GB / s
    N = 2048 - ILP = 4  - BLOCKSIZE = 128 (4 blocks - each block processes 512 elements)  - Bandwidth = 0.35  GB / s
    N = 2048 - ILP = 8  - BLOCKSIZE =  64 (4 blocks - each block processes 512 elements)  - Bandwidth = 0.26  GB / s
    N = 2048 - ILP = 16 - BLOCKSIZE =  32 (4 blocks - each block processes 512 elements)  - Bandwidth = 0.31  GB / s
    
    N = 4096 - ILP = 1  - BLOCKSIZE = 512 (8 blocks - each block processes 512 elements)  - Bandwidth = 0.53  GB / s
    N = 4096 - ILP = 2  - BLOCKSIZE = 256 (8 blocks - each block processes 512 elements)  - Bandwidth = 0.61  GB / s
    N = 4096 - ILP = 4  - BLOCKSIZE = 128 (8 blocks - each block processes 512 elements)  - Bandwidth = 0.74  GB / s
    N = 4096 - ILP = 8  - BLOCKSIZE =  64 (8 blocks - each block processes 512 elements)  - Bandwidth = 0.74  GB / s
    N = 4096 - ILP = 16 - BLOCKSIZE =  32 (8 blocks - each block processes 512 elements)  - Bandwidth = 0.56  GB / s
    
    N = 8192 - ILP = 1  - BLOCKSIZE = 512 (16 blocks - each block processes 512 elements) - Bandwidth = 1.4  GB / s
    N = 8192 - ILP = 2  - BLOCKSIZE = 256 (16 blocks - each block processes 512 elements) - Bandwidth = 1.1  GB / s
    N = 8192 - ILP = 4  - BLOCKSIZE = 128 (16 blocks - each block processes 512 elements) - Bandwidth = 1.5  GB / s
    N = 8192 - ILP = 8  - BLOCKSIZE =  64 (16 blocks - each block processes 512 elements) - Bandwidth = 1.4  GB / s
    N = 8192 - ILP = 16 - BLOCKSIZE =  32 (16 blocks - each block processes 512 elements) - Bandwidth = 1.3  GB / s
    
    ...
    
    N = 16777216 - ILP = 1  - BLOCKSIZE = 512 (32768 blocks - each block processes 512 elements) - Bandwidth = 12.9  GB / s
    N = 16777216 - ILP = 2  - BLOCKSIZE = 256 (32768 blocks - each block processes 512 elements) - Bandwidth = 12.8  GB / s
    N = 16777216 - ILP = 4  - BLOCKSIZE = 128 (32768 blocks - each block processes 512 elements) - Bandwidth = 12.8  GB / s
    N = 16777216 - ILP = 8  - BLOCKSIZE =  64 (32768 blocks - each block processes 512 elements) - Bandwidth = 12.7  GB / s
    N = 16777216 - ILP = 16 - BLOCKSIZE =  32 (32768 blocks - each block processes 512 elements) - Bandwidth = 12.6  GB / s
    

    利用指令级并行性不需要无序处理器。具有超标量执行的有序处理器也可以从中受益。
    GT 920M
    N = 512  - ILP = 1  - BLOCKSIZE = 512 (1 block  - each block processes 512 elements)  - Bandwidth = 0.092 GB / s
    
    N = 1024 - ILP = 1  - BLOCKSIZE = 512 (2 blocks - each block processes 512 elements)  - Bandwidth = 0.15  GB / s
    
    N = 2048 - ILP = 1  - BLOCKSIZE = 512 (4 blocks - each block processes 512 elements)  - Bandwidth = 0.37  GB / s
    N = 2048 - ILP = 2  - BLOCKSIZE = 256 (4 blocks - each block processes 512 elements)  - Bandwidth = 0.36  GB / s
    N = 2048 - ILP = 4  - BLOCKSIZE = 128 (4 blocks - each block processes 512 elements)  - Bandwidth = 0.35  GB / s
    N = 2048 - ILP = 8  - BLOCKSIZE =  64 (4 blocks - each block processes 512 elements)  - Bandwidth = 0.26  GB / s
    N = 2048 - ILP = 16 - BLOCKSIZE =  32 (4 blocks - each block processes 512 elements)  - Bandwidth = 0.31  GB / s
    
    N = 4096 - ILP = 1  - BLOCKSIZE = 512 (8 blocks - each block processes 512 elements)  - Bandwidth = 0.53  GB / s
    N = 4096 - ILP = 2  - BLOCKSIZE = 256 (8 blocks - each block processes 512 elements)  - Bandwidth = 0.61  GB / s
    N = 4096 - ILP = 4  - BLOCKSIZE = 128 (8 blocks - each block processes 512 elements)  - Bandwidth = 0.74  GB / s
    N = 4096 - ILP = 8  - BLOCKSIZE =  64 (8 blocks - each block processes 512 elements)  - Bandwidth = 0.74  GB / s
    N = 4096 - ILP = 16 - BLOCKSIZE =  32 (8 blocks - each block processes 512 elements)  - Bandwidth = 0.56  GB / s
    
    N = 8192 - ILP = 1  - BLOCKSIZE = 512 (16 blocks - each block processes 512 elements) - Bandwidth = 1.4  GB / s
    N = 8192 - ILP = 2  - BLOCKSIZE = 256 (16 blocks - each block processes 512 elements) - Bandwidth = 1.1  GB / s
    N = 8192 - ILP = 4  - BLOCKSIZE = 128 (16 blocks - each block processes 512 elements) - Bandwidth = 1.5  GB / s
    N = 8192 - ILP = 8  - BLOCKSIZE =  64 (16 blocks - each block processes 512 elements) - Bandwidth = 1.4  GB / s
    N = 8192 - ILP = 16 - BLOCKSIZE =  32 (16 blocks - each block processes 512 elements) - Bandwidth = 1.3  GB / s
    
    ...
    
    N = 16777216 - ILP = 1  - BLOCKSIZE = 512 (32768 blocks - each block processes 512 elements) - Bandwidth = 12.9  GB / s
    N = 16777216 - ILP = 2  - BLOCKSIZE = 256 (32768 blocks - each block processes 512 elements) - Bandwidth = 12.8  GB / s
    N = 16777216 - ILP = 4  - BLOCKSIZE = 128 (32768 blocks - each block processes 512 elements) - Bandwidth = 12.8  GB / s
    N = 16777216 - ILP = 8  - BLOCKSIZE =  64 (32768 blocks - each block processes 512 elements) - Bandwidth = 12.7  GB / s
    N = 16777216 - ILP = 16 - BLOCKSIZE =  32 (32768 blocks - each block processes 512 elements) - Bandwidth = 12.6  GB / s