Efficiency and performance of CUDA ILP on the NVIDIA Kepler architecture


Quoting NVIDIA's Kepler Tuning Guide:

Also, note that Kepler GPUs can utilize ILP in place of thread/warp-level parallelism (TLP) more easily than Fermi GPUs can.

In my opinion, the following code fragment

a = .....;
a2 = f(a); 
a3 = g(a2);  
can be improved as follows

a = ...;
b = ....;
a2 = f(a);
b2 = f(b);
a3 = g(a2);
b3 = g(b2);
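
For concreteness, here is a minimal sketch (not part of the original question) of what the two variants could look like inside a kernel; the device functions f and g below are hypothetical placeholders for the elided computations:

// Minimal illustration of the pattern above; f and g are hypothetical
// placeholders for the elided computations.
__device__ float f(float x) { return x * 2.0f; }
__device__ float g(float x) { return x + 1.0f; }

// One dependent chain per thread: g(f(a)) must wait for f(a) to complete.
__global__ void oneChain(const float* d_in, float* d_out) {
    int i = threadIdx.x + blockIdx.x * blockDim.x;
    float a  = d_in[i];
    float a2 = f(a);
    float a3 = g(a2);
    d_out[i] = a3;
}

// Two independent chains per thread (assumes the array length is twice the
// total number of launched threads): the a-chain and the b-chain share no
// dependencies, so their instructions can be interleaved in flight (ILP).
__global__ void twoChains(const float* d_in, float* d_out) {
    int i = threadIdx.x + blockIdx.x * blockDim.x;
    int j = i + gridDim.x * blockDim.x;
    float a  = d_in[i];
    float b  = d_in[j];
    float a2 = f(a);
    float b2 = f(b);
    float a3 = g(a2);
    float b3 = g(b2);
    d_out[i] = a3;
    d_out[j] = b3;
}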
Accordingly, in my project I have a piece of code like the following (Example 1)

if(x
which I rewrote as follows (Example 2)

if(x

On the Kepler architecture, will Example 2 be more efficient and deliver better performance than Example 1?

A good explanation of Instruction Level Parallelism (ILP) can be found at:

Robert Crovella and talonmies have already pointed out, and you yourself acknowledge, that your example above does not achieve ILP.

Concerning how to achieve ILP, below I show a classic example, translated from PyCUDA code, which I have tested on both Fermi and Kepler GPUs. Note that for the latter case I did not observe any relevant speedup.

The code:

#include <stdio.h>
#include <stdlib.h>
#include <time.h>

#define BLOCKSIZE 64

/*******************/
/* iDivUp FUNCTION */
/*******************/
int iDivUp(int a, int b){
    return ((a % b) != 0) ? (a / b + 1) : (a / b);
}

/********************/
/* CUDA ERROR CHECK */
/********************/
#define gpuErrchk(ans) { gpuAssert((ans), __FILE__, __LINE__); }
inline void gpuAssert(cudaError_t code, const char *file, int line, bool abort=true)
{
    if (code != cudaSuccess) 
    {
        fprintf(stderr,"GPUassert: %s %s %d\n", cudaGetErrorString(code), file, line);
        if (abort) exit(code);
    }
}

/************************************/
/* NO INSTRUCTION LEVEL PARALLELISM */
/************************************/
__global__ void ILP0(float* d_a, float* d_b, float* d_c) {

    int i = threadIdx.x + blockIdx.x * blockDim.x;

    d_c[i] = d_a[i] + d_b[i];

}

/************************************/
/* INSTRUCTION LEVEL PARALLELISM X2 */
/************************************/
__global__ void ILP2(float* d_a, float* d_b, float* d_c) {

    // --- Loading the data
    int i = threadIdx.x + blockIdx.x * blockDim.x;

    float ai = d_a[i];
    float bi = d_b[i];

    int stride = gridDim.x * blockDim.x;

    int j = i + stride;
    float aj = d_a[j];
    float bj = d_b[j];

    // --- Computing
    float ci = ai + bi;
    float cj = aj + bj;

    // --- Writing the data
    d_c[i] = ci;
    d_c[j] = cj;

}

/************************************/
/* INSTRUCTION LEVEL PARALLELISM X4 */
/************************************/
__global__ void ILP4(float* d_a, float* d_b, float* d_c) {

    // --- Loading the data
    int i = threadIdx.x + blockIdx.x * blockDim.x;

    float ai = d_a[i];
    float bi = d_b[i];

    int stride = gridDim.x * blockDim.x;

    int j = i + stride;
    float aj = d_a[j];
    float bj = d_b[j];

    int k = j + stride;
    float ak = d_a[k];
    float bk = d_b[k];

    int l = k + stride;
    float al = d_a[l];
    float bl = d_b[l];

    // --- Computing
    float ci = ai + bi;
    float cj = aj + bj;
    float ck = ak + bk;
    float cl = al + bl;

    // --- Writing the data
    d_c[i] = ci;
    d_c[j] = cj;
    d_c[k] = ck;
    d_c[l] = cl;

}

/************************************/
/* INSTRUCTION LEVEL PARALLELISM X8 */
/************************************/
__global__ void ILP8(float* d_a, float* d_b, float* d_c) {

    // --- Loading the data
    int i = threadIdx.x + blockIdx.x * blockDim.x;

    float ai = d_a[i];
    float bi = d_b[i];

    int stride = gridDim.x * blockDim.x;

    int j = i + stride;
    float aj = d_a[j];
    float bj = d_b[j];

    int k = j + stride;
    float ak = d_a[k];
    float bk = d_b[k];

    int l = k + stride;
    float al = d_a[l];
    float bl = d_b[l];

    int m = l + stride;
    float am = d_a[m];
    float bm = d_b[m];

    int n = m + stride;
    float an = d_a[n];
    float bn = d_b[n];

    int p = n + stride;
    float ap = d_a[p];
    float bp = d_b[p];

    int q = p + stride;
    float aq = d_a[q];
    float bq = d_b[q];

    // --- Computing
    float ci = ai + bi;
    float cj = aj + bj;
    float ck = ak + bk;
    float cl = al + bl;
    float cm = am + bm;
    float cn = an + bn;
    float cp = ap + bp;
    float cq = aq + bq;

    // --- Writing the data
    d_c[i] = ci;
    d_c[j] = cj;
    d_c[k] = ck;
    d_c[l] = cl;
    d_c[m] = cm;
    d_c[n] = cn;
    d_c[p] = cp;
    d_c[q] = cq;

}

/********/
/* MAIN */
/********/
int main() {

    float timing;
    cudaEvent_t start, stop;

    const int N = 65536*4; // --- ASSUMPTION: N is divisible by 8 * BLOCKSIZE

    float* a = (float*)malloc(N*sizeof(float));
    float* b = (float*)malloc(N*sizeof(float));
    float* c = (float*)malloc(N*sizeof(float));
    float* c_ref = (float*)malloc(N*sizeof(float));

    srand(time(NULL));
    for (int i=0; i<N; i++) {

        a[i] = rand() / (float)RAND_MAX;
        b[i] = rand() / (float)RAND_MAX;
        c_ref[i] = a[i] + b[i];

    }

    float* d_a; gpuErrchk(cudaMalloc((void**)&d_a,N*sizeof(float)));
    float* d_b; gpuErrchk(cudaMalloc((void**)&d_b,N*sizeof(float)));
    float* d_c0; gpuErrchk(cudaMalloc((void**)&d_c0,N*sizeof(float)));
    float* d_c2; gpuErrchk(cudaMalloc((void**)&d_c2,N*sizeof(float)));
    float* d_c4; gpuErrchk(cudaMalloc((void**)&d_c4,N*sizeof(float)));
    float* d_c8; gpuErrchk(cudaMalloc((void**)&d_c8,N*sizeof(float)));

    gpuErrchk(cudaMemcpy(d_a, a, N*sizeof(float), cudaMemcpyHostToDevice));
    gpuErrchk(cudaMemcpy(d_b, b, N*sizeof(float), cudaMemcpyHostToDevice));

    /******************/
    /* ILP0 TEST CASE */
    /******************/
    cudaEventCreate(&start);
    cudaEventCreate(&stop);
    cudaEventRecord(start, 0);
    ILP0<<<iDivUp(N,BLOCKSIZE),BLOCKSIZE>>>(d_a, d_b, d_c0);
    gpuErrchk(cudaPeekAtLastError());
    gpuErrchk(cudaDeviceSynchronize());
    cudaEventRecord(stop, 0);
    cudaEventSynchronize(stop);
    cudaEventElapsedTime(&timing, start, stop);
    printf("Elapsed time - ILP0:  %3.3f ms \n", timing);

    gpuErrchk(cudaMemcpy(c, d_c0, N*sizeof(float), cudaMemcpyDeviceToHost));

    // --- Checking the results
    for (int i=0; i<N; i++)
        if (c[i] != c_ref[i]) {

            printf("Error!\n");
            return 1;

        }

    printf("Test passed!\n");

    /******************/
    /* ILP2 TEST CASE */
    /******************/
    cudaEventRecord(start, 0);
    ILP2<<<(N/2)/BLOCKSIZE,BLOCKSIZE>>>(d_a, d_b, d_c2);
    gpuErrchk(cudaPeekAtLastError());
    gpuErrchk(cudaDeviceSynchronize());
    cudaEventRecord(stop, 0);
    cudaEventSynchronize(stop);
    cudaEventElapsedTime(&timing, start, stop);
    printf("Elapsed time - ILP2:  %3.3f ms \n", timing);

    gpuErrchk(cudaMemcpy(c, d_c2, N*sizeof(float), cudaMemcpyDeviceToHost));

    // --- Checking the results
    for (int i=0; i<N; i++)
        if (c[i] != c_ref[i]) {

            printf("Error!\n");
            return 1;

        }

    printf("Test passed!\n");

    /******************/
    /* ILP4 TEST CASE */
    /******************/
    cudaEventRecord(start, 0);
    ILP4<<<(N/4)/BLOCKSIZE,BLOCKSIZE>>>(d_a, d_b, d_c4);
    gpuErrchk(cudaPeekAtLastError());
    gpuErrchk(cudaDeviceSynchronize());
    cudaEventRecord(stop, 0);
    cudaEventSynchronize(stop);
    cudaEventElapsedTime(&timing, start, stop);
    printf("Elapsed time - ILP4:  %3.3f ms \n", timing);

    gpuErrchk(cudaMemcpy(c, d_c4, N*sizeof(float), cudaMemcpyDeviceToHost));

    // --- Checking the results
    for (int i=0; i<N; i++)
        if (c[i] != c_ref[i]) {

            printf("Error!\n");
            return 1;

        }

    printf("Test passed!\n");

    /******************/
    /* ILP8 TEST CASE */
    /******************/
    cudaEventRecord(start, 0);
    ILP8<<<(N/8)/BLOCKSIZE,BLOCKSIZE>>>(d_a, d_b, d_c8);
    gpuErrchk(cudaPeekAtLastError());
    gpuErrchk(cudaDeviceSynchronize());
    cudaEventRecord(stop, 0);
    cudaEventSynchronize(stop);
    cudaEventElapsedTime(&timing, start, stop);
    printf("Elapsed time - ILP8:  %3.3f ms \n", timing);

    gpuErrchk(cudaMemcpy(c, d_c8, N*sizeof(float), cudaMemcpyDeviceToHost));

    // --- Checking the results
    for (int i=0; i<N; i++)
        if (c[i] != c_ref[i]) {

            printf("%f %f\n",c[i],c_ref[i]);
            printf("Error!\n");
            return 1;

        }

    printf("Test passed!\n");

}
Card                    Kernel          Time [ms]            Speedup
GeForce GT540M          ILP0            4.609                1
      "                 ILP2            2.666                1.72
      "                 ILP4            1.675                2.76
      "                 ILP8            1.477                3.12

Kepler K20c             ILP0            0.045                
      "                 ILP2            0.043                
      "                 ILP4            0.043                
      "                 ILP8            0.042                


Comments:

Why not test it and find out? It shouldn't be hard to compare the two cases. The reason I say this is that your thesis seems to assume the compiler is not doing anything "smart". But the compiler is in fact aware of the need for ILP and will aggressively try to reorganize code to enable it. This is especially true on cc 3.0 and newer architectures, where the compiler plays a significant role in low-level instruction ordering. Certainly the compiler can easily discover the independence between a and b that you imply in your first comparison, and reorder accordingly.

Thanks for your reply. I have tested it on my GTX 780 and there is no noticeable difference between the two examples. In fact, I was wondering how to achieve ILP in order to get better performance on a Kepler GPU, which is why I modified my kernel code.

The whole premise of this question is flawed. You seem to believe that make_short4 is an instruction. It is not. In reality, all you are doing is inlining a function, which the compiler would do anyway. Where does the ILP come from?
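
For reference, make_short4 is an ordinary inline helper declared in the CUDA toolkit headers (vector_functions.h), not a hardware instruction; roughly (paraphrased, not the exact header code) it does nothing more than the following, which the compiler inlines away entirely:

// Rough paraphrase of what make_short4 does (see vector_functions.h in the
// CUDA toolkit for the actual definition): it simply fills a short4 struct.
__host__ __device__ static inline short4 my_make_short4(short x, short y, short z, short w) {
    short4 t;
    t.x = x; t.y = y; t.z = z; t.w = w;
    return t;
}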