Cuda IVP对英伟达开普勒建筑的效率和性能
引用NVIDIA提供的“开普勒调谐指南”: 还请注意,开普勒GPU可以利用ILP代替 线程/扭曲级并行(TLP)比费米GPU更容易实现 在我看来,下面的代码片段Cuda IVP对英伟达开普勒建筑的效率和性能,cuda,kepler,Cuda,Kepler,引用NVIDIA提供的“开普勒调谐指南”: 还请注意,开普勒GPU可以利用ILP代替 线程/扭曲级并行(TLP)比费米GPU更容易实现 在我看来,下面的代码片段 a = .....; a2 = f(a); a3 = g(a2); 可以改进如下 a = ...; b = ....; a2 = f(a); b2 = f(b); a3 = g(a2); b3 = g(b2); 因此,在我的项目中,我有一段代码如下(示例1) if(x
a = .....;
a2 = f(a);
a3 = g(a2);
可以改进如下
a = ...;
b = ....;
a2 = f(a);
b2 = f(b);
a3 = g(a2);
b3 = g(b2);
因此,在我的项目中,我有一段代码如下(示例1)
if(x
我将其重写如下(示例2)
if(x
在开普勒体系结构中,示例2将比示例1更高效,表现出更好的性能,是吗?关于指令级并行性的一个很好的解释可以在以下位置找到 Robert Crovella和Talonmes已经指出,你自己也承认,你上面的例子没有达到ILP 关于如何实现ILP,我将在下面展示一个经典示例,它是从PyCUDA代码翻译而来的,我已经对费米和开普勒GPU进行了测试。请注意,对于后一种情况,我没有观察到相关的加速 代码
#include <stdio.h>
#include <time.h>
#define BLOCKSIZE 64
/*******************/
/* iDivUp FUNCTION */
/*******************/
int iDivUp(int a, int b){
return ((a % b) != 0) ? (a / b + 1) : (a / b);
}
/********************/
/* CUDA ERROR CHECK */
/********************/
#define gpuErrchk(ans) { gpuAssert((ans), __FILE__, __LINE__); }
inline void gpuAssert(cudaError_t code, char *file, int line, bool abort=true)
{
if (code != cudaSuccess)
{
fprintf(stderr,"GPUassert: %s %s %d\n", cudaGetErrorString(code), file, line);
if (abort) exit(code);
}
}
/************************************/
/* NO INSTRUCTION LEVEL PARALLELISM */
/************************************/
__global__ void ILP0(float* d_a, float* d_b, float* d_c) {
int i = threadIdx.x + blockIdx.x * blockDim.x;
d_c[i] = d_a[i] + d_b[i];
}
/************************************/
/* INSTRUCTION LEVEL PARALLELISM X2 */
/************************************/
__global__ void ILP2(float* d_a, float* d_b, float* d_c) {
// --- Loading the data
int i = threadIdx.x + blockIdx.x * blockDim.x;
float ai = d_a[i];
float bi = d_b[i];
int stride = gridDim.x * blockDim.x;
int j = i + stride;
float aj = d_a[j];
float bj = d_b[j];
// --- Computing
float ci = ai + bi;
float cj = aj + bj;
// --- Writing the data
d_c[i] = ci;
d_c[j] = cj;
}
/************************************/
/* INSTRUCTION LEVEL PARALLELISM X4 */
/************************************/
__global__ void ILP4(float* d_a, float* d_b, float* d_c) {
// --- Loading the data
int i = threadIdx.x + blockIdx.x * blockDim.x;
float ai = d_a[i];
float bi = d_b[i];
int stride = gridDim.x * blockDim.x;
int j = i + stride;
float aj = d_a[j];
float bj = d_b[j];
int k = j + stride;
float ak = d_a[k];
float bk = d_b[k];
int l = k + stride;
float al = d_a[l];
float bl = d_b[l];
// --- Computing
float ci = ai + bi;
float cj = aj + bj;
float ck = ak + bk;
float cl = al + bl;
// --- Writing the data
d_c[i] = ci;
d_c[j] = cj;
d_c[k] = ck;
d_c[l] = cl;
}
/************************************/
/* INSTRUCTION LEVEL PARALLELISM X8 */
/************************************/
__global__ void ILP8(float* d_a, float* d_b, float* d_c) {
// --- Loading the data
int i = threadIdx.x + blockIdx.x * blockDim.x;
float ai = d_a[i];
float bi = d_b[i];
int stride = gridDim.x * blockDim.x;
int j = i + stride;
float aj = d_a[j];
float bj = d_b[j];
int k = j + stride;
float ak = d_a[k];
float bk = d_b[k];
int l = k + stride;
float al = d_a[l];
float bl = d_b[l];
int m = l + stride;
float am = d_a[m];
float bm = d_b[m];
int n = m + stride;
float an = d_a[n];
float bn = d_b[n];
int p = n + stride;
float ap = d_a[p];
float bp = d_b[p];
int q = p + stride;
float aq = d_a[q];
float bq = d_b[q];
// --- Computing
float ci = ai + bi;
float cj = aj + bj;
float ck = ak + bk;
float cl = al + bl;
float cm = am + bm;
float cn = an + bn;
float cp = ap + bp;
float cq = aq + bq;
// --- Writing the data
d_c[i] = ci;
d_c[j] = cj;
d_c[k] = ck;
d_c[l] = cl;
d_c[m] = cm;
d_c[n] = cn;
d_c[p] = cp;
d_c[q] = cq;
}
/********/
/* MAIN */
/********/
void main() {
float timing;
cudaEvent_t start, stop;
const int N = 65536*4; // --- ASSUMPTION: N can be divided by BLOCKSIZE
float* a = (float*)malloc(N*sizeof(float));
float* b = (float*)malloc(N*sizeof(float));
float* c = (float*)malloc(N*sizeof(float));
float* c_ref = (float*)malloc(N*sizeof(float));
srand(time(NULL));
for (int i=0; i<N; i++) {
a[i] = rand() / RAND_MAX;
b[i] = rand() / RAND_MAX;
c_ref[i] = a[i] + b[i];
}
float* d_a; gpuErrchk(cudaMalloc((void**)&d_a,N*sizeof(float)));
float* d_b; gpuErrchk(cudaMalloc((void**)&d_b,N*sizeof(float)));
float* d_c0; gpuErrchk(cudaMalloc((void**)&d_c0,N*sizeof(float)));
float* d_c2; gpuErrchk(cudaMalloc((void**)&d_c2,N*sizeof(float)));
float* d_c4; gpuErrchk(cudaMalloc((void**)&d_c4,N*sizeof(float)));
float* d_c8; gpuErrchk(cudaMalloc((void**)&d_c8,N*sizeof(float)));
gpuErrchk(cudaMemcpy(d_a, a, N*sizeof(float), cudaMemcpyHostToDevice));
gpuErrchk(cudaMemcpy(d_b, b, N*sizeof(float), cudaMemcpyHostToDevice));
/******************/
/* ILP0 TEST CASE */
/******************/
cudaEventCreate(&start);
cudaEventCreate(&stop);
cudaEventRecord(start, 0);
ILP0<<<iDivUp(N,BLOCKSIZE),BLOCKSIZE>>>(d_a, d_b, d_c0);
gpuErrchk(cudaPeekAtLastError());
gpuErrchk(cudaDeviceSynchronize());
cudaEventRecord(stop, 0);
cudaEventSynchronize(stop);
cudaEventElapsedTime(&timing, start, stop);
printf("Elapsed time - ILP0: %3.3f ms \n", timing);
gpuErrchk(cudaMemcpy(c, d_c0, N*sizeof(float), cudaMemcpyDeviceToHost));
// --- Checking the results
for (int i=0; i<N; i++)
if (c[i] != c_ref[i]) {
printf("Error!\n");
return;
}
printf("Test passed!\n");
/******************/
/* ILP2 TEST CASE */
/******************/
cudaEventRecord(start, 0);
ILP2<<<(N/2)/BLOCKSIZE,BLOCKSIZE>>>(d_a, d_b, d_c2);
gpuErrchk(cudaPeekAtLastError());
gpuErrchk(cudaDeviceSynchronize());
cudaEventRecord(stop, 0);
cudaEventSynchronize(stop);
cudaEventElapsedTime(&timing, start, stop);
printf("Elapsed time - ILP2: %3.3f ms \n", timing);
gpuErrchk(cudaMemcpy(c, d_c2, N*sizeof(float), cudaMemcpyDeviceToHost));
// --- Checking the results
for (int i=0; i<N; i++)
if (c[i] != c_ref[i]) {
printf("Error!\n");
return;
}
printf("Test passed!\n");
/******************/
/* ILP4 TEST CASE */
/******************/
cudaEventRecord(start, 0);
ILP4<<<(N/4)/BLOCKSIZE,BLOCKSIZE>>>(d_a, d_b, d_c4);
gpuErrchk(cudaPeekAtLastError());
gpuErrchk(cudaDeviceSynchronize());
cudaEventRecord(stop, 0);
cudaEventSynchronize(stop);
cudaEventElapsedTime(&timing, start, stop);
printf("Elapsed time - ILP4: %3.3f ms \n", timing);
gpuErrchk(cudaMemcpy(c, d_c4, N*sizeof(float), cudaMemcpyDeviceToHost));
// --- Checking the results
for (int i=0; i<N; i++)
if (c[i] != c_ref[i]) {
printf("Error!\n");
return;
}
printf("Test passed!\n");
/******************/
/* ILP8 TEST CASE */
/******************/
cudaEventRecord(start, 0);
ILP8<<<(N/8)/BLOCKSIZE,BLOCKSIZE>>>(d_a, d_b, d_c8);
gpuErrchk(cudaPeekAtLastError());
gpuErrchk(cudaDeviceSynchronize());
cudaEventRecord(stop, 0);
cudaEventSynchronize(stop);
cudaEventElapsedTime(&timing, start, stop);
printf("Elapsed time - ILP8: %3.3f ms \n", timing);
gpuErrchk(cudaMemcpy(c, d_c8, N*sizeof(float), cudaMemcpyDeviceToHost));
// --- Checking the results
for (int i=0; i<N; i++)
if (c[i] != c_ref[i]) {
printf("%f %f\n",c[i],c_ref[i]);
printf("Error!\n");
return;
}
printf("Test passed!\n");
}
Card Kernel Time [ms] Speedup
GeForce GT540M ILP0 4.609 1
" ILP2 2.666 1.72
" ILP4 1.675 2.76
" ILP8 1.477 3.12
Kepler K20c ILP0 0.045
" ILP2 0.043
" ILP4 0.043
" ILP8 0.042
有关指令级并行性(ILP)的详细说明,请参见 Robert Crovella和Talonmes已经指出,你自己也承认,你上面的例子没有达到ILP 关于如何实现ILP,我将在下面展示一个经典示例,它是从PyCUDA代码翻译而来的,我已经对费米和开普勒GPU进行了测试。请注意,对于后一种情况,我没有观察到相关的加速 代码
#include <stdio.h>
#include <time.h>
#define BLOCKSIZE 64
/*******************/
/* iDivUp FUNCTION */
/*******************/
int iDivUp(int a, int b){
return ((a % b) != 0) ? (a / b + 1) : (a / b);
}
/********************/
/* CUDA ERROR CHECK */
/********************/
#define gpuErrchk(ans) { gpuAssert((ans), __FILE__, __LINE__); }
inline void gpuAssert(cudaError_t code, char *file, int line, bool abort=true)
{
if (code != cudaSuccess)
{
fprintf(stderr,"GPUassert: %s %s %d\n", cudaGetErrorString(code), file, line);
if (abort) exit(code);
}
}
/************************************/
/* NO INSTRUCTION LEVEL PARALLELISM */
/************************************/
__global__ void ILP0(float* d_a, float* d_b, float* d_c) {
int i = threadIdx.x + blockIdx.x * blockDim.x;
d_c[i] = d_a[i] + d_b[i];
}
/************************************/
/* INSTRUCTION LEVEL PARALLELISM X2 */
/************************************/
__global__ void ILP2(float* d_a, float* d_b, float* d_c) {
// --- Loading the data
int i = threadIdx.x + blockIdx.x * blockDim.x;
float ai = d_a[i];
float bi = d_b[i];
int stride = gridDim.x * blockDim.x;
int j = i + stride;
float aj = d_a[j];
float bj = d_b[j];
// --- Computing
float ci = ai + bi;
float cj = aj + bj;
// --- Writing the data
d_c[i] = ci;
d_c[j] = cj;
}
/************************************/
/* INSTRUCTION LEVEL PARALLELISM X4 */
/************************************/
__global__ void ILP4(float* d_a, float* d_b, float* d_c) {
// --- Loading the data
int i = threadIdx.x + blockIdx.x * blockDim.x;
float ai = d_a[i];
float bi = d_b[i];
int stride = gridDim.x * blockDim.x;
int j = i + stride;
float aj = d_a[j];
float bj = d_b[j];
int k = j + stride;
float ak = d_a[k];
float bk = d_b[k];
int l = k + stride;
float al = d_a[l];
float bl = d_b[l];
// --- Computing
float ci = ai + bi;
float cj = aj + bj;
float ck = ak + bk;
float cl = al + bl;
// --- Writing the data
d_c[i] = ci;
d_c[j] = cj;
d_c[k] = ck;
d_c[l] = cl;
}
/************************************/
/* INSTRUCTION LEVEL PARALLELISM X8 */
/************************************/
__global__ void ILP8(float* d_a, float* d_b, float* d_c) {
// --- Loading the data
int i = threadIdx.x + blockIdx.x * blockDim.x;
float ai = d_a[i];
float bi = d_b[i];
int stride = gridDim.x * blockDim.x;
int j = i + stride;
float aj = d_a[j];
float bj = d_b[j];
int k = j + stride;
float ak = d_a[k];
float bk = d_b[k];
int l = k + stride;
float al = d_a[l];
float bl = d_b[l];
int m = l + stride;
float am = d_a[m];
float bm = d_b[m];
int n = m + stride;
float an = d_a[n];
float bn = d_b[n];
int p = n + stride;
float ap = d_a[p];
float bp = d_b[p];
int q = p + stride;
float aq = d_a[q];
float bq = d_b[q];
// --- Computing
float ci = ai + bi;
float cj = aj + bj;
float ck = ak + bk;
float cl = al + bl;
float cm = am + bm;
float cn = an + bn;
float cp = ap + bp;
float cq = aq + bq;
// --- Writing the data
d_c[i] = ci;
d_c[j] = cj;
d_c[k] = ck;
d_c[l] = cl;
d_c[m] = cm;
d_c[n] = cn;
d_c[p] = cp;
d_c[q] = cq;
}
/********/
/* MAIN */
/********/
void main() {
float timing;
cudaEvent_t start, stop;
const int N = 65536*4; // --- ASSUMPTION: N can be divided by BLOCKSIZE
float* a = (float*)malloc(N*sizeof(float));
float* b = (float*)malloc(N*sizeof(float));
float* c = (float*)malloc(N*sizeof(float));
float* c_ref = (float*)malloc(N*sizeof(float));
srand(time(NULL));
for (int i=0; i<N; i++) {
a[i] = rand() / RAND_MAX;
b[i] = rand() / RAND_MAX;
c_ref[i] = a[i] + b[i];
}
float* d_a; gpuErrchk(cudaMalloc((void**)&d_a,N*sizeof(float)));
float* d_b; gpuErrchk(cudaMalloc((void**)&d_b,N*sizeof(float)));
float* d_c0; gpuErrchk(cudaMalloc((void**)&d_c0,N*sizeof(float)));
float* d_c2; gpuErrchk(cudaMalloc((void**)&d_c2,N*sizeof(float)));
float* d_c4; gpuErrchk(cudaMalloc((void**)&d_c4,N*sizeof(float)));
float* d_c8; gpuErrchk(cudaMalloc((void**)&d_c8,N*sizeof(float)));
gpuErrchk(cudaMemcpy(d_a, a, N*sizeof(float), cudaMemcpyHostToDevice));
gpuErrchk(cudaMemcpy(d_b, b, N*sizeof(float), cudaMemcpyHostToDevice));
/******************/
/* ILP0 TEST CASE */
/******************/
cudaEventCreate(&start);
cudaEventCreate(&stop);
cudaEventRecord(start, 0);
ILP0<<<iDivUp(N,BLOCKSIZE),BLOCKSIZE>>>(d_a, d_b, d_c0);
gpuErrchk(cudaPeekAtLastError());
gpuErrchk(cudaDeviceSynchronize());
cudaEventRecord(stop, 0);
cudaEventSynchronize(stop);
cudaEventElapsedTime(&timing, start, stop);
printf("Elapsed time - ILP0: %3.3f ms \n", timing);
gpuErrchk(cudaMemcpy(c, d_c0, N*sizeof(float), cudaMemcpyDeviceToHost));
// --- Checking the results
for (int i=0; i<N; i++)
if (c[i] != c_ref[i]) {
printf("Error!\n");
return;
}
printf("Test passed!\n");
/******************/
/* ILP2 TEST CASE */
/******************/
cudaEventRecord(start, 0);
ILP2<<<(N/2)/BLOCKSIZE,BLOCKSIZE>>>(d_a, d_b, d_c2);
gpuErrchk(cudaPeekAtLastError());
gpuErrchk(cudaDeviceSynchronize());
cudaEventRecord(stop, 0);
cudaEventSynchronize(stop);
cudaEventElapsedTime(&timing, start, stop);
printf("Elapsed time - ILP2: %3.3f ms \n", timing);
gpuErrchk(cudaMemcpy(c, d_c2, N*sizeof(float), cudaMemcpyDeviceToHost));
// --- Checking the results
for (int i=0; i<N; i++)
if (c[i] != c_ref[i]) {
printf("Error!\n");
return;
}
printf("Test passed!\n");
/******************/
/* ILP4 TEST CASE */
/******************/
cudaEventRecord(start, 0);
ILP4<<<(N/4)/BLOCKSIZE,BLOCKSIZE>>>(d_a, d_b, d_c4);
gpuErrchk(cudaPeekAtLastError());
gpuErrchk(cudaDeviceSynchronize());
cudaEventRecord(stop, 0);
cudaEventSynchronize(stop);
cudaEventElapsedTime(&timing, start, stop);
printf("Elapsed time - ILP4: %3.3f ms \n", timing);
gpuErrchk(cudaMemcpy(c, d_c4, N*sizeof(float), cudaMemcpyDeviceToHost));
// --- Checking the results
for (int i=0; i<N; i++)
if (c[i] != c_ref[i]) {
printf("Error!\n");
return;
}
printf("Test passed!\n");
/******************/
/* ILP8 TEST CASE */
/******************/
cudaEventRecord(start, 0);
ILP8<<<(N/8)/BLOCKSIZE,BLOCKSIZE>>>(d_a, d_b, d_c8);
gpuErrchk(cudaPeekAtLastError());
gpuErrchk(cudaDeviceSynchronize());
cudaEventRecord(stop, 0);
cudaEventSynchronize(stop);
cudaEventElapsedTime(&timing, start, stop);
printf("Elapsed time - ILP8: %3.3f ms \n", timing);
gpuErrchk(cudaMemcpy(c, d_c8, N*sizeof(float), cudaMemcpyDeviceToHost));
// --- Checking the results
for (int i=0; i<N; i++)
if (c[i] != c_ref[i]) {
printf("%f %f\n",c[i],c_ref[i]);
printf("Error!\n");
return;
}
printf("Test passed!\n");
}
Card Kernel Time [ms] Speedup
GeForce GT540M ILP0 4.609 1
" ILP2 2.666 1.72
" ILP4 1.675 2.76
" ILP8 1.477 3.12
Kepler K20c ILP0 0.045
" ILP2 0.043
" ILP4 0.043
" ILP8 0.042
为什么不测试一下并找出答案呢?比较这两种情况应该不难。我这样说的原因是,你的论文似乎假设编译器没有做任何“智能”的事情。但编译器实际上意识到了ILP的需求,并会积极尝试重新组织代码以启用ILP。在cc3.0和更新的体系结构上尤其如此,在这些体系结构中,编译器在低级指令排序中扮演着重要角色。当然,编译器可以很容易地发现您在第一次比较中暗示的
a
和b
之间的独立性,并重新排序。感谢您的回复。我已经在我的GTX 780上对其进行了测试,两个示例之间没有明显的差异。事实上,我想知道如何实现“ILP”为了在开普勒GPU中获得更好的性能,我对我的kenerl代码进行了修改。这个问题的整个前提是有缺陷的。您似乎认为make_short4
是一条指令。事实并非如此。实际上,您所做的只是内联一个函数,这是编译器无论如何都要做的事情。ILP来自哪里?为什么不测试它并找出答案?比较这两种情况应该不难。我这样说的原因是,你的论文似乎假设编译器没有做任何“智能”的事情。但编译器实际上意识到了ILP的需求,并会积极尝试重新组织代码以启用ILP。在cc3.0和更新的体系结构上尤其如此,在这些体系结构中,编译器在低级指令排序中扮演着重要角色。当然,编译器可以很容易地发现您在第一次比较中暗示的a
和b
之间的独立性,并重新排序。感谢您的回复。我已经在我的GTX 780上对其进行了测试,两个示例之间没有明显的差异。事实上,我想知道如何实现“ILP”为了在开普勒GPU中获得更好的性能,我对我的kenerl代码进行了修改。这个问题的整个前提是有缺陷的。您似乎认为make_short4
是一条指令。事实并非如此。实际上,您所做的只是内联一个函数,这是编译器无论如何都要做的事情。ILP来自哪里?