Performance 两个相同的空内核上奇怪的cuda性能。
我有两个“空”内核,每个内核都有一个永远不会被执行的 if 语句(标签:performance、cuda):
#include <cstdio>
#include <time.h>
#include <sys/time.h>
#include <cuda.h>
// Returns the current wall-clock time in seconds as a double, built from the
// seconds and microseconds fields filled in by gettimeofday().
inline double wtime(){
    struct timeval now;
    gettimeofday(&now, NULL);
    const double wholeSeconds = now.tv_sec;
    const double microseconds = now.tv_usec;
    return wholeSeconds + microseconds * 1.0e-6;
}
// Kernel whose only statement is a device-side printf guarded by `flag`;
// when `flag` is false every thread returns without printing.
// NOTE(review): identifiers beginning with `__` are reserved in C++; the name
// is kept unchanged so existing launches keep working.
__global__ void __empty1(bool flag)
{
    if (!flag)
        return;
    printf("hh\n");
}
// Kernel with no statements that do anything: the branch on `flag` guards an
// empty body, so the kernel has no observable effect for either flag value.
__global__ void __empty2(bool flag)
{
    if (flag) {
        // Deliberately empty.
    }
}
// Prints a diagnostic to stderr and returns false when a CUDA runtime call
// reports an error; returns true on cudaSuccess.
static bool cudaOk(cudaError_t status, const char* what){
    if(status == cudaSuccess) return true;
    fprintf(stderr, "%s failed: %s\n", what, cudaGetErrorString(status));
    return false;
}

// Times the first launch of each kernel (no warm-up launch precedes either
// measurement) and prints the elapsed wall-clock time in milliseconds.
// Returns 0 on success, 1 if any CUDA call reports an error.
int main(){
    // Wait for any outstanding device work before starting the timer.
    if(!cudaOk(cudaDeviceSynchronize(), "cudaDeviceSynchronize")) return 1;

    double s = wtime();
    __empty1<<<256,256>>>(false);
    cudaError_t syncStatus = cudaDeviceSynchronize();
    double elapsedMs = 1000*(wtime()-s);
    // The launch status is read only after the timer has stopped, so the
    // check does not add to the measured interval.
    cudaError_t launchStatus = cudaGetLastError();
    if(!cudaOk(launchStatus, "__empty1 launch") ||
       !cudaOk(syncStatus, "__empty1 execution")) return 1;
    printf("empty1: %.3f\n", elapsedMs);

    if(!cudaOk(cudaDeviceSynchronize(), "cudaDeviceSynchronize")) return 1;
    s = wtime();
    __empty2<<<256,256>>>(false);
    syncStatus = cudaDeviceSynchronize();
    elapsedMs = 1000*(wtime()-s);
    launchStatus = cudaGetLastError();
    if(!cudaOk(launchStatus, "__empty2 launch") ||
       !cudaOk(syncStatus, "__empty2 execution")) return 1;
    printf("empty2: %.3f\n", elapsedMs);
    return 0;
}
第一个空内核耗时约 1 ms,而第二个内核只需约 0.02 ms:
empty1: 1.075
empty2: 0.019
这很奇怪,因为这两个内核都没有进入分支,它们的运行时间应该是相同的。
$ cat t1353.cu
#include <cstdio>
#include <time.h>
#include <sys/time.h>
#include <cuda.h>
// Returns the current wall-clock time in seconds as a double, built from the
// seconds and microseconds fields filled in by gettimeofday().
inline double wtime(){
    struct timeval now;
    gettimeofday(&now, NULL);
    const double wholeSeconds = now.tv_sec;
    const double microseconds = now.tv_usec;
    return wholeSeconds + microseconds * 1.0e-6;
}
// Kernel whose only statement is a device-side printf guarded by `flag`;
// when `flag` is false every thread returns without printing.
// NOTE(review): identifiers beginning with `__` are reserved in C++; the name
// is kept unchanged so existing launches keep working.
__global__ void __empty1(bool flag)
{
    if (!flag)
        return;
    printf("hh\n");
}
// Kernel with no statements that do anything: the branch on `flag` guards an
// empty body, so the kernel has no observable effect for either flag value.
__global__ void __empty2(bool flag)
{
    if (flag) {
        // Deliberately empty.
    }
}
// Prints a diagnostic to stderr and returns false when a CUDA runtime call
// reports an error; returns true on cudaSuccess.
static bool cudaOk(cudaError_t status, const char* what){
    if(status == cudaSuccess) return true;
    fprintf(stderr, "%s failed: %s\n", what, cudaGetErrorString(status));
    return false;
}

// Times the first launch of each kernel (no warm-up launch precedes either
// measurement) and prints the elapsed wall-clock time in milliseconds.
// Returns 0 on success, 1 if any CUDA call reports an error.
int main(){
    // Wait for any outstanding device work before starting the timer.
    if(!cudaOk(cudaDeviceSynchronize(), "cudaDeviceSynchronize")) return 1;

    double s = wtime();
    __empty1<<<256,256>>>(false);
    cudaError_t syncStatus = cudaDeviceSynchronize();
    double elapsedMs = 1000*(wtime()-s);
    // The launch status is read only after the timer has stopped, so the
    // check does not add to the measured interval.
    cudaError_t launchStatus = cudaGetLastError();
    if(!cudaOk(launchStatus, "__empty1 launch") ||
       !cudaOk(syncStatus, "__empty1 execution")) return 1;
    printf("empty1: %.3f\n", elapsedMs);

    if(!cudaOk(cudaDeviceSynchronize(), "cudaDeviceSynchronize")) return 1;
    s = wtime();
    __empty2<<<256,256>>>(false);
    syncStatus = cudaDeviceSynchronize();
    elapsedMs = 1000*(wtime()-s);
    launchStatus = cudaGetLastError();
    if(!cudaOk(launchStatus, "__empty2 launch") ||
       !cudaOk(syncStatus, "__empty2 execution")) return 1;
    printf("empty2: %.3f\n", elapsedMs);
    return 0;
}
$ nvcc -arch=sm_35 -o t1353 t1353.cu
$ cuobjdump -sass t1353
Fatbin elf code:
================
arch = sm_35
code version = [1,7]
producer = <unknown>
host = linux
compile_size = 64bit
code for sm_35
Fatbin elf code:
================
arch = sm_35
code version = [1,7]
producer = cuda
host = linux
compile_size = 64bit
code for sm_35
Function : _Z8__empty2b
.headerflags @"EF_CUDA_SM35 EF_CUDA_PTX_SM(EF_CUDA_SM35)"
/* 0x0800000000b81000 */
/*0008*/ MOV R1, c[0x0][0x44]; /* 0x64c03c00089c0006 */
/*0010*/ MOV RZ, RZ; /* 0xe4c03c007f9c03fe */
/*0018*/ EXIT; /* 0x18000000001c003c */
/*0020*/ BRA 0x20; /* 0x12007ffffc1c003c */
/*0028*/ NOP; /* 0x85800000001c3c02 */
/*0030*/ NOP; /* 0x85800000001c3c02 */
/*0038*/ NOP; /* 0x85800000001c3c02 */
.............................
Function : _Z8__empty1b
.headerflags @"EF_CUDA_SM35 EF_CUDA_PTX_SM(EF_CUDA_SM35)"
/* 0x08b8b0a0a0a0a000 */
/*0008*/ MOV R1, c[0x0][0x44]; /* 0x64c03c00089c0006 */
/*0010*/ LDC.S8 R0, c[0x0][0x140]; /* 0x7c880000a01ffc02 */
/*0018*/ I2I.U16.S8 R0, R0; /* 0xe6000000001c8402 */
/*0020*/ LOP.AND R0, R0, 0xff; /* 0xc20000007f9c0001 */
/*0028*/ I2I.S32.S16 R0, R0; /* 0xe6000000001cd802 */
/*0030*/ ISETP.EQ.AND P0, PT, R0, RZ, PT; /* 0xdb281c007f9c001e */
/*0038*/ @P0 EXIT; /* 0x180000000000003c */
/* 0x08b810b800108010 */
/*0048*/ MOV32I R4, 0x0; /* 0x74000000001fc012 */
/*0050*/ MOV32I R5, 0x0; /* 0x74000000001fc016 */
/*0058*/ MOV R7, RZ; /* 0xe4c03c007f9c001e */
/*0060*/ MOV R6, RZ; /* 0xe4c03c007f9c001a */
/*0068*/ JCAL 0x0; /* 0x1100000000000100 */
/*0070*/ MOV RZ, RZ; /* 0xe4c03c007f9c03fe */
/*0078*/ EXIT; /* 0x18000000001c003c */
/*0080*/ BRA 0x80; /* 0x12007ffffc1c003c */
/*0088*/ NOP; /* 0x85800000001c3c02 */
/*0090*/ NOP; /* 0x85800000001c3c02 */
/*0098*/ NOP; /* 0x85800000001c3c02 */
/*00a0*/ NOP; /* 0x85800000001c3c02 */
/*00a8*/ NOP; /* 0x85800000001c3c02 */
/*00b0*/ NOP; /* 0x85800000001c3c02 */
/*00b8*/ NOP; /* 0x85800000001c3c02 */
.............................
Fatbin ptx code:
================
arch = sm_35
code version = [5,0]
producer = cuda
host = linux
compile_size = 64bit
compressed
$
$ cat t1353.cu
#include <cstdio>
#include <time.h>
#include <sys/time.h>
#include <cuda.h>
// Returns the current wall-clock time in seconds as a double, built from the
// seconds and microseconds fields filled in by gettimeofday().
inline double wtime(){
    struct timeval now;
    gettimeofday(&now, NULL);
    const double wholeSeconds = now.tv_sec;
    const double microseconds = now.tv_usec;
    return wholeSeconds + microseconds * 1.0e-6;
}
// Kernel whose only statement is a device-side printf guarded by `flag`;
// when `flag` is false every thread returns without printing.
// NOTE(review): identifiers beginning with `__` are reserved in C++; the name
// is kept unchanged so existing launches keep working.
__global__ void __empty1(bool flag)
{
    if (!flag)
        return;
    printf("hh\n");
}
// Kernel with no statements that do anything: the branch on `flag` guards an
// empty body, so the kernel has no observable effect for either flag value.
__global__ void __empty2(bool flag)
{
    if (flag) {
        // Deliberately empty.
    }
}
// Prints a diagnostic to stderr and returns false when a CUDA runtime call
// reports an error; returns true on cudaSuccess.
static bool cudaOk(cudaError_t status, const char* what){
    if(status == cudaSuccess) return true;
    fprintf(stderr, "%s failed: %s\n", what, cudaGetErrorString(status));
    return false;
}

// Launches each kernel once untimed as a warm-up, then times a second launch
// of the same kernel and prints the elapsed wall-clock time in milliseconds.
// Returns 0 on success, 1 if any CUDA call reports an error.
int main(){
    // Warm-up launch of __empty1; it happens before the timer starts.
    __empty1<<<256,256>>>(false);
    if(!cudaOk(cudaDeviceSynchronize(), "__empty1 warm-up execution") ||
       !cudaOk(cudaGetLastError(), "__empty1 warm-up launch")) return 1;

    double s = wtime();
    __empty1<<<256,256>>>(false);
    cudaError_t syncStatus = cudaDeviceSynchronize();
    double elapsedMs = 1000*(wtime()-s);
    // The launch status is read only after the timer has stopped, so the
    // check does not add to the measured interval.
    cudaError_t launchStatus = cudaGetLastError();
    if(!cudaOk(launchStatus, "__empty1 launch") ||
       !cudaOk(syncStatus, "__empty1 execution")) return 1;
    printf("empty1: %.3f\n", elapsedMs);

    // Warm-up launch of __empty2; it happens before the timer restarts.
    __empty2<<<256,256>>>(false);
    if(!cudaOk(cudaDeviceSynchronize(), "__empty2 warm-up execution") ||
       !cudaOk(cudaGetLastError(), "__empty2 warm-up launch")) return 1;

    s = wtime();
    __empty2<<<256,256>>>(false);
    syncStatus = cudaDeviceSynchronize();
    elapsedMs = 1000*(wtime()-s);
    launchStatus = cudaGetLastError();
    if(!cudaOk(launchStatus, "__empty2 launch") ||
       !cudaOk(syncStatus, "__empty2 execution")) return 1;
    printf("empty2: %.3f\n", elapsedMs);
    return 0;
}
$ nvcc -arch=sm_35 -o t1353 t1353.cu
$ ./t1353
empty1: 0.023
empty2: 0.015
$
在 empty1 的情况下,耗时更长:即使分支没有被执行,内核在退出之前仍要执行下面这些指令:
/*0008*/ MOV R1, c[0x0][0x44]; /* 0x64c03c00089c0006 */
/*0010*/ LDC.S8 R0, c[0x0][0x140]; /* 0x7c880000a01ffc02 */
/*0018*/ I2I.U16.S8 R0, R0; /* 0xe6000000001c8402 */
/*0020*/ LOP.AND R0, R0, 0xff; /* 0xc20000007f9c0001 */
/*0028*/ I2I.S32.S16 R0, R0; /* 0xe6000000001cd802 */
/*0030*/ ISETP.EQ.AND P0, PT, R0, RZ, PT; /* 0xdb281c007f9c001e */
/*0038*/ @P0 EXIT;
...
/*0078*/ EXIT; /* 0x18000000001c003c */
如果先对 empty1 运行一次“预热”调用,则两种情况之间的测量时间几乎相同。这可以用代码长度的差异来解释。
$ cat t1353.cu
#include <cstdio>
#include <time.h>
#include <sys/time.h>
#include <cuda.h>
// Returns the current wall-clock time in seconds as a double, built from the
// seconds and microseconds fields filled in by gettimeofday().
inline double wtime(){
    struct timeval now;
    gettimeofday(&now, NULL);
    const double wholeSeconds = now.tv_sec;
    const double microseconds = now.tv_usec;
    return wholeSeconds + microseconds * 1.0e-6;
}
// Kernel whose only statement is a device-side printf guarded by `flag`;
// when `flag` is false every thread returns without printing.
// NOTE(review): identifiers beginning with `__` are reserved in C++; the name
// is kept unchanged so existing launches keep working.
__global__ void __empty1(bool flag)
{
    if (!flag)
        return;
    printf("hh\n");
}
// Kernel with no statements that do anything: the branch on `flag` guards an
// empty body, so the kernel has no observable effect for either flag value.
__global__ void __empty2(bool flag)
{
    if (flag) {
        // Deliberately empty.
    }
}
// Prints a diagnostic to stderr and returns false when a CUDA runtime call
// reports an error; returns true on cudaSuccess.
static bool cudaOk(cudaError_t status, const char* what){
    if(status == cudaSuccess) return true;
    fprintf(stderr, "%s failed: %s\n", what, cudaGetErrorString(status));
    return false;
}

// Times the first launch of each kernel (no warm-up launch precedes either
// measurement) and prints the elapsed wall-clock time in milliseconds.
// Returns 0 on success, 1 if any CUDA call reports an error.
int main(){
    // Wait for any outstanding device work before starting the timer.
    if(!cudaOk(cudaDeviceSynchronize(), "cudaDeviceSynchronize")) return 1;

    double s = wtime();
    __empty1<<<256,256>>>(false);
    cudaError_t syncStatus = cudaDeviceSynchronize();
    double elapsedMs = 1000*(wtime()-s);
    // The launch status is read only after the timer has stopped, so the
    // check does not add to the measured interval.
    cudaError_t launchStatus = cudaGetLastError();
    if(!cudaOk(launchStatus, "__empty1 launch") ||
       !cudaOk(syncStatus, "__empty1 execution")) return 1;
    printf("empty1: %.3f\n", elapsedMs);

    if(!cudaOk(cudaDeviceSynchronize(), "cudaDeviceSynchronize")) return 1;
    s = wtime();
    __empty2<<<256,256>>>(false);
    syncStatus = cudaDeviceSynchronize();
    elapsedMs = 1000*(wtime()-s);
    launchStatus = cudaGetLastError();
    if(!cudaOk(launchStatus, "__empty2 launch") ||
       !cudaOk(syncStatus, "__empty2 execution")) return 1;
    printf("empty2: %.3f\n", elapsedMs);
    return 0;
}
$ nvcc -arch=sm_35 -o t1353 t1353.cu
$ cuobjdump -sass t1353
Fatbin elf code:
================
arch = sm_35
code version = [1,7]
producer = <unknown>
host = linux
compile_size = 64bit
code for sm_35
Fatbin elf code:
================
arch = sm_35
code version = [1,7]
producer = cuda
host = linux
compile_size = 64bit
code for sm_35
Function : _Z8__empty2b
.headerflags @"EF_CUDA_SM35 EF_CUDA_PTX_SM(EF_CUDA_SM35)"
/* 0x0800000000b81000 */
/*0008*/ MOV R1, c[0x0][0x44]; /* 0x64c03c00089c0006 */
/*0010*/ MOV RZ, RZ; /* 0xe4c03c007f9c03fe */
/*0018*/ EXIT; /* 0x18000000001c003c */
/*0020*/ BRA 0x20; /* 0x12007ffffc1c003c */
/*0028*/ NOP; /* 0x85800000001c3c02 */
/*0030*/ NOP; /* 0x85800000001c3c02 */
/*0038*/ NOP; /* 0x85800000001c3c02 */
.............................
Function : _Z8__empty1b
.headerflags @"EF_CUDA_SM35 EF_CUDA_PTX_SM(EF_CUDA_SM35)"
/* 0x08b8b0a0a0a0a000 */
/*0008*/ MOV R1, c[0x0][0x44]; /* 0x64c03c00089c0006 */
/*0010*/ LDC.S8 R0, c[0x0][0x140]; /* 0x7c880000a01ffc02 */
/*0018*/ I2I.U16.S8 R0, R0; /* 0xe6000000001c8402 */
/*0020*/ LOP.AND R0, R0, 0xff; /* 0xc20000007f9c0001 */
/*0028*/ I2I.S32.S16 R0, R0; /* 0xe6000000001cd802 */
/*0030*/ ISETP.EQ.AND P0, PT, R0, RZ, PT; /* 0xdb281c007f9c001e */
/*0038*/ @P0 EXIT; /* 0x180000000000003c */
/* 0x08b810b800108010 */
/*0048*/ MOV32I R4, 0x0; /* 0x74000000001fc012 */
/*0050*/ MOV32I R5, 0x0; /* 0x74000000001fc016 */
/*0058*/ MOV R7, RZ; /* 0xe4c03c007f9c001e */
/*0060*/ MOV R6, RZ; /* 0xe4c03c007f9c001a */
/*0068*/ JCAL 0x0; /* 0x1100000000000100 */
/*0070*/ MOV RZ, RZ; /* 0xe4c03c007f9c03fe */
/*0078*/ EXIT; /* 0x18000000001c003c */
/*0080*/ BRA 0x80; /* 0x12007ffffc1c003c */
/*0088*/ NOP; /* 0x85800000001c3c02 */
/*0090*/ NOP; /* 0x85800000001c3c02 */
/*0098*/ NOP; /* 0x85800000001c3c02 */
/*00a0*/ NOP; /* 0x85800000001c3c02 */
/*00a8*/ NOP; /* 0x85800000001c3c02 */
/*00b0*/ NOP; /* 0x85800000001c3c02 */
/*00b8*/ NOP; /* 0x85800000001c3c02 */
.............................
Fatbin ptx code:
================
arch = sm_35
code version = [5,0]
producer = cuda
host = linux
compile_size = 64bit
compressed
$
$ cat t1353.cu
#include <cstdio>
#include <time.h>
#include <sys/time.h>
#include <cuda.h>
// Returns the current wall-clock time in seconds as a double, built from the
// seconds and microseconds fields filled in by gettimeofday().
inline double wtime(){
    struct timeval now;
    gettimeofday(&now, NULL);
    const double wholeSeconds = now.tv_sec;
    const double microseconds = now.tv_usec;
    return wholeSeconds + microseconds * 1.0e-6;
}
// Kernel whose only statement is a device-side printf guarded by `flag`;
// when `flag` is false every thread returns without printing.
// NOTE(review): identifiers beginning with `__` are reserved in C++; the name
// is kept unchanged so existing launches keep working.
__global__ void __empty1(bool flag)
{
    if (!flag)
        return;
    printf("hh\n");
}
// Kernel with no statements that do anything: the branch on `flag` guards an
// empty body, so the kernel has no observable effect for either flag value.
__global__ void __empty2(bool flag)
{
    if (flag) {
        // Deliberately empty.
    }
}
// Prints a diagnostic to stderr and returns false when a CUDA runtime call
// reports an error; returns true on cudaSuccess.
static bool cudaOk(cudaError_t status, const char* what){
    if(status == cudaSuccess) return true;
    fprintf(stderr, "%s failed: %s\n", what, cudaGetErrorString(status));
    return false;
}

// Launches each kernel once untimed as a warm-up, then times a second launch
// of the same kernel and prints the elapsed wall-clock time in milliseconds.
// Returns 0 on success, 1 if any CUDA call reports an error.
int main(){
    // Warm-up launch of __empty1; it happens before the timer starts.
    __empty1<<<256,256>>>(false);
    if(!cudaOk(cudaDeviceSynchronize(), "__empty1 warm-up execution") ||
       !cudaOk(cudaGetLastError(), "__empty1 warm-up launch")) return 1;

    double s = wtime();
    __empty1<<<256,256>>>(false);
    cudaError_t syncStatus = cudaDeviceSynchronize();
    double elapsedMs = 1000*(wtime()-s);
    // The launch status is read only after the timer has stopped, so the
    // check does not add to the measured interval.
    cudaError_t launchStatus = cudaGetLastError();
    if(!cudaOk(launchStatus, "__empty1 launch") ||
       !cudaOk(syncStatus, "__empty1 execution")) return 1;
    printf("empty1: %.3f\n", elapsedMs);

    // Warm-up launch of __empty2; it happens before the timer restarts.
    __empty2<<<256,256>>>(false);
    if(!cudaOk(cudaDeviceSynchronize(), "__empty2 warm-up execution") ||
       !cudaOk(cudaGetLastError(), "__empty2 warm-up launch")) return 1;

    s = wtime();
    __empty2<<<256,256>>>(false);
    syncStatus = cudaDeviceSynchronize();
    elapsedMs = 1000*(wtime()-s);
    launchStatus = cudaGetLastError();
    if(!cudaOk(launchStatus, "__empty2 launch") ||
       !cudaOk(syncStatus, "__empty2 execution")) return 1;
    printf("empty2: %.3f\n", elapsedMs);
    return 0;
}
$ nvcc -arch=sm_35 -o t1353 t1353.cu
$ ./t1353
empty1: 0.023
empty2: 0.015
$
$ cat t1353.cu
#include <cstdio>
#include <time.h>
#include <sys/time.h>
#include <cuda.h>
inline double wtime(){
double time[2];
struct timeval time1;
gettimeofday(&time1, NULL);
time[0]=time1.tv_sec;
time[1]=time1.tv_usec;
return time[0]+time[1]*1.0e-6;
}
__global__ void __empty1(bool flag){if(flag){ printf("hh\n");}}
__global__ void __empty2(bool flag){if(flag){ ; }}
int main(){
__empty1<<<256,256>>>(false);
cudaDeviceSynchronize();
double s = wtime();
__empty1<<<256,256>>>(false);
cudaDeviceSynchronize();
printf("empty1: %.3f\n", 1000*(wtime()-s));
__empty2<<<256,256>>>(false);
cudaDeviceSynchronize();
s = wtime();
__empty2<<<256,256>>>(false);
cudaDeviceSynchronize();
printf("empty2: %.3f\n", 1000*(wtime()-s));
return 0;
}
$ nvcc -arch=sm_35 -o t1353 t1353.cu
$ ./t1353
empty1: 0.023
empty2: 0.015
$
是什么让您认为运行时间应该是相同的?printf 可能带有很多被无条件包含并执行的支持代码。除非您已经反汇编并检查了 GPU 在这两种情况下实际运行的 SASS 代码,否则您无法做出这样的假设。