Warning: file_get_contents(/data/phpspider/zhask/data//catemap/0/backbone.js/2.json): failed to open stream: No such file or directory in /data/phpspider/zhask/libs/function.php on line 167

Warning: Invalid argument supplied for foreach() in /data/phpspider/zhask/libs/tag.function.php on line 1116

Notice: Undefined index: in /data/phpspider/zhask/libs/function.php on line 180

Warning: array_chunk() expects parameter 1 to be array, null given in /data/phpspider/zhask/libs/function.php on line 181
Performance 两个相同的空内核上奇怪的cuda性能。_Performance_Cuda - Fatal编程技术网

Performance 两个相同的空内核上奇怪的cuda性能。

Performance 两个相同的空内核上奇怪的cuda性能。,performance,cuda,Performance,Cuda,我有两个“空”内核,每个内核都有一个永远不会被触及的if语句 #include <cstdio> #include <time.h> #include <sys/time.h> #include <cuda.h> inline double wtime(){ double time[2]; struct timeval time1; gettimeofday(&time1, NULL); time[0]=time1.tv

我有两个“空”内核,每个内核都只包含一个分支永远不会被执行的 if 语句:

#include <cstdio>
#include <time.h>
#include <sys/time.h>
#include <cuda.h>

// Returns the current time of day in seconds as a double, built from the
// seconds and microseconds fields filled in by gettimeofday().
inline double wtime(){
  struct timeval now;
  gettimeofday(&now, NULL);

  const double seconds = now.tv_sec;
  const double micros  = now.tv_usec;

  // Scale the microsecond field down to seconds before adding it on.
  return seconds + micros*1.0e-6;
}

// Kernel whose only statement is a guarded device-side printf: a thread
// prints "hh" when `flag` is true and does nothing otherwise.
__global__ void __empty1(bool flag)
{
  if (flag) {
    printf("hh\n");
  }
}
// Kernel with the same `flag` test as __empty1 but nothing inside the
// branch, so it performs no work for either value of `flag`.
__global__ void __empty2(bool flag)
{
  if (flag) {
    // intentionally empty
  }
}

// Launches each kernel once with flag == false (256 blocks of 256 threads)
// and prints the host-side elapsed time of each launch in milliseconds.
int main(){

  // --- __empty1: synchronize, start the clock, launch, wait, report ---
  // NOTE(review): this is the first CUDA work in the process, so any
  // one-time runtime initialization may be included in this interval.
  cudaDeviceSynchronize();
  double start = wtime();
  __empty1<<<256,256>>>(false);
  cudaDeviceSynchronize();
  double elapsed_ms = 1000*(wtime()-start);
  printf("empty1: %.3f\n", elapsed_ms);

  // --- __empty2: same measurement sequence ---
  cudaDeviceSynchronize();
  start = wtime();
  __empty2<<<256,256>>>(false);
  cudaDeviceSynchronize();
  elapsed_ms = 1000*(wtime()-start);
  printf("empty2: %.3f\n", elapsed_ms);

  return 0;
}
第一个空内核耗时约 1 ms,而第二个内核只需约 0.02 ms:

empty1: 1.075
empty2: 0.019
这很奇怪,因为这两个内核都不会进入分支,它们的运行时间应该是相同的。

  • 这两个内核编译后生成的代码并不相同。正如评论中指出的,查看 SASS(GPU 机器码的反汇编)就可以发现这一点
  • 例如:

    $ cat t1353.cu
    #include <cstdio>
    #include <time.h>
    #include <sys/time.h>
    #include <cuda.h>
    
    inline double wtime(){
      double time[2];
      struct timeval time1;
      gettimeofday(&time1, NULL);
    
      time[0]=time1.tv_sec;
      time[1]=time1.tv_usec;
    
      return time[0]+time[1]*1.0e-6;
    }
    
    __global__ void __empty1(bool flag){if(flag){ printf("hh\n");}}
    __global__ void __empty2(bool flag){if(flag){ ; }}
    
    int main(){
    
      cudaDeviceSynchronize();
      double s = wtime();
      __empty1<<<256,256>>>(false);
      cudaDeviceSynchronize();
      printf("empty1: %.3f\n", 1000*(wtime()-s));
    
      cudaDeviceSynchronize();
      s = wtime();
      __empty2<<<256,256>>>(false);
      cudaDeviceSynchronize();
      printf("empty2: %.3f\n", 1000*(wtime()-s));
      return 0;
    }
    $ nvcc -arch=sm_35 -o t1353 t1353.cu
    $ cuobjdump -sass t1353
    
    Fatbin elf code:
    ================
    arch = sm_35
    code version = [1,7]
    producer = <unknown>
    host = linux
    compile_size = 64bit
    
            code for sm_35
    
    Fatbin elf code:
    ================
    arch = sm_35
    code version = [1,7]
    producer = cuda
    host = linux
    compile_size = 64bit
    
            code for sm_35
                    Function : _Z8__empty2b
            .headerflags    @"EF_CUDA_SM35 EF_CUDA_PTX_SM(EF_CUDA_SM35)"
                                                              /* 0x0800000000b81000 */
            /*0008*/                   MOV R1, c[0x0][0x44];  /* 0x64c03c00089c0006 */
            /*0010*/                   MOV RZ, RZ;            /* 0xe4c03c007f9c03fe */
            /*0018*/                   EXIT;                  /* 0x18000000001c003c */
            /*0020*/                   BRA 0x20;              /* 0x12007ffffc1c003c */
            /*0028*/                   NOP;                   /* 0x85800000001c3c02 */
            /*0030*/                   NOP;                   /* 0x85800000001c3c02 */
            /*0038*/                   NOP;                   /* 0x85800000001c3c02 */
                    .............................
    
    
                    Function : _Z8__empty1b
            .headerflags    @"EF_CUDA_SM35 EF_CUDA_PTX_SM(EF_CUDA_SM35)"
                                                                         /* 0x08b8b0a0a0a0a000 */
            /*0008*/                   MOV R1, c[0x0][0x44];             /* 0x64c03c00089c0006 */
            /*0010*/                   LDC.S8 R0, c[0x0][0x140];         /* 0x7c880000a01ffc02 */
            /*0018*/                   I2I.U16.S8 R0, R0;                /* 0xe6000000001c8402 */
            /*0020*/                   LOP.AND R0, R0, 0xff;             /* 0xc20000007f9c0001 */
            /*0028*/                   I2I.S32.S16 R0, R0;               /* 0xe6000000001cd802 */
            /*0030*/                   ISETP.EQ.AND P0, PT, R0, RZ, PT;  /* 0xdb281c007f9c001e */
            /*0038*/               @P0 EXIT;                             /* 0x180000000000003c */
                                                                         /* 0x08b810b800108010 */
            /*0048*/                   MOV32I R4, 0x0;                   /* 0x74000000001fc012 */
            /*0050*/                   MOV32I R5, 0x0;                   /* 0x74000000001fc016 */
            /*0058*/                   MOV R7, RZ;                       /* 0xe4c03c007f9c001e */
            /*0060*/                   MOV R6, RZ;                       /* 0xe4c03c007f9c001a */
            /*0068*/                   JCAL 0x0;                         /* 0x1100000000000100 */
            /*0070*/                   MOV RZ, RZ;                       /* 0xe4c03c007f9c03fe */
            /*0078*/                   EXIT;                             /* 0x18000000001c003c */
            /*0080*/                   BRA 0x80;                         /* 0x12007ffffc1c003c */
            /*0088*/                   NOP;                              /* 0x85800000001c3c02 */
            /*0090*/                   NOP;                              /* 0x85800000001c3c02 */
            /*0098*/                   NOP;                              /* 0x85800000001c3c02 */
            /*00a0*/                   NOP;                              /* 0x85800000001c3c02 */
            /*00a8*/                   NOP;                              /* 0x85800000001c3c02 */
            /*00b0*/                   NOP;                              /* 0x85800000001c3c02 */
            /*00b8*/                   NOP;                              /* 0x85800000001c3c02 */
                    .............................
    
    
    
    Fatbin ptx code:
    ================
    arch = sm_35
    code version = [5,0]
    producer = cuda
    host = linux
    compile_size = 64bit
    compressed
    $
    
    $ cat t1353.cu
    #include <cstdio>
    #include <time.h>
    #include <sys/time.h>
    #include <cuda.h>
    
    inline double wtime(){
      double time[2];
      struct timeval time1;
      gettimeofday(&time1, NULL);
    
      time[0]=time1.tv_sec;
      time[1]=time1.tv_usec;
    
      return time[0]+time[1]*1.0e-6;
    }
    
    __global__ void __empty1(bool flag){if(flag){ printf("hh\n");}}
    __global__ void __empty2(bool flag){if(flag){ ; }}
    
    int main(){
    
      __empty1<<<256,256>>>(false);
      cudaDeviceSynchronize();
      double s = wtime();
      __empty1<<<256,256>>>(false);
      cudaDeviceSynchronize();
      printf("empty1: %.3f\n", 1000*(wtime()-s));
    
      __empty2<<<256,256>>>(false);
      cudaDeviceSynchronize();
      s = wtime();
      __empty2<<<256,256>>>(false);
      cudaDeviceSynchronize();
      printf("empty2: %.3f\n", 1000*(wtime()-s));
      return 0;
    }
    $ nvcc -arch=sm_35 -o t1353 t1353.cu
    $ ./t1353
    empty1: 0.023
    empty2: 0.015
    $
    
    可以看到,在
    empty1
    情况下,执行到 EXIT 指令之前的代码更长:

            /*0008*/                   MOV R1, c[0x0][0x44];             /* 0x64c03c00089c0006 */
            /*0010*/                   LDC.S8 R0, c[0x0][0x140];         /* 0x7c880000a01ffc02 */
            /*0018*/                   I2I.U16.S8 R0, R0;                /* 0xe6000000001c8402 */
            /*0020*/                   LOP.AND R0, R0, 0xff;             /* 0xc20000007f9c0001 */
            /*0028*/                   I2I.S32.S16 R0, R0;               /* 0xe6000000001cd802 */
            /*0030*/                   ISETP.EQ.AND P0, PT, R0, RZ, PT;  /* 0xdb281c007f9c001e */
            /*0038*/               @P0 EXIT;                            
    
    ...
            /*0078*/                   EXIT;                             /* 0x18000000001c003c */
    
  • 这里更大的问题可能是计时的严谨性/正确性。CUDA 采用延迟初始化,这意味着 CUDA 代码中的第一批调用可能会产生比平时更多的时间开销。根据我的测试,这影响了这里的结论。如果在实际计时之前先对
    empty1
    做一次“预热”调用,则两种情况下测得的时间几乎相同,剩余的微小差异可以用代码长度的差异来解释
  • 例如:

    $ cat t1353.cu
    #include <cstdio>
    #include <time.h>
    #include <sys/time.h>
    #include <cuda.h>
    
    inline double wtime(){
      double time[2];
      struct timeval time1;
      gettimeofday(&time1, NULL);
    
      time[0]=time1.tv_sec;
      time[1]=time1.tv_usec;
    
      return time[0]+time[1]*1.0e-6;
    }
    
    __global__ void __empty1(bool flag){if(flag){ printf("hh\n");}}
    __global__ void __empty2(bool flag){if(flag){ ; }}
    
    int main(){
    
      cudaDeviceSynchronize();
      double s = wtime();
      __empty1<<<256,256>>>(false);
      cudaDeviceSynchronize();
      printf("empty1: %.3f\n", 1000*(wtime()-s));
    
      cudaDeviceSynchronize();
      s = wtime();
      __empty2<<<256,256>>>(false);
      cudaDeviceSynchronize();
      printf("empty2: %.3f\n", 1000*(wtime()-s));
      return 0;
    }
    $ nvcc -arch=sm_35 -o t1353 t1353.cu
    $ cuobjdump -sass t1353
    
    Fatbin elf code:
    ================
    arch = sm_35
    code version = [1,7]
    producer = <unknown>
    host = linux
    compile_size = 64bit
    
            code for sm_35
    
    Fatbin elf code:
    ================
    arch = sm_35
    code version = [1,7]
    producer = cuda
    host = linux
    compile_size = 64bit
    
            code for sm_35
                    Function : _Z8__empty2b
            .headerflags    @"EF_CUDA_SM35 EF_CUDA_PTX_SM(EF_CUDA_SM35)"
                                                              /* 0x0800000000b81000 */
            /*0008*/                   MOV R1, c[0x0][0x44];  /* 0x64c03c00089c0006 */
            /*0010*/                   MOV RZ, RZ;            /* 0xe4c03c007f9c03fe */
            /*0018*/                   EXIT;                  /* 0x18000000001c003c */
            /*0020*/                   BRA 0x20;              /* 0x12007ffffc1c003c */
            /*0028*/                   NOP;                   /* 0x85800000001c3c02 */
            /*0030*/                   NOP;                   /* 0x85800000001c3c02 */
            /*0038*/                   NOP;                   /* 0x85800000001c3c02 */
                    .............................
    
    
                    Function : _Z8__empty1b
            .headerflags    @"EF_CUDA_SM35 EF_CUDA_PTX_SM(EF_CUDA_SM35)"
                                                                         /* 0x08b8b0a0a0a0a000 */
            /*0008*/                   MOV R1, c[0x0][0x44];             /* 0x64c03c00089c0006 */
            /*0010*/                   LDC.S8 R0, c[0x0][0x140];         /* 0x7c880000a01ffc02 */
            /*0018*/                   I2I.U16.S8 R0, R0;                /* 0xe6000000001c8402 */
            /*0020*/                   LOP.AND R0, R0, 0xff;             /* 0xc20000007f9c0001 */
            /*0028*/                   I2I.S32.S16 R0, R0;               /* 0xe6000000001cd802 */
            /*0030*/                   ISETP.EQ.AND P0, PT, R0, RZ, PT;  /* 0xdb281c007f9c001e */
            /*0038*/               @P0 EXIT;                             /* 0x180000000000003c */
                                                                         /* 0x08b810b800108010 */
            /*0048*/                   MOV32I R4, 0x0;                   /* 0x74000000001fc012 */
            /*0050*/                   MOV32I R5, 0x0;                   /* 0x74000000001fc016 */
            /*0058*/                   MOV R7, RZ;                       /* 0xe4c03c007f9c001e */
            /*0060*/                   MOV R6, RZ;                       /* 0xe4c03c007f9c001a */
            /*0068*/                   JCAL 0x0;                         /* 0x1100000000000100 */
            /*0070*/                   MOV RZ, RZ;                       /* 0xe4c03c007f9c03fe */
            /*0078*/                   EXIT;                             /* 0x18000000001c003c */
            /*0080*/                   BRA 0x80;                         /* 0x12007ffffc1c003c */
            /*0088*/                   NOP;                              /* 0x85800000001c3c02 */
            /*0090*/                   NOP;                              /* 0x85800000001c3c02 */
            /*0098*/                   NOP;                              /* 0x85800000001c3c02 */
            /*00a0*/                   NOP;                              /* 0x85800000001c3c02 */
            /*00a8*/                   NOP;                              /* 0x85800000001c3c02 */
            /*00b0*/                   NOP;                              /* 0x85800000001c3c02 */
            /*00b8*/                   NOP;                              /* 0x85800000001c3c02 */
                    .............................
    
    
    
    Fatbin ptx code:
    ================
    arch = sm_35
    code version = [5,0]
    producer = cuda
    host = linux
    compile_size = 64bit
    compressed
    $
    
    $ cat t1353.cu
    #include <cstdio>
    #include <time.h>
    #include <sys/time.h>
    #include <cuda.h>
    
    inline double wtime(){
      double time[2];
      struct timeval time1;
      gettimeofday(&time1, NULL);
    
      time[0]=time1.tv_sec;
      time[1]=time1.tv_usec;
    
      return time[0]+time[1]*1.0e-6;
    }
    
    __global__ void __empty1(bool flag){if(flag){ printf("hh\n");}}
    __global__ void __empty2(bool flag){if(flag){ ; }}
    
    int main(){
    
      __empty1<<<256,256>>>(false);
      cudaDeviceSynchronize();
      double s = wtime();
      __empty1<<<256,256>>>(false);
      cudaDeviceSynchronize();
      printf("empty1: %.3f\n", 1000*(wtime()-s));
    
      __empty2<<<256,256>>>(false);
      cudaDeviceSynchronize();
      s = wtime();
      __empty2<<<256,256>>>(false);
      cudaDeviceSynchronize();
      printf("empty2: %.3f\n", 1000*(wtime()-s));
      return 0;
    }
    $ nvcc -arch=sm_35 -o t1353 t1353.cu
    $ ./t1353
    empty1: 0.023
    empty2: 0.015
    $
    
    $ cat t1353.cu
    #include <cstdio>
    #include <time.h>
    #include <sys/time.h>
    #include <cuda.h>
    inline double wtime(){
      double time[2];
      struct timeval time1;
      gettimeofday(&time1, NULL);
      time[0]=time1.tv_sec;
      time[1]=time1.tv_usec;
      return time[0]+time[1]*1.0e-6;
    }
    __global__ void __empty1(bool flag){if(flag){ printf("hh\n");}}
    __global__ void __empty2(bool flag){if(flag){ ; }}
    int main(){
      __empty1<<<256,256>>>(false);
      cudaDeviceSynchronize();
      double s = wtime();
      __empty1<<<256,256>>>(false);
      cudaDeviceSynchronize();
      printf("empty1: %.3f\n", 1000*(wtime()-s));
      __empty2<<<256,256>>>(false);
      cudaDeviceSynchronize();
      s = wtime();
      __empty2<<<256,256>>>(false);
      cudaDeviceSynchronize();
      printf("empty2: %.3f\n", 1000*(wtime()-s));
      return 0;
    }
    $ nvcc -arch=sm_35 -o t1353 t1353.cu
    $ ./t1353
    empty1: 0.023
    empty2: 0.015
    $
    
    是什么让您认为运行时间是相同的?printf可能有很多无条件包含和执行的支持代码。除非您已经反汇编并检查了GPU在这两种情况下运行的实际SASS代码,否则您无法做出这样的假设