CUDA 无法达到最佳性能_Cuda_Benchmarking

CUDA 无法达到最佳性能

cuda

CUDA 无法达到最佳性能,cuda,benchmarking,Cuda,Benchmarking,我正试图通过下面的代码达到每个 SM 的峰值性能。峰值约为 25 GFLOPS（GTX 275，GT200 架构）。这段代码最多只能达到 8 GFLOPS __global__ void new_ker(float *x) { int index = threadIdx.x+blockIdx.x*blockDim.x; float a,b; a=0; b=x[index]; //LOOP=10000000 //No. of blocks = 1 //Threads per block

我正试图通过下面的代码达到每个 SM 的峰值性能。峰值约为 25 GFLOPS（GTX 275，GT200 架构）。这段代码最多只能达到 8 GFLOPS。

/*
 * Arithmetic micro-benchmark kernel.
 *
 * Each thread reads one float from x at its flat global index, then applies
 * acc = acc * operand + operand LOOP times starting from acc = 0, and writes
 * the final value back to the same slot. Every iteration depends on the
 * previous one, so a thread carries a single dependent chain of one multiply
 * and one add per iteration.
 *
 * There is no bounds check: x must hold at least gridDim.x * blockDim.x
 * floats. LOOP is expected to be defined by the surrounding file.
 */
__global__ void new_ker(float *x)
{
  // Configuration reported for this measurement:
  //   LOOP = 10000000
  //   No. of blocks = 1
  //   Threads per block = 512 (I'm using GTX 275 - GT200 Arch.)
  const int tid = blockIdx.x * blockDim.x + threadIdx.x;
  const float operand = x[tid];
  float acc = 0;

  // Unroll request keeps loop-control overhead small relative to the math.
  #pragma unroll 2048
  for (int iter = 0; iter < LOOP; ++iter)
    acc = acc * operand + operand;

  // Storing the result keeps the compiler from discarding the loop.
  x[tid] = acc;
}

__global__ void new_ker(float *x)
{
  int index = threadIdx.x+blockIdx.x*blockDim.x;
  float a,b;
  a=0;
  b=x[index];
  //LOOP=10000000
  //块数 = 1
  //每个块的线程数 = 512（我使用的是 GTX 275，GT200 架构）
  #pragma unroll 2048
  for(int i=0;i<LOOP;i++){
       a=a*b+b;
  }

  x[index] = a;
}

如果我根据你的代码整理出一个完整的复现案例，并使用正确的 FLOP 计算方式：
#include <stdio.h> 

// Trip count of the multiply-add loop each thread runs inside new_ker.
#define LOOP (10000000)
// Number of thread blocks used in the new_ker launch.
#define BLOCKS (30)
// Number of threads per block used in the new_ker launch.
#define THPB (512)

/*
 * Arithmetic micro-benchmark kernel.
 *
 * Each thread reads one float from x at its flat global index, then applies
 * acc = acc * operand + operand LOOP times starting from acc = 0, and writes
 * the final value back to the same slot. Every iteration depends on the
 * previous one, so a thread carries a single dependent chain of one multiply
 * and one add per iteration.
 *
 * There is no bounds check: x must hold at least gridDim.x * blockDim.x
 * floats.
 */
__global__ void new_ker(float *x)
{
  const int tid = blockIdx.x * blockDim.x + threadIdx.x;
  const float operand = x[tid];
  float acc = 0;

  // Unroll request keeps loop-control overhead small relative to the math.
  #pragma unroll 2048
  for (int iter = 0; iter < LOOP; ++iter)
    acc = acc * operand + operand;

  // Storing the result keeps the compiler from discarding the loop.
  x[tid] = acc;
}

// Evaluates a CUDA runtime call; on failure prints a diagnostic and makes the
// enclosing function return 1. Only usable inside a function returning int.
#define PERF_CUDA_CHECK(call) \
   do { \
      cudaError_t perf_status_ = (call); \
      if (perf_status_ != cudaSuccess) { \
         fprintf(stderr, "CUDA error at line %d: %s\n", __LINE__, \
                 cudaGetErrorString(perf_status_)); \
         return 1; \
      } \
   } while (0)

/*
 * Times a single launch of new_ker with CUDA events and prints the achieved
 * GFLOP/s, counting 2 floating-point operations (one multiply, one add) per
 * loop iteration per thread.
 *
 * Returns 0 on success and 1 if any CUDA call or the host allocation fails.
 */
int main(int argc,char **argv)
{
   (void)argc;
   (void)argv;

   // One float per launched thread. The device buffer must cover the whole
   // grid (BLOCKS * THPB threads), not just a single block, because every
   // thread reads and writes its own element and the copy-back below
   // transfers the full grid's worth of data.
   const size_t count = (size_t)THPB * (size_t)BLOCKS;
   const size_t bytes = sizeof(float) * count;

   //Initializations
   float *x = 0;
   float *dx = 0;
   cudaEvent_t new_start,new_stop;
   float elapsed = 0.0f;
   double gflops;

   PERF_CUDA_CHECK(cudaMalloc((void **)&dx,bytes));
   // Give the kernel defined input instead of uninitialized device memory.
   PERF_CUDA_CHECK(cudaMemset(dx,0,bytes));

   //ILP=1
   PERF_CUDA_CHECK(cudaEventCreate(&new_start));
   PERF_CUDA_CHECK(cudaEventCreate(&new_stop));
   printf("Kernel1:\n");
   PERF_CUDA_CHECK(cudaEventRecord(new_start, 0));
   new_ker<<<BLOCKS,THPB>>>(dx);
   // Launch-configuration errors are only reported through cudaGetLastError.
   PERF_CUDA_CHECK(cudaGetLastError());
   PERF_CUDA_CHECK(cudaEventRecord(new_stop,0));
   // Blocks until the kernel has finished; execution errors surface here.
   PERF_CUDA_CHECK(cudaEventSynchronize(new_stop));
   PERF_CUDA_CHECK(cudaEventElapsedTime(&elapsed,new_start,new_stop));

   x = (float *)malloc(bytes);
   if (x == 0) {
      fprintf(stderr, "host allocation of %lu bytes failed\n",
              (unsigned long)bytes);
      return 1;
   }
   PERF_CUDA_CHECK(cudaMemcpy(x,dx,bytes,cudaMemcpyDeviceToHost));

   // elapsed is in milliseconds:
   //   2 flop * LOOP * threads / (elapsed * 1e-3 s) / 1e9 = 2e-6 * ... / elapsed
   gflops = 2.0e-6 * ((double)(LOOP)*(double)count/(double)elapsed);
   printf("\t%f\n",gflops);

   PERF_CUDA_CHECK(cudaEventDestroy(new_start));
   PERF_CUDA_CHECK(cudaEventDestroy(new_stop));
   free(x);
   PERF_CUDA_CHECK(cudaFree(dx));
   return 0;
}

对于运行纯 FMAD 代码的这块显卡，峰值为（1.4 GHz × 2 FLOP × 8 核/MP × 30 MP）= 672 GFLOP/s，我得到的结果与该峰值 FLOP/s 的偏差在 0.01% 以内。
因此，实际上这段代码在每个多处理器一个块的配置下似乎确实达到了峰值 FLOP/s，只是你没有正确计算 FLOP/s 数值。如果每个 SM 只使用一个块，可能不足以掩盖架构中的所有延迟。请尝试按代码允许的每个 MP 最大块数来启动，看看会发生什么。事实上，我正在重复 Vasily Volkov 的实验，他只用一个块就达到了峰值。但是他使用了大量寄存器和大量 ILP，而你的代码没有任何 ILP。是的，他是借助 ILP 达到峰值的。但在他的第一个实验中，他只尝试增加 TLP，当 G80 上的线程数达到 192 时就达到了峰值（这就是延迟隐藏）。参考：《Better Performance at Lower Occupancy》（幻灯片）。那么，另一种可能是你的测量不正确。你能否编辑问题，说明你如何启动内核、如何为代码计时、如何计算 FLOP 数，以及在什么平台上运行？
#include <stdio.h> 

// Trip count of the multiply-add loop each thread runs inside new_ker.
#define LOOP (10000000)
// Number of thread blocks used in the new_ker launch.
#define BLOCKS (30)
// Number of threads per block used in the new_ker launch.
#define THPB (512)

/*
 * Arithmetic micro-benchmark kernel.
 *
 * Each thread reads one float from x at its flat global index, then applies
 * acc = acc * operand + operand LOOP times starting from acc = 0, and writes
 * the final value back to the same slot. Every iteration depends on the
 * previous one, so a thread carries a single dependent chain of one multiply
 * and one add per iteration.
 *
 * There is no bounds check: x must hold at least gridDim.x * blockDim.x
 * floats.
 */
__global__ void new_ker(float *x)
{
  const int tid = blockIdx.x * blockDim.x + threadIdx.x;
  const float operand = x[tid];
  float acc = 0;

  // Unroll request keeps loop-control overhead small relative to the math.
  #pragma unroll 2048
  for (int iter = 0; iter < LOOP; ++iter)
    acc = acc * operand + operand;

  // Storing the result keeps the compiler from discarding the loop.
  x[tid] = acc;
}

// Evaluates a CUDA runtime call; on failure prints a diagnostic and makes the
// enclosing function return 1. Only usable inside a function returning int.
#define PERF_CUDA_CHECK(call) \
   do { \
      cudaError_t perf_status_ = (call); \
      if (perf_status_ != cudaSuccess) { \
         fprintf(stderr, "CUDA error at line %d: %s\n", __LINE__, \
                 cudaGetErrorString(perf_status_)); \
         return 1; \
      } \
   } while (0)

/*
 * Times a single launch of new_ker with CUDA events and prints the achieved
 * GFLOP/s, counting 2 floating-point operations (one multiply, one add) per
 * loop iteration per thread.
 *
 * Returns 0 on success and 1 if any CUDA call or the host allocation fails.
 */
int main(int argc,char **argv)
{
   (void)argc;
   (void)argv;

   // One float per launched thread. The device buffer must cover the whole
   // grid (BLOCKS * THPB threads), not just a single block, because every
   // thread reads and writes its own element and the copy-back below
   // transfers the full grid's worth of data.
   const size_t count = (size_t)THPB * (size_t)BLOCKS;
   const size_t bytes = sizeof(float) * count;

   //Initializations
   float *x = 0;
   float *dx = 0;
   cudaEvent_t new_start,new_stop;
   float elapsed = 0.0f;
   double gflops;

   PERF_CUDA_CHECK(cudaMalloc((void **)&dx,bytes));
   // Give the kernel defined input instead of uninitialized device memory.
   PERF_CUDA_CHECK(cudaMemset(dx,0,bytes));

   //ILP=1
   PERF_CUDA_CHECK(cudaEventCreate(&new_start));
   PERF_CUDA_CHECK(cudaEventCreate(&new_stop));
   printf("Kernel1:\n");
   PERF_CUDA_CHECK(cudaEventRecord(new_start, 0));
   new_ker<<<BLOCKS,THPB>>>(dx);
   // Launch-configuration errors are only reported through cudaGetLastError.
   PERF_CUDA_CHECK(cudaGetLastError());
   PERF_CUDA_CHECK(cudaEventRecord(new_stop,0));
   // Blocks until the kernel has finished; execution errors surface here.
   PERF_CUDA_CHECK(cudaEventSynchronize(new_stop));
   PERF_CUDA_CHECK(cudaEventElapsedTime(&elapsed,new_start,new_stop));

   x = (float *)malloc(bytes);
   if (x == 0) {
      fprintf(stderr, "host allocation of %lu bytes failed\n",
              (unsigned long)bytes);
      return 1;
   }
   PERF_CUDA_CHECK(cudaMemcpy(x,dx,bytes,cudaMemcpyDeviceToHost));

   // elapsed is in milliseconds:
   //   2 flop * LOOP * threads / (elapsed * 1e-3 s) / 1e9 = 2e-6 * ... / elapsed
   gflops = 2.0e-6 * ((double)(LOOP)*(double)count/(double)elapsed);
   printf("\t%f\n",gflops);

   PERF_CUDA_CHECK(cudaEventDestroy(new_start));
   PERF_CUDA_CHECK(cudaEventDestroy(new_stop));
   free(x);
   PERF_CUDA_CHECK(cudaFree(dx));
   return 0;
}

$ nvcc -arch=sm_13 -Xptxas="-v" -o perf perf.cu
ptxas info    : Compiling entry function '_Z7new_kerPf' for 'sm_13'
ptxas info    : Used 4 registers, 8+16 bytes smem, 8 bytes cmem[1]
$ ./perf 
Kernel1:
        671.806039