Optimization 三角优化与简化

Optimization 三角优化与简化,optimization,cuda,gpu,trigonometry,Optimization,Cuda,Gpu,Trigonometry,我截取了以下代表我的应用程序瓶颈的代码: double theta = acos(d); double a = cos( theta*one_third ); double b = cos( theta*one_third + M_PI_23 ); double c = cos( theta*one_third + M_PI_43 ); 其中 one_third = 1.0/3.0,M_PI_23 = M_PI*2.0/3.0,M_PI_43 = M_PI*4.0/3.0。这包含在CUDA代码中

我截取了以下代表我的应用程序瓶颈的代码:

double theta = acos(d);
double a = cos( theta*one_third );
double b = cos( theta*one_third + M_PI_23 );
double c = cos( theta*one_third + M_PI_43 );
其中
one_third = 1.0/3.0
M_PI_23 = M_PI*2.0/3.0
M_PI_43 = M_PI*4.0/3.0
。这包含在CUDA代码中,尽管x86中也存在同样的问题

有人知道对上述内容的任何智能简化,以便我可以避免
acos
调用和/或后续
cos
调用吗?总的来说,它们占了90%的计算时间,单个
acos
调用的开销大约相当于三个
cos
调用

Thx

可以利用三角恒等式:cos(A+B) = cos A·cos B − sin A·sin B

因此,改变这一点:

double a = cos( theta*one_third );
改为:

double as, ac;
sincos(theta*one_third, &as, &ac);

然后您可以计算
a
b
c
,如下所示:

double a = ac;
double b = ac*cos(M_PI_23) - as*sin(M_PI_23);
double c = ac*cos(M_PI_43) - as*sin(M_PI_43);
当然,如果可能的话,您应该用编译时常量替换
cos(M_PI_xx)
sin(M_PI_xx)
。编译器可能会找到答案,但可能不会

下面是一个工作示例,演示了受此行为支配的代码大约30%的加速(快1.3倍):

$ cat t874.cu
#include <stdio.h>
#include <stdlib.h>
#include <math.h>
#define one_third (1.0/3.0)
// #define M_PI 3.141592654
#define M_PI_23 (M_PI*2.0/3.0)
#define M_PI_43 (M_PI*4.0/3.0)
#define DSIZE 65536
#define nTPB 256
#define NL 100

// Error-check helper: reads the CUDA runtime's last recorded error with
// cudaGetLastError(). If it is not cudaSuccess, prints msg, the runtime's
// error string and the current file/line to stderr, then terminates the
// process with exit(1). The do { } while (0) wrapper makes the macro expand
// to a single statement.
#define cudaCheckErrors(msg) \
    do { \
        cudaError_t __err = cudaGetLastError(); \
        if (__err != cudaSuccess) { \
            fprintf(stderr, "Fatal error: %s (%s at %s:%d)\n", \
                msg, cudaGetErrorString(__err), \
                __FILE__, __LINE__); \
            fprintf(stderr, "*** FAILED - ABORTING\n"); \
            exit(1); \
        } \
    } while (0)

#include <time.h>
#include <sys/time.h>
#define USECPSEC 1000000ULL

// Returns the current wall-clock time from gettimeofday(), expressed in
// microseconds, minus `start`. Passing 0 yields an absolute timestamp;
// passing a previous timestamp yields the elapsed microseconds since it.
long long dtime_usec(unsigned long long start){
  struct timeval now;
  gettimeofday(&now, 0);
  // Seconds scaled to microseconds plus the sub-second part; the arithmetic
  // is carried out in unsigned long long because USECPSEC is an ULL constant.
  const unsigned long long stamp_usec = (now.tv_sec*USECPSEC) + now.tv_usec;
  return stamp_usec - start;
}

// Benchmark kernel. Expects a 1D launch with at least dsize threads in total;
// each thread with global index i < dsize reads d[i], takes theta = acos(d[i]),
// and writes
//   d[i +   dsize] = cos(theta/3)
//   d[i + 2*dsize] = cos(theta/3 + 2*pi/3)
//   d[i + 3*dsize] = cos(theta/3 + 4*pi/3)
// so d must hold at least 4*dsize doubles.
// Without USE_I the three values come from three cos() calls. With USE_I a
// single sincos() is used and the shifted values are formed with
// cos(A+B) = cosA*cosB - sinA*sinB, where smp23/cmp23 and smp43/cmp43 are the
// caller-supplied sine/cosine of the two phase offsets (unused otherwise).
__global__ void tk(double *d, const double smp23, const double cmp23, const double smp43, const double cmp43, const int dsize){

  const int gid = blockDim.x*blockIdx.x + threadIdx.x;
  // Threads past the end of the input have nothing to do.
  if (gid >= dsize) return;

  const double theta = acos(d[gid]);
  // Destination slots in the three output planes that follow the input.
  const int slot_a = gid + dsize;
  const int slot_b = gid + 2*dsize;
  const int slot_c = gid + 3*dsize;
#ifdef USE_I
  // One sincos() call replaces the three cos() calls.
  double sin_t, cos_t;
  sincos(theta*one_third, &sin_t, &cos_t);
  d[slot_a] = cos_t;
  d[slot_b] = cos_t*cmp23 - sin_t*smp23;
  d[slot_c] = cos_t*cmp43 - sin_t*smp43;
#else
  d[slot_a] = cos( theta*one_third );
  d[slot_b] = cos( theta*one_third + M_PI_23 );
  d[slot_c] = cos( theta*one_third + M_PI_43 );
#endif
}

// Benchmark driver: fills DSIZE doubles with random values in [0, 1], copies
// them to the device, launches tk NL times, and prints the average wall-clock
// time per launch in seconds. Returns 0 on success; CUDA failures abort via
// cudaCheckErrors, and a failed host allocation returns 1.
int main(){

  // Buffer size kept as in the original benchmark (7*DSIZE doubles); the
  // kernel itself only touches the first 4*DSIZE elements.
  const size_t buf_bytes = 7*DSIZE*sizeof(double);
  double *h_d, *d_d;
  cudaMalloc(&d_d, buf_bytes);
  cudaCheckErrors("cudaMalloc failed");
  h_d = (double *)malloc(buf_bytes);
  if (h_d == NULL) {
    fprintf(stderr, "Fatal error: host malloc failed\n");
    cudaFree(d_d);
    return 1;
  }

  // Phase-offset sines/cosines are computed once on the host and passed to
  // the kernel by value.
  double smp23 = sin(M_PI_23);
  double cmp23 = cos(M_PI_23);
  double smp43 = sin(M_PI_43);
  double cmp43 = cos(M_PI_43);
  for (int i = 0; i < DSIZE; i++)
    h_d[i] = rand()/(double)RAND_MAX;
  cudaMemcpy(d_d, h_d, DSIZE*sizeof(double), cudaMemcpyHostToDevice);
  cudaCheckErrors("cudaMemcpy host-to-device failed");
  unsigned long long gtime = dtime_usec(0);
  for (int i = 0; i < NL; i++)
    tk<<<(DSIZE+nTPB-1)/nTPB,nTPB>>>(d_d, smp23, cmp23, smp43, cmp43, DSIZE);
  // Launches are asynchronous: wait for all of them before stopping the clock.
  cudaDeviceSynchronize();
  gtime = dtime_usec(gtime);
  cudaCheckErrors("some error");
  printf("elapsed time: %fs\n", gtime/(float)(USECPSEC*NL));
  // Release both buffers instead of relying on process teardown.
  free(h_d);
  cudaFree(d_d);
  return 0;
}
$ nvcc -O3 t874.cu -o t874
$ ./t874
elapsed time: 0.000078s
$ nvcc -O3 -DUSE_I t874.cu -o t874
$ ./t874
elapsed time: 0.000060s
$
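$ cat t874.cu
#include <stdio.h>
#include <stdlib.h>
#include <math.h>
#define one_third (1.0/3.0)
// #define M_PI 3.141592654
#define M_PI_23 (M_PI*2.0/3.0)
#define M_PI_43 (M_PI*4.0/3.0)
#define DSIZE 65536
#define nTPB 256
#define NL 100
#define cudaCheckErrors(msg) \
    do { \
        cudaError_t __err = cudaGetLastError(); \
        if (__err != cudaSuccess) { \
            fprintf(stderr, "Fatal error: %s (%s at %s:%d)\n", \
                msg, cudaGetErrorString(__err), \
                __FILE__, __LINE__); \
            fprintf(stderr, "*** FAILED - ABORTING\n"); \
            exit(1); \
        } \
    } while (0)
#include <time.h>
#include <sys/time.h>
#define USECPSEC 1000000ULL
long long dtime_usec(unsigned long long start){
  timeval tv;
  gettimeofday(&tv, 0);
  return ((tv.tv_sec*USECPSEC)+tv.tv_usec)-start;
}
__global__ void tk(double *d, const double smp23, const double cmp23, const double smp43, const double cmp43, const int dsize){
  int idx = threadIdx.x+blockDim.x*blockIdx.x;
  if (idx < dsize){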
Fedora 20,CUDA 7.5RC,Quadro5000 GPU。

可以利用三角恒等式:cos(A+B) = cos A·cos B − sin A·sin B

因此,改变这一点:

double a = cos( theta*one_third );
改为:

double as, ac;
sincos(theta*one_third, &as, &ac);

然后您可以计算
a
b
c
,如下所示:

double a = ac;
double b = ac*cos(M_PI_23) - as*sin(M_PI_23);
double c = ac*cos(M_PI_43) - as*sin(M_PI_43);
当然,如果可能的话,您应该用编译时常量替换
cos(M_PI_xx)
sin(M_PI_xx)
。编译器可能会找到答案,但可能不会

下面是一个工作示例,演示了受此行为支配的代码大约30%的加速(快1.3倍):

$ cat t874.cu
#include <stdio.h>
#include <stdlib.h>
#include <math.h>
#define one_third (1.0/3.0)
// #define M_PI 3.141592654
#define M_PI_23 (M_PI*2.0/3.0)
#define M_PI_43 (M_PI*4.0/3.0)
#define DSIZE 65536
#define nTPB 256
#define NL 100

// Error-check helper: reads the CUDA runtime's last recorded error with
// cudaGetLastError(). If it is not cudaSuccess, prints msg, the runtime's
// error string and the current file/line to stderr, then terminates the
// process with exit(1). The do { } while (0) wrapper makes the macro expand
// to a single statement.
#define cudaCheckErrors(msg) \
    do { \
        cudaError_t __err = cudaGetLastError(); \
        if (__err != cudaSuccess) { \
            fprintf(stderr, "Fatal error: %s (%s at %s:%d)\n", \
                msg, cudaGetErrorString(__err), \
                __FILE__, __LINE__); \
            fprintf(stderr, "*** FAILED - ABORTING\n"); \
            exit(1); \
        } \
    } while (0)

#include <time.h>
#include <sys/time.h>
#define USECPSEC 1000000ULL

// Returns the current wall-clock time from gettimeofday(), expressed in
// microseconds, minus `start`. Passing 0 yields an absolute timestamp;
// passing a previous timestamp yields the elapsed microseconds since it.
long long dtime_usec(unsigned long long start){
  struct timeval now;
  gettimeofday(&now, 0);
  // Seconds scaled to microseconds plus the sub-second part; the arithmetic
  // is carried out in unsigned long long because USECPSEC is an ULL constant.
  const unsigned long long stamp_usec = (now.tv_sec*USECPSEC) + now.tv_usec;
  return stamp_usec - start;
}

// Benchmark kernel. Expects a 1D launch with at least dsize threads in total;
// each thread with global index i < dsize reads d[i], takes theta = acos(d[i]),
// and writes
//   d[i +   dsize] = cos(theta/3)
//   d[i + 2*dsize] = cos(theta/3 + 2*pi/3)
//   d[i + 3*dsize] = cos(theta/3 + 4*pi/3)
// so d must hold at least 4*dsize doubles.
// Without USE_I the three values come from three cos() calls. With USE_I a
// single sincos() is used and the shifted values are formed with
// cos(A+B) = cosA*cosB - sinA*sinB, where smp23/cmp23 and smp43/cmp43 are the
// caller-supplied sine/cosine of the two phase offsets (unused otherwise).
__global__ void tk(double *d, const double smp23, const double cmp23, const double smp43, const double cmp43, const int dsize){

  const int gid = blockDim.x*blockIdx.x + threadIdx.x;
  // Threads past the end of the input have nothing to do.
  if (gid >= dsize) return;

  const double theta = acos(d[gid]);
  // Destination slots in the three output planes that follow the input.
  const int slot_a = gid + dsize;
  const int slot_b = gid + 2*dsize;
  const int slot_c = gid + 3*dsize;
#ifdef USE_I
  // One sincos() call replaces the three cos() calls.
  double sin_t, cos_t;
  sincos(theta*one_third, &sin_t, &cos_t);
  d[slot_a] = cos_t;
  d[slot_b] = cos_t*cmp23 - sin_t*smp23;
  d[slot_c] = cos_t*cmp43 - sin_t*smp43;
#else
  d[slot_a] = cos( theta*one_third );
  d[slot_b] = cos( theta*one_third + M_PI_23 );
  d[slot_c] = cos( theta*one_third + M_PI_43 );
#endif
}

// Benchmark driver: fills DSIZE doubles with random values in [0, 1], copies
// them to the device, launches tk NL times, and prints the average wall-clock
// time per launch in seconds. Returns 0 on success; CUDA failures abort via
// cudaCheckErrors, and a failed host allocation returns 1.
int main(){

  // Buffer size kept as in the original benchmark (7*DSIZE doubles); the
  // kernel itself only touches the first 4*DSIZE elements.
  const size_t buf_bytes = 7*DSIZE*sizeof(double);
  double *h_d, *d_d;
  cudaMalloc(&d_d, buf_bytes);
  cudaCheckErrors("cudaMalloc failed");
  h_d = (double *)malloc(buf_bytes);
  if (h_d == NULL) {
    fprintf(stderr, "Fatal error: host malloc failed\n");
    cudaFree(d_d);
    return 1;
  }

  // Phase-offset sines/cosines are computed once on the host and passed to
  // the kernel by value.
  double smp23 = sin(M_PI_23);
  double cmp23 = cos(M_PI_23);
  double smp43 = sin(M_PI_43);
  double cmp43 = cos(M_PI_43);
  for (int i = 0; i < DSIZE; i++)
    h_d[i] = rand()/(double)RAND_MAX;
  cudaMemcpy(d_d, h_d, DSIZE*sizeof(double), cudaMemcpyHostToDevice);
  cudaCheckErrors("cudaMemcpy host-to-device failed");
  unsigned long long gtime = dtime_usec(0);
  for (int i = 0; i < NL; i++)
    tk<<<(DSIZE+nTPB-1)/nTPB,nTPB>>>(d_d, smp23, cmp23, smp43, cmp43, DSIZE);
  // Launches are asynchronous: wait for all of them before stopping the clock.
  cudaDeviceSynchronize();
  gtime = dtime_usec(gtime);
  cudaCheckErrors("some error");
  printf("elapsed time: %fs\n", gtime/(float)(USECPSEC*NL));
  // Release both buffers instead of relying on process teardown.
  free(h_d);
  cudaFree(d_d);
  return 0;
}
$ nvcc -O3 t874.cu -o t874
$ ./t874
elapsed time: 0.000078s
$ nvcc -O3 -DUSE_I t874.cu -o t874
$ ./t874
elapsed time: 0.000060s
$
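$ cat t874.cu
#include <stdio.h>
#include <stdlib.h>
#include <math.h>
#define one_third (1.0/3.0)
// #define M_PI 3.141592654
#define M_PI_23 (M_PI*2.0/3.0)
#define M_PI_43 (M_PI*4.0/3.0)
#define DSIZE 65536
#define nTPB 256
#define NL 100
#define cudaCheckErrors(msg) \
    do { \
        cudaError_t __err = cudaGetLastError(); \
        if (__err != cudaSuccess) { \
            fprintf(stderr, "Fatal error: %s (%s at %s:%d)\n", \
                msg, cudaGetErrorString(__err), \
                __FILE__, __LINE__); \
            fprintf(stderr, "*** FAILED - ABORTING\n"); \
            exit(1); \
        } \
    } while (0)
#include <time.h>
#include <sys/time.h>
#define USECPSEC 1000000ULL
long long dtime_usec(unsigned long long start){
  timeval tv;
  gettimeofday(&tv, 0);
  return ((tv.tv_sec*USECPSEC)+tv.tv_usec)-start;
}
__global__ void tk(double *d, const double smp23, const double cmp23, const double smp43, const double cmp43, const int dsize){
  int idx = threadIdx.x+blockDim.x*blockIdx.x;
  if (idx < dsize){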