Optimization 三角优化与简化_Optimization_Cuda_Gpu_Trigonometry

Optimization 三角优化与简化

optimization cuda

Optimization 三角优化与简化,optimization,cuda,gpu,trigonometry,Optimization,Cuda,Gpu,Trigonometry,我截取了以下代表我的应用程序瓶颈的代码： double theta = acos(d); double a = cos( theta*one_third ); double b = cos( theta*one_third + M_PI_23 ); double c = cos( theta*one_third + M_PI_43 ); 其中 one_third=1.0/3.0，M_PI_23=M_PI*2.0/3.0 和 M_PI_43=M_PI*4.0/3.0。这包含在CUDA代码中

我截取了以下代表我的应用程序瓶颈的代码：

double theta = acos(d);
double a = cos( theta*one_third );
double b = cos( theta*one_third + M_PI_23 );
double c = cos( theta*one_third + M_PI_43 );

其中 one_third = 1.0/3.0，M_PI_23 = M_PI*2.0/3.0，M_PI_43 = M_PI*4.0/3.0。这段代码位于 CUDA 代码中，不过在 x86 上也存在同样的问题。

有人知道对上述代码有什么巧妙的简化方法，可以让我避免 acos 调用和/或后续的 cos 调用吗？总的来说，它们占了 90% 的计算时间，单个 acos 调用的开销大约相当于三个 cos 调用。

谢谢。

可以利用三角恒等式：cos(A+B) = cosA·cosB − sinA·sinB

因此，改变这一点：

double a = cos( theta*one_third );

改为：

double as, ac;
sincos(theta*one_third, &as, &ac);

然后您可以计算 a、b 和 c，如下所示：

double a = ac;
double b = ac*cos(M_PI_23) - as*sin(M_PI_23);
double c = ac*cos(M_PI_43) - as*sin(M_PI_43);

当然，如果可能的话，您应该把 cos(M_PI_xx) 和 sin(M_PI_xx) 替换为编译时常量。编译器也许能自行完成这一优化，但也可能不会。

下面是一个工作示例，演示了受此行为支配的代码大约30%的加速（快1.3倍）：

$ cat t874.cu
#include <stdio.h>
#include <stdlib.h>
#include <math.h>
#define one_third (1.0/3.0)
// #define M_PI 3.141592654
#define M_PI_23 (M_PI*2.0/3.0)
#define M_PI_43 (M_PI*4.0/3.0)
#define DSIZE 65536
#define nTPB 256
#define NL 100

// Checks for a pending CUDA runtime error via cudaGetLastError(). If one is
// set, prints `msg`, the CUDA error string, and the file/line of the macro
// expansion to stderr, then terminates the process with exit(1).
// The local is named err_ (not __err): identifiers containing a double
// underscore are reserved for the implementation in C++.
#define cudaCheckErrors(msg) \
    do { \
        cudaError_t err_ = cudaGetLastError(); \
        if (err_ != cudaSuccess) { \
            fprintf(stderr, "Fatal error: %s (%s at %s:%d)\n", \
                (msg), cudaGetErrorString(err_), \
                __FILE__, __LINE__); \
            fprintf(stderr, "*** FAILED - ABORTING\n"); \
            exit(1); \
        } \
    } while (0)

#include <time.h>
#include <sys/time.h>
#define USECPSEC 1000000ULL

// Returns the current gettimeofday() time, converted to microseconds, minus
// `start`. Call with 0 to obtain a timestamp; pass that timestamp back in on
// a later call to obtain the elapsed microseconds between the two calls.
long long dtime_usec(unsigned long long start){

  struct timeval now;
  gettimeofday(&now, 0);
  // seconds scaled to microseconds, plus the sub-second microsecond part
  const unsigned long long now_usec = (now.tv_sec*USECPSEC)+now.tv_usec;
  return now_usec-start;
}

// Benchmark kernel comparing two ways of evaluating
//   cos(theta/3), cos(theta/3 + 2*pi/3), cos(theta/3 + 4*pi/3)
// where theta = acos(d[i]).
//
// Indexing uses only the .x components of threadIdx/blockDim/blockIdx.
// Each thread with index i < dsize reads d[i] and writes d[i + dsize],
// d[i + 2*dsize] and d[i + 3*dsize], so `d` must hold at least 4*dsize doubles.
//
// Without USE_I: three separate cos() calls (smp23/cmp23/smp43/cmp43 unused).
// With USE_I:    one sincos() call, then the angle-addition identity
//                cos(A+B) = cosA*cosB - sinA*sinB using the caller-supplied
//                sin/cos of the two offsets (smp23, cmp23, smp43, cmp43).
__global__ void tk(double *d, const double smp23, const double cmp23, const double smp43, const double cmp43, const int dsize){

  const int gid = blockIdx.x*blockDim.x + threadIdx.x;
  if (gid >= dsize) return;  // threads past the end of the input do nothing

  // Destinations of the three results for this element.
  const int out_a = gid + dsize;
  const int out_b = gid + 2*dsize;
  const int out_c = gid + 3*dsize;

  const double theta = acos(d[gid]);
#ifdef USE_I
  double sin_t, cos_t;
  sincos(theta*one_third, &sin_t, &cos_t);
  d[out_a] = cos_t;
  d[out_b] = cos_t*cmp23 - sin_t*smp23;
  d[out_c] = cos_t*cmp43 - sin_t*smp43;
#else
  d[out_a] = cos( theta*one_third );
  d[out_b] = cos( theta*one_third + M_PI_23 );
  d[out_c] = cos( theta*one_third + M_PI_43 );
#endif
}

// Host driver for the tk benchmark: fills DSIZE doubles with random values in
// [0, 1], copies them to the device, launches tk NL times, and prints the
// average wall-clock time per launch. Every CUDA API call is followed by
// cudaCheckErrors so a failed allocation or copy aborts with a message
// instead of silently producing a meaningless timing. Host and device
// buffers are released before returning.
int main(){

  double *h_d, *d_d;
  // NOTE(review): 7*DSIZE doubles are allocated although tk only touches
  // indices below 4*DSIZE; size kept as in the original benchmark.
  cudaMalloc(&d_d, 7*DSIZE*sizeof(double));
  cudaCheckErrors("cudaMalloc failed");
  h_d = (double *)malloc(7*DSIZE*sizeof(double));
  if (h_d == NULL) {
    fprintf(stderr, "host malloc failed\n");
    cudaFree(d_d);
    return 1;
  }

  // sin/cos of the two phase offsets, computed once on the host and passed
  // to the kernel as arguments.
  double smp23 = sin(M_PI_23);
  double cmp23 = cos(M_PI_23);
  double smp43 = sin(M_PI_43);
  double cmp43 = cos(M_PI_43);
  for (int i = 0; i < DSIZE; i++)
    h_d[i] = rand()/(double)RAND_MAX;
  cudaMemcpy(d_d, h_d, DSIZE*sizeof(double), cudaMemcpyHostToDevice);
  cudaCheckErrors("cudaMemcpy H2D failed");
  unsigned long long gtime = dtime_usec(0);
  for (int i = 0; i < NL; i++)
    tk<<<(DSIZE+nTPB-1)/nTPB,nTPB>>>(d_d, smp23, cmp23, smp43, cmp43, DSIZE);
  // Launches are asynchronous: wait for completion before stopping the clock.
  cudaDeviceSynchronize();
  gtime = dtime_usec(gtime);
  cudaCheckErrors("some error");
  printf("elapsed time: %fs\n", gtime/(float)(USECPSEC*NL));
  cudaFree(d_d);
  free(h_d);
  return 0;
}
$ nvcc -O3 t874.cu -o t874
$ ./t874
elapsed time: 0.000078s
$ nvcc -O3 -DUSE_I t874.cu -o t874
$ ./t874
elapsed time: 0.000060s
$

$cat t874.cu
#包括
#包括
#包括
#定义三分之一（1.0/3.0）
//#定义M#u PI 3.141592654
#定义M_PI_23（M_PI*2.0/3.0）
#定义M_PI_43（M_PI*4.0/3.0）
#定义DSIZE65536
#定义nTPB 256
#定义NL 100
#定义cudaCheckErrors（msg）\
做{\
cudaError\u t\u err=cudaGetLastError（）\
如果（_err！=cudaSuccess）{\
fprintf（标准，“致命错误：%s（%s位于%s:%d）\n”\
msg，cudaGetErrorString（_err）\
__文件（行）\
fprintf（stderr，“***失败-中止\n”）\
出口（1）\
} \
}而（0）
#包括
#包括
#定义USECPSEC 10000000ull
long long dtime\u usec（无符号长启动）{
蒂梅瓦尔电视；
gettimeofday（&tv，0）；
返回（（tv.tv_sec*USECPSEC）+tv.tv_usec）-开始；
}
__全局无效tk（双*d，常数双smp23，常数双cmp23，常数双smp43，常数双cmp43，常数双int dsize）{
int idx=threadIdx.x+blockDim.x*blockIdx.x；
if（idx


Fedora 20，CUDA 7.5RC，Quadro5000 GPU。
可以利用三角恒等式：cos(A+B) = cosA·cosB − sinA·sinB
因此，改变这一点：
double a = cos( theta*one_third );

改为：
double as, ac;
sincos(theta*one_third, &as, &ac);

然后您可以计算 a、b 和 c，如下所示：
double a = ac;
double b = ac*cos(M_PI_23) - as*sin(M_PI_23);
double c = ac*cos(M_PI_43) - as*sin(M_PI_43);

当然，如果可能的话，您应该把 cos(M_PI_xx) 和 sin(M_PI_xx) 替换为编译时常量。编译器也许能自行完成这一优化，但也可能不会。
下面是一个工作示例，演示了受此行为支配的代码大约30%的加速（快1.3倍）：
$ cat t874.cu
#include <stdio.h>
#include <stdlib.h>
#include <math.h>
#define one_third (1.0/3.0)
// #define M_PI 3.141592654
#define M_PI_23 (M_PI*2.0/3.0)
#define M_PI_43 (M_PI*4.0/3.0)
#define DSIZE 65536
#define nTPB 256
#define NL 100

// Checks for a pending CUDA runtime error via cudaGetLastError(). If one is
// set, prints `msg`, the CUDA error string, and the file/line of the macro
// expansion to stderr, then terminates the process with exit(1).
// The local is named err_ (not __err): identifiers containing a double
// underscore are reserved for the implementation in C++.
#define cudaCheckErrors(msg) \
    do { \
        cudaError_t err_ = cudaGetLastError(); \
        if (err_ != cudaSuccess) { \
            fprintf(stderr, "Fatal error: %s (%s at %s:%d)\n", \
                (msg), cudaGetErrorString(err_), \
                __FILE__, __LINE__); \
            fprintf(stderr, "*** FAILED - ABORTING\n"); \
            exit(1); \
        } \
    } while (0)

#include <time.h>
#include <sys/time.h>
#define USECPSEC 1000000ULL

// Returns the current gettimeofday() time, converted to microseconds, minus
// `start`. Call with 0 to obtain a timestamp; pass that timestamp back in on
// a later call to obtain the elapsed microseconds between the two calls.
long long dtime_usec(unsigned long long start){

  struct timeval now;
  gettimeofday(&now, 0);
  // seconds scaled to microseconds, plus the sub-second microsecond part
  const unsigned long long now_usec = (now.tv_sec*USECPSEC)+now.tv_usec;
  return now_usec-start;
}

// Benchmark kernel comparing two ways of evaluating
//   cos(theta/3), cos(theta/3 + 2*pi/3), cos(theta/3 + 4*pi/3)
// where theta = acos(d[i]).
//
// Indexing uses only the .x components of threadIdx/blockDim/blockIdx.
// Each thread with index i < dsize reads d[i] and writes d[i + dsize],
// d[i + 2*dsize] and d[i + 3*dsize], so `d` must hold at least 4*dsize doubles.
//
// Without USE_I: three separate cos() calls (smp23/cmp23/smp43/cmp43 unused).
// With USE_I:    one sincos() call, then the angle-addition identity
//                cos(A+B) = cosA*cosB - sinA*sinB using the caller-supplied
//                sin/cos of the two offsets (smp23, cmp23, smp43, cmp43).
__global__ void tk(double *d, const double smp23, const double cmp23, const double smp43, const double cmp43, const int dsize){

  const int gid = blockIdx.x*blockDim.x + threadIdx.x;
  if (gid >= dsize) return;  // threads past the end of the input do nothing

  // Destinations of the three results for this element.
  const int out_a = gid + dsize;
  const int out_b = gid + 2*dsize;
  const int out_c = gid + 3*dsize;

  const double theta = acos(d[gid]);
#ifdef USE_I
  double sin_t, cos_t;
  sincos(theta*one_third, &sin_t, &cos_t);
  d[out_a] = cos_t;
  d[out_b] = cos_t*cmp23 - sin_t*smp23;
  d[out_c] = cos_t*cmp43 - sin_t*smp43;
#else
  d[out_a] = cos( theta*one_third );
  d[out_b] = cos( theta*one_third + M_PI_23 );
  d[out_c] = cos( theta*one_third + M_PI_43 );
#endif
}

// Host driver for the tk benchmark: fills DSIZE doubles with random values in
// [0, 1], copies them to the device, launches tk NL times, and prints the
// average wall-clock time per launch. Every CUDA API call is followed by
// cudaCheckErrors so a failed allocation or copy aborts with a message
// instead of silently producing a meaningless timing. Host and device
// buffers are released before returning.
int main(){

  double *h_d, *d_d;
  // NOTE(review): 7*DSIZE doubles are allocated although tk only touches
  // indices below 4*DSIZE; size kept as in the original benchmark.
  cudaMalloc(&d_d, 7*DSIZE*sizeof(double));
  cudaCheckErrors("cudaMalloc failed");
  h_d = (double *)malloc(7*DSIZE*sizeof(double));
  if (h_d == NULL) {
    fprintf(stderr, "host malloc failed\n");
    cudaFree(d_d);
    return 1;
  }

  // sin/cos of the two phase offsets, computed once on the host and passed
  // to the kernel as arguments.
  double smp23 = sin(M_PI_23);
  double cmp23 = cos(M_PI_23);
  double smp43 = sin(M_PI_43);
  double cmp43 = cos(M_PI_43);
  for (int i = 0; i < DSIZE; i++)
    h_d[i] = rand()/(double)RAND_MAX;
  cudaMemcpy(d_d, h_d, DSIZE*sizeof(double), cudaMemcpyHostToDevice);
  cudaCheckErrors("cudaMemcpy H2D failed");
  unsigned long long gtime = dtime_usec(0);
  for (int i = 0; i < NL; i++)
    tk<<<(DSIZE+nTPB-1)/nTPB,nTPB>>>(d_d, smp23, cmp23, smp43, cmp43, DSIZE);
  // Launches are asynchronous: wait for completion before stopping the clock.
  cudaDeviceSynchronize();
  gtime = dtime_usec(gtime);
  cudaCheckErrors("some error");
  printf("elapsed time: %fs\n", gtime/(float)(USECPSEC*NL));
  cudaFree(d_d);
  free(h_d);
  return 0;
}
$ nvcc -O3 t874.cu -o t874
$ ./t874
elapsed time: 0.000078s
$ nvcc -O3 -DUSE_I t874.cu -o t874
$ ./t874
elapsed time: 0.000060s
$

$cat t874.cu
#包括
#包括
#包括
#定义三分之一（1.0/3.0）
//#定义M#u PI 3.141592654
#定义M_PI_23（M_PI*2.0/3.0）
#定义M_PI_43（M_PI*4.0/3.0）
#定义DSIZE65536
#定义nTPB 256
#定义NL 100
#定义cudaCheckErrors（msg）\
做{\
cudaError\u t\u err=cudaGetLastError（）\
如果（_err！=cudaSuccess）{\
fprintf（标准，“致命错误：%s（%s位于%s:%d）\n”\
msg，cudaGetErrorString（_err）\
__文件（行）\
fprintf（stderr，“***失败-中止\n”）\
出口（1）\
} \
}而（0）
#包括
#包括
#定义USECPSEC 10000000ull
long long dtime\u usec（无符号长启动）{
蒂梅瓦尔电视；
gettimeofday（&tv，0）；
返回（（tv.tv_sec*USECPSEC）+tv.tv_usec）-开始；
}
__全局无效tk（双*d，常数双smp23，常数双cmp23，常数双smp43，常数双cmp43，常数双int dsize）{
int idx=threadIdx.x+blockDim.x*blockIdx.x；
if（idx