Optimization 三角优化与简化
我截取了以下代表我的应用程序瓶颈的代码:Optimization 三角优化与简化,optimization,cuda,gpu,trigonometry,Optimization,Cuda,Gpu,Trigonometry,我截取了以下代表我的应用程序瓶颈的代码: double theta = acos(d); double a = cos( theta*one_third ); double b = cos( theta*one_third + M_PI_23 ); double c = cos( theta*one_third + M_PI_43 ); 其中 one_third = 1.0/3.0,M_PI_23 = M_PI*2.0/3.0,M_PI_43 = M_PI*4.0/3.0。这包含在CUDA代码中
double theta = acos(d);
double a = cos( theta*one_third );
double b = cos( theta*one_third + M_PI_23 );
double c = cos( theta*one_third + M_PI_43 );
其中 one_third = 1.0/3.0,M_PI_23 = M_PI*2.0/3.0,M_PI_43 = M_PI*4.0/3.0。这段代码位于CUDA代码中,不过在x86上也存在同样的问题。
有人知道对上述内容的任何智能简化,以便我可以避免acos
调用和/或后续cos
调用吗?总的来说,它们占了90%的计算时间,单个acos
调用的开销大约相当于三个cos
调用
谢谢。
可以利用余弦和角公式:cos(A+B) = cos(A)cos(B) - sin(A)sin(B)
因此,改变这一点:
double a = cos( theta*one_third );
改为:
double as, ac;
sincos(theta*one_third, &as, &ac);
然后您可以计算a
,b
和c
,如下所示:
double a = ac;
double b = ac*cos(M_PI_23) - as*sin(M_PI_23);
double c = ac*cos(M_PI_43) - as*sin(M_PI_43);
当然,如果可能的话,您应该用编译时常量替换cos(M_PI_xx)
和sin(M_PI_xx)
。编译器可能会找到答案,但可能不会
下面是一个工作示例,演示了受此行为支配的代码大约30%的加速(快1.3倍):
$ cat t874.cu
#include <stdio.h>
#include <stdlib.h>
#include <math.h>
#define one_third (1.0/3.0)
// #define M_PI 3.141592654
#define M_PI_23 (M_PI*2.0/3.0)
#define M_PI_43 (M_PI*4.0/3.0)
#define DSIZE 65536
#define nTPB 256
#define NL 100
// Queries cudaGetLastError(); if it reports anything other than cudaSuccess,
// prints `msg`, the CUDA error string and the file/line of the macro use to
// stderr, then terminates the process with exit status 1.
// `msg` is a caller-supplied C string printed ahead of the CUDA error text.
#define cudaCheckErrors(msg) \
    do { \
        const cudaError_t status_ = cudaGetLastError(); \
        if (cudaSuccess != status_) { \
            fprintf(stderr, "Fatal error: %s (%s at %s:%d)\n", \
                    msg, cudaGetErrorString(status_), \
                    __FILE__, __LINE__); \
            fprintf(stderr, "*** FAILED - ABORTING\n"); \
            exit(1); \
        } \
    } while (0)
#include <time.h>
#include <sys/time.h>
#define USECPSEC 1000000ULL

// Returns the current gettimeofday() time expressed in microseconds, minus
// `start`. Call with 0 to obtain a timestamp; call again with that timestamp
// to obtain the number of microseconds elapsed since it was taken.
long long dtime_usec(unsigned long long start){
    struct timeval now;
    gettimeofday(&now, 0);
    // Seconds scaled to microseconds plus the sub-second microsecond part;
    // the arithmetic is unsigned because USECPSEC is an unsigned constant.
    const unsigned long long stamp = (now.tv_sec * USECPSEC) + now.tv_usec;
    return stamp - start;
}
// CUDA kernel, one thread per input element.
// For each i < dsize it reads d[i], computes theta = acos(d[i]) and writes
//   d[i +   dsize] = cos(theta/3)
//   d[i + 2*dsize] = cos(theta/3 + M_PI_23)
//   d[i + 3*dsize] = cos(theta/3 + M_PI_43)
// so d is indexed up to 4*dsize - 1.
// Built without USE_I: three direct cos() calls.
// Built with USE_I: one sincos() call, then cos(A+B) = cosA*cosB - sinA*sinB
// using smp23/cmp23 and smp43/cmp43, which the host code in this file fills
// with sin/cos of M_PI_23 and M_PI_43. Those four arguments are unused
// when USE_I is not defined.
__global__ void tk(double *d, const double smp23, const double cmp23, const double smp43, const double cmp43, const int dsize){
    const int tid = blockDim.x*blockIdx.x + threadIdx.x;
    // Threads beyond the last element have nothing to do.
    if (tid >= dsize) return;

    const double theta = acos(d[tid]);
#ifdef USE_I
    // One combined sine/cosine evaluation replaces three cosine evaluations.
    double sin3, cos3;
    sincos(theta*one_third, &sin3, &cos3);
    d[tid+dsize] = cos3;
    d[tid+2*dsize] = cos3*cmp23 - sin3*smp23;
    d[tid+3*dsize] = cos3*cmp43 - sin3*smp43;
#else
    // Reference path: evaluate each shifted cosine directly.
    d[tid+dsize] = cos( theta*one_third );
    d[tid+2*dsize] = cos( theta*one_third + M_PI_23 );
    d[tid+3*dsize] = cos( theta*one_third + M_PI_43 );
#endif
}
// Benchmark driver: fills DSIZE doubles with random values in [0, 1], copies
// them to the device, launches the tk kernel NL times, and prints the average
// wall-clock time per launch.
// Every CUDA runtime call is followed by cudaCheckErrors() so that a failed
// allocation or copy aborts with a message instead of silently producing a
// meaningless timing, and both buffers are released before returning.
// Returns 0 on success, 1 if the host allocation fails (CUDA failures exit
// with status 1 from inside cudaCheckErrors).
int main(){
    double *h_d = NULL, *d_d = NULL;
    // NOTE(review): the kernel indexes only the first 4*DSIZE elements; the
    // 7*DSIZE size is retained from the original code.
    const size_t bytes = 7*DSIZE*sizeof(double);

    cudaMalloc(&d_d, bytes);
    cudaCheckErrors("cudaMalloc failed");

    h_d = (double *)malloc(bytes);
    if (h_d == NULL) {
        fprintf(stderr, "host malloc failed\n");
        cudaFree(d_d);
        return 1;
    }

    // Host-side sin/cos of the two phase offsets, passed to the kernel by value.
    double smp23 = sin(M_PI_23);
    double cmp23 = cos(M_PI_23);
    double smp43 = sin(M_PI_43);
    double cmp43 = cos(M_PI_43);

    // Inputs lie in [0, 1], inside the domain of acos().
    for (int i = 0; i < DSIZE; i++)
        h_d[i] = rand()/(double)RAND_MAX;

    cudaMemcpy(d_d, h_d, DSIZE*sizeof(double), cudaMemcpyHostToDevice);
    cudaCheckErrors("cudaMemcpy H2D failed");

    // Time NL back-to-back launches; the synchronize makes the host wait
    // until all of them have finished before the clock is read again.
    unsigned long long gtime = dtime_usec(0);
    for (int i = 0; i < NL; i++)
        tk<<<(DSIZE+nTPB-1)/nTPB,nTPB>>>(d_d, smp23, cmp23, smp43, cmp43, DSIZE);
    cudaDeviceSynchronize();
    gtime = dtime_usec(gtime);
    cudaCheckErrors("some error");

    printf("elapsed time: %fs\n", gtime/(float)(USECPSEC*NL));

    free(h_d);
    cudaFree(d_d);
    return 0;
}
$ nvcc -O3 t874.cu -o t874
$ ./t874
elapsed time: 0.000078s
$ nvcc -O3 -DUSE_I t874.cu -o t874
$ ./t874
elapsed time: 0.000060s
$
$cat t874.cu
#包括
#包括
#包括
#定义三分之一(1.0/3.0)
//#定义M#u PI 3.141592654
#定义M_PI_23(M_PI*2.0/3.0)
#定义M_PI_43(M_PI*4.0/3.0)
#定义DSIZE65536
#定义nTPB 256
#定义NL 100
#定义cudaCheckErrors(msg)\
做{\
cudaError\u t\u err=cudaGetLastError()\
如果(_err!=cudaSuccess){\
fprintf(标准,“致命错误:%s(%s位于%s:%d)\n”\
msg,cudaGetErrorString(_err)\
__文件(行)\
fprintf(stderr,“***失败-中止\n”)\
出口(1)\
} \
}而(0)
#包括
#包括
#定义USECPSEC 10000000ull
long long dtime\u usec(无符号长启动){
蒂梅瓦尔电视;
gettimeofday(&tv,0);
返回((tv.tv_sec*USECPSEC)+tv.tv_usec)-开始;
}
__全局无效tk(双*d,常数双smp23,常数双cmp23,常数双smp43,常数双cmp43,常数双int dsize){
int idx=threadIdx.x+blockDim.x*blockIdx.x;
if(idx
测试环境:Fedora 20,CUDA 7.5RC,Quadro5000 GPU。
可以利用余弦和角公式:cos(A+B) = cos(A)cos(B) - sin(A)sin(B)
因此,改变这一点:
double a = cos( theta*one_third );
改为:
double as, ac;
sincos(theta*one_third, &as, &ac);
然后您可以计算a
,b
和c
,如下所示:
double a = ac;
double b = ac*cos(M_PI_23) - as*sin(M_PI_23);
double c = ac*cos(M_PI_43) - as*sin(M_PI_43);
当然,如果可能的话,您应该用编译时常量替换cos(M_PI_xx)
和sin(M_PI_xx)
。编译器可能会找到答案,但可能不会
下面是一个工作示例,演示了受此行为支配的代码大约30%的加速(快1.3倍):
$ cat t874.cu
#include <stdio.h>
#include <stdlib.h>
#include <math.h>
#define one_third (1.0/3.0)
// #define M_PI 3.141592654
#define M_PI_23 (M_PI*2.0/3.0)
#define M_PI_43 (M_PI*4.0/3.0)
#define DSIZE 65536
#define nTPB 256
#define NL 100
// Queries cudaGetLastError(); if it reports anything other than cudaSuccess,
// prints `msg`, the CUDA error string and the file/line of the macro use to
// stderr, then terminates the process with exit status 1.
// `msg` is a caller-supplied C string printed ahead of the CUDA error text.
#define cudaCheckErrors(msg) \
    do { \
        const cudaError_t status_ = cudaGetLastError(); \
        if (cudaSuccess != status_) { \
            fprintf(stderr, "Fatal error: %s (%s at %s:%d)\n", \
                    msg, cudaGetErrorString(status_), \
                    __FILE__, __LINE__); \
            fprintf(stderr, "*** FAILED - ABORTING\n"); \
            exit(1); \
        } \
    } while (0)
#include <time.h>
#include <sys/time.h>
#define USECPSEC 1000000ULL

// Returns the current gettimeofday() time expressed in microseconds, minus
// `start`. Call with 0 to obtain a timestamp; call again with that timestamp
// to obtain the number of microseconds elapsed since it was taken.
long long dtime_usec(unsigned long long start){
    struct timeval now;
    gettimeofday(&now, 0);
    // Seconds scaled to microseconds plus the sub-second microsecond part;
    // the arithmetic is unsigned because USECPSEC is an unsigned constant.
    const unsigned long long stamp = (now.tv_sec * USECPSEC) + now.tv_usec;
    return stamp - start;
}
// CUDA kernel, one thread per input element.
// For each i < dsize it reads d[i], computes theta = acos(d[i]) and writes
//   d[i +   dsize] = cos(theta/3)
//   d[i + 2*dsize] = cos(theta/3 + M_PI_23)
//   d[i + 3*dsize] = cos(theta/3 + M_PI_43)
// so d is indexed up to 4*dsize - 1.
// Built without USE_I: three direct cos() calls.
// Built with USE_I: one sincos() call, then cos(A+B) = cosA*cosB - sinA*sinB
// using smp23/cmp23 and smp43/cmp43, which the host code in this file fills
// with sin/cos of M_PI_23 and M_PI_43. Those four arguments are unused
// when USE_I is not defined.
__global__ void tk(double *d, const double smp23, const double cmp23, const double smp43, const double cmp43, const int dsize){
    const int tid = blockDim.x*blockIdx.x + threadIdx.x;
    // Threads beyond the last element have nothing to do.
    if (tid >= dsize) return;

    const double theta = acos(d[tid]);
#ifdef USE_I
    // One combined sine/cosine evaluation replaces three cosine evaluations.
    double sin3, cos3;
    sincos(theta*one_third, &sin3, &cos3);
    d[tid+dsize] = cos3;
    d[tid+2*dsize] = cos3*cmp23 - sin3*smp23;
    d[tid+3*dsize] = cos3*cmp43 - sin3*smp43;
#else
    // Reference path: evaluate each shifted cosine directly.
    d[tid+dsize] = cos( theta*one_third );
    d[tid+2*dsize] = cos( theta*one_third + M_PI_23 );
    d[tid+3*dsize] = cos( theta*one_third + M_PI_43 );
#endif
}
// Benchmark driver: fills DSIZE doubles with random values in [0, 1], copies
// them to the device, launches the tk kernel NL times, and prints the average
// wall-clock time per launch.
// Every CUDA runtime call is followed by cudaCheckErrors() so that a failed
// allocation or copy aborts with a message instead of silently producing a
// meaningless timing, and both buffers are released before returning.
// Returns 0 on success, 1 if the host allocation fails (CUDA failures exit
// with status 1 from inside cudaCheckErrors).
int main(){
    double *h_d = NULL, *d_d = NULL;
    // NOTE(review): the kernel indexes only the first 4*DSIZE elements; the
    // 7*DSIZE size is retained from the original code.
    const size_t bytes = 7*DSIZE*sizeof(double);

    cudaMalloc(&d_d, bytes);
    cudaCheckErrors("cudaMalloc failed");

    h_d = (double *)malloc(bytes);
    if (h_d == NULL) {
        fprintf(stderr, "host malloc failed\n");
        cudaFree(d_d);
        return 1;
    }

    // Host-side sin/cos of the two phase offsets, passed to the kernel by value.
    double smp23 = sin(M_PI_23);
    double cmp23 = cos(M_PI_23);
    double smp43 = sin(M_PI_43);
    double cmp43 = cos(M_PI_43);

    // Inputs lie in [0, 1], inside the domain of acos().
    for (int i = 0; i < DSIZE; i++)
        h_d[i] = rand()/(double)RAND_MAX;

    cudaMemcpy(d_d, h_d, DSIZE*sizeof(double), cudaMemcpyHostToDevice);
    cudaCheckErrors("cudaMemcpy H2D failed");

    // Time NL back-to-back launches; the synchronize makes the host wait
    // until all of them have finished before the clock is read again.
    unsigned long long gtime = dtime_usec(0);
    for (int i = 0; i < NL; i++)
        tk<<<(DSIZE+nTPB-1)/nTPB,nTPB>>>(d_d, smp23, cmp23, smp43, cmp43, DSIZE);
    cudaDeviceSynchronize();
    gtime = dtime_usec(gtime);
    cudaCheckErrors("some error");

    printf("elapsed time: %fs\n", gtime/(float)(USECPSEC*NL));

    free(h_d);
    cudaFree(d_d);
    return 0;
}
$ nvcc -O3 t874.cu -o t874
$ ./t874
elapsed time: 0.000078s
$ nvcc -O3 -DUSE_I t874.cu -o t874
$ ./t874
elapsed time: 0.000060s
$
$cat t874.cu
#包括
#包括
#包括
#定义三分之一(1.0/3.0)
//#定义M#u PI 3.141592654
#定义M_PI_23(M_PI*2.0/3.0)
#定义M_PI_43(M_PI*4.0/3.0)
#定义DSIZE65536
#定义nTPB 256
#定义NL 100
#定义cudaCheckErrors(msg)\
做{\
cudaError\u t\u err=cudaGetLastError()\
如果(_err!=cudaSuccess){\
fprintf(标准,“致命错误:%s(%s位于%s:%d)\n”\
msg,cudaGetErrorString(_err)\
__文件(行)\
fprintf(stderr,“***失败-中止\n”)\
出口(1)\
} \
}而(0)
#包括
#包括
#定义USECPSEC 10000000ull
long long dtime\u usec(无符号长启动){
蒂梅瓦尔电视;
gettimeofday(&tv,0);
返回((tv.tv_sec*USECPSEC)+tv.tv_usec)-开始;
}
__全局无效tk(双*d,常数双smp23,常数双cmp23,常数双smp43,常数双cmp43,常数双int dsize){
int idx=threadIdx.x+blockDim.x*blockIdx.x;
if(idx