非均匀节点CUDA核插值的优化_Cuda_Interpolation

非均匀节点CUDA核插值的优化

cuda

非均匀节点CUDA核插值的优化,cuda,interpolation,Cuda,Interpolation,原始问题 __device__ int modulo(int val, int modulus) { if(val > 0) return val%modulus; else { int P = (-val)%modulus; if(P > 0) return modulus -P; else return 0; } } 我使用以下内核对非均匀节点点执行插值，并希望对其进行优化： __global__ void in

原始问题

/**
 * Remainder of val divided by modulus, shifted so that it is never negative
 * when modulus is positive (result lies in [0, modulus)).
 *
 * The built-in % operator truncates toward zero, so it yields a negative
 * remainder for a negative val; this helper adds modulus back in that case.
 *
 * @param val      value to reduce (may be negative, including INT_MIN)
 * @param modulus  divisor; must be non-zero
 * @return         val mod modulus, in [0, modulus) for modulus > 0
 */
__device__ int modulo(int val, int modulus)
{
    // rem carries the sign of val (or is zero) because % truncates toward zero.
    const int rem = val % modulus;

    // Correct a negative remainder. val itself is never negated, so
    // val == INT_MIN does not trigger signed overflow.
    return (rem < 0) ? rem + modulus : rem;
}

我使用以下内核对非均匀节点点执行插值，并希望对其进行优化：

/**
 * Evaluates, for each of M points, a weighted sum of (2*K+1) neighbouring
 * samples of Uj and stores it in result.
 *
 * Launch layout: 1D grid of 1D blocks; each thread with global index i < M
 * produces result[i]. Threads with i >= M exit without touching memory.
 *
 * cc, K and pi_double are constants defined outside this kernel.
 *
 * @param Uj      complex samples, indexed by modulo(..., cc*N)
 *                (presumably cc*N elements long -- confirm against caller)
 * @param points  M point coordinates
 * @param result  M output values
 * @param N       used only to form the wrap-around period cc*N
 * @param M       number of points / outputs
 */
__global__ void interpolation(cufftDoubleComplex *Uj, double *points, cufftDoubleComplex *result, int N, int M)
{
    const int i = threadIdx.x + blockDim.x * blockIdx.x;

    // Bounds check must come before any access to points[] or result[]:
    // the grid is usually rounded up, so tail threads have i >= M.
    if (i >= M) return;

    const double alfa = (2.-1./cc)*pi_double-0.01;

    const double cc_points   = cc*points[i];
    const double r_cc_points = rint(cc_points);   // nearest integer to cc_points, kept as double

    cufftDoubleComplex temp = make_cuDoubleComplex(0.,0.);

    for (int m = 0; m < (2*K+1); m++) {
        // Distance from the point to the m-th sample of the window.
        const double d = cc_points-(r_cc_points+m-K);
        const double P = K*K - d*d;

        // Window weight. The three cases are one analytic function of P; the
        // final else also covers P == 0 so the weight is always assigned.
        double phi_cap_s;
        if (P > 0.) {
            const double root = sqrt(P);
            phi_cap_s = (1./pi_double)*(sinh(alfa*root)/root);
        } else if (P < 0.) {
            const double root = sqrt(-P);
            phi_cap_s = (1./pi_double)*(sin(alfa*root)/root);
        } else {
            phi_cap_s = alfa/pi_double;
        }

        // r_cc_points is integer-valued, so the explicit cast gives the same
        // index the implicit double->int conversion produced before.
        const int PP = modulo((int)r_cc_points + m - K, (cc*N));

        temp.x = temp.x+phi_cap_s*Uj[PP].x;
        temp.y = temp.y+phi_cap_s*Uj[PP].y;
    }

    result[i] = temp;
}

大约占60%。通过VisualProfiler，我已经验证了前者的性能不受

if

语句的影响。请注意，我需要双精度，因此我避免使用 __exp() 之类的解决方案。我怀疑，对于后者，对 Uj[PP] 的“随机”内存访问可能是造成如此高计算时间占比的原因。有什么减少计算时间的技巧或建议吗？提前谢谢。

以下评论和答案的版本

根据答案和评论中善意提供的建议，我最终得出以下代码：

/**
 * Evaluates, for each of M points, a weighted sum of (2*K+1) neighbouring
 * samples of Uj and stores it in result.
 *
 * The samples and the weights are first staged in two per-thread arrays and
 * only combined in a second loop, so the loads of Uj are issued before the
 * arithmetic that consumes them.
 *
 * Launch layout: 1D grid of 1D blocks; each thread with global index i < M
 * produces result[i]. Threads with i >= M exit without touching memory.
 *
 * cc, K and pi_double are constants defined outside this kernel.
 *
 * @param Uj      complex samples, indexed by modulo(..., cc*N)
 * @param points  M point coordinates
 * @param result  M output values
 * @param N       used only to form the wrap-around period cc*N
 * @param M       number of points / outputs
 */
__global__ void interpolation(cufftDoubleComplex *Uj, double *points, cufftDoubleComplex *result, int N, int M)
{
    const int i = threadIdx.x + blockDim.x * blockIdx.x;

    // Guard first: reading points[i] for i >= M would be out of bounds.
    if (i >= M) return;

    const double alfa = (2.-1./cc)*pi_double-0.01;

    const double cc_points   = cc*points[i];
    const double r_cc_points = rint(cc_points);

    cufftDoubleComplex rtemp[(2*K+1)];   // staged samples of Uj
    double phi_cap_s[2*K+1];             // staged window weights

    #pragma unroll
    for (int m = 0; m < (2*K+1); m++) {
        const int PP = modulo(((int)r_cc_points + m - K), (cc*N));
        rtemp[m] = Uj[PP];

        const double d = cc_points-(r_cc_points+(double)(m-K));
        const double P = K*K - d*d;

        // rsqrt(x) = 1/sqrt(x), hence alfa/tempd == alfa*sqrt(|P|) and the
        // trailing "*tempd" replaces the division by sqrt(|P|).
        if (P < 0.) {
            const double tempd = rsqrt(-P);
            phi_cap_s[m] = (1./pi_double)*((sin(alfa/tempd))*tempd);
        } else if (P > 0.) {
            const double tempd = rsqrt(P);
            phi_cap_s[m] = (1./pi_double)*((sinh(alfa/tempd))*tempd);
        } else {
            phi_cap_s[m] = alfa/pi_double;
        }
    }

    cufftDoubleComplex temp = make_cuDoubleComplex(0.,0.);

    #pragma unroll
    for (int m = 0; m < (2*K+1); m++) {
        temp.x = temp.x+phi_cap_s[m]*rtemp[m].x;
        temp.y = temp.y+phi_cap_s[m]*rtemp[m].y;
    }

    result[i] = temp;
}

最后，我想指出，本次讨论与

使用共享内存的版本

我对使用共享内存进行了可行性研究。我考虑了

N=64

，以便整个

Uj

适合共享内存。下面是代码（基本上是我的原始版本）

第一个线程束（warp）在 m=0 时的 p 值

 0.0124300933082964
 0.0127183892149176
 0.0135847002913749
 0.0161796378170038
 0.0155488126345702
 0.0138890822153499
 0.0121163187739057
 0.0119998374528905
 0.0131600831194518
 0.0109574866163769
 0.00962949548477354
 0.00695850974164358
 0.00446426651940612
 0.00423369284281705
 0.00632921297092537
 0.00655137618976198
 0.00810202954519923
 0.00597974034698723
 0.0076811348379735
 0.00604267951733561
 0.00402922460255439
 0.00111841719893846
 -0.00180949615796777
 -0.00246283218698551
 -0.00183256444286428
 -0.000462696661685413
 0.000725108980390132
 -0.00126793006072035
 0.00152263101649197
 0.0022499598348702
 0.00463681632275836
 0.00359856091027666

模函数

/**
 * Remainder of val divided by modulus, shifted so that it is never negative
 * when modulus is positive (result lies in [0, modulus)).
 *
 * The built-in % operator truncates toward zero, so it yields a negative
 * remainder for a negative val; this helper adds modulus back in that case.
 *
 * @param val      value to reduce (may be negative, including INT_MIN)
 * @param modulus  divisor; must be non-zero
 * @return         val mod modulus, in [0, modulus) for modulus > 0
 */
__device__ int modulo(int val, int modulus)
{
    // rem carries the sign of val (or is zero) because % truncates toward zero.
    const int rem = val % modulus;

    // Correct a negative remainder. val itself is never negated, so
    // val == INT_MIN does not trigger signed overflow.
    return (rem < 0) ? rem + modulus : rem;
}

根据答案优化的模函数

/**
 * Non-negative remainder of val modulo _mod, valid only when _mod is a
 * power of two (e.g. 128).
 *
 * @param val   value to reduce (may be negative)
 * @param _mod  modulus; must be a positive power of two
 * @return      val mod _mod, in [0, _mod)
 */
__device__ int modulo(int val, int _mod)
{
    // (_mod - 1) masks the low bits. In two's complement those bits are
    // already the non-negative remainder for negative val too, so neither a
    // sign branch nor a negation of val (which overflows for INT_MIN) is
    // needed, and all threads of a warp follow the same path.
    return val & (_mod - 1);
}

您可能希望研究的一个优化是使用快速数学：使用内置（intrinsic）数学函数，并使用 -use_fast_math 选项编译。

编辑：表达式

// P = K*K - d*d with d = cc_points - (r_cc_points + (m - K)); d is spelled out twice here.
P = (K*K-(cc_points-(r_cc_points+(double)(m-K)))*(cc_points-(r_cc_points+(double)(m-K))));

可细分为：

// Difference between cc_points and r_cc_points; it does not involve m.
const double cc_diff = cc_points-r_cc_points;
// Per-m distance term, squared on the next line.
const double dist = cc_diff - (double)(m-K);
// P = K^2 - dist^2
P = (K*K - dist*dist);

这可能会减少使用的指令数量

编辑2

由于我们的mod始终是2的幂的整数（

cc=2，N=64，cc*N=128

），因此我们可以使用此函数代替 mod 运算符。这应该“快得多”。请检查一下我的算术是否正确。它来自第14页。

我在Wolfram Alpha中插入了

（1/pi）*（（sin（A*sqrt（p）））/sqrt（p））

，它提出了一种没有

sin（）

的替代形式。但它有两个指数和许多其他东西。可能不值得。如果s，您能用abs（P）代替

if吗？总的来说，我觉得这个表达法没什么用。DP（双精度）在GPU上是昂贵的（无论是在性能上还是在美元上）。您是否确定了您的算法是受计算限制的还是受内存限制的？如果您获得的双精度吞吐量接近您的芯片的最大可能吞吐量，我认为您已经无能为力了。占用率如何？另外，您能否使用 -Xptxas -v 编译并向我们显示结果。在您进行性能分析（profile）时，您能告诉我们指令重放开销是多少吗？@JackOLantern 0.7%的指令重放开销意味着您并没有因为线程束（warp）阻塞而明显变慢，这意味着您无法真正提高性能，除非您优化指令使用。我将用一个优化更新我的帖子。@JackOLantern 编译命令行是怎样的？您是否已打开优化 -O3 并关闭调试 -G？开启调试时，编译器无法优化内核。此外，您真正能做的就是尽量减少使用的指令和寄存器。到目前为止，您使用56个寄存器，这意味着您的占用率可能会受到影响；如果您将 --maxrregcount 设置为例如26-32，会发生什么情况？这将导致寄存器溢出到本地内存，但可能会加快执行速度。我不使用 intrinsics 数学函数，因为它是近似值，而我需要完全的双精度。对Uj[PP]内存的访问是否限制在一个块内？如果是的话，那么使用共享内存如何？我已经修改了我原来的帖子，并添加了关于使用共享内存的评论。我已经重新修改了帖子，并添加了使用共享内存的可行性研究结果。谢谢你的回答。我也根据你的回答修改了我最初的帖子，但不幸的是没有显著的改进。无论如何，请看一看。我有个问题。根据您的解决方案，Uj正在从全局内存加载到寄存器，而线程可以同时执行其他操作，这似乎是合理的。但是，在使用寄存器值之前，也就是在最后一个for循环之前，我是否还需要一个 __syncthreads()？不需要，也就是说，线程之间不相互依赖。我将看一看更新后的版本。是的，我知道我不需要 __syncthreads()，但是如果内存传输和计算重叠，我如何确保在开始最后一个循环计算 temp 之前，Uj的值确实已经在 rtemp 中可用（即，它们的加载已经完成）？如果值尚未加载到寄存器中，线程和线程束（warp）将被阻塞。因此，不使用 __syncthreads() 您也是安全的，因为如果值还没有到位，它将不会进行计算。我已根据您修改后的答案修改了代码。IPC从1.912提高到1.936。谢谢
/**
 * Remainder of val divided by modulus, shifted so that it is never negative
 * when modulus is positive (result lies in [0, modulus)).
 *
 * The built-in % operator truncates toward zero, so it yields a negative
 * remainder for a negative val; this helper adds modulus back in that case.
 *
 * @param val      value to reduce (may be negative, including INT_MIN)
 * @param modulus  divisor; must be non-zero
 * @return         val mod modulus, in [0, modulus) for modulus > 0
 */
__device__ int modulo(int val, int modulus)
{
    // rem carries the sign of val (or is zero) because % truncates toward zero.
    const int rem = val % modulus;

    // Correct a negative remainder. val itself is never negated, so
    // val == INT_MIN does not trigger signed overflow.
    return (rem < 0) ? rem + modulus : rem;
}

/**
 * Non-negative remainder of val modulo _mod, valid only when _mod is a
 * power of two (e.g. 128).
 *
 * @param val   value to reduce (may be negative)
 * @param _mod  modulus; must be a positive power of two
 * @return      val mod _mod, in [0, _mod)
 */
__device__ int modulo(int val, int _mod)
{
    // (_mod - 1) masks the low bits. In two's complement those bits are
    // already the non-negative remainder for negative val too, so neither a
    // sign branch nor a negation of val (which overflows for INT_MIN) is
    // needed, and all threads of a warp follow the same path.
    return val & (_mod - 1);
}

//your code above
// Kernel-body fragment: i, PP, P, phi_cap_s, temp, alfa, cc_points and
// r_cc_points are expected to be declared by the code preceding it.
cufftDoubleComplex rtemp[(2*K+1)]; //if it fits into available registers, assumes K is a constant

if(i<M) {
    // First pass: only issue the loads of Uj, one per window sample.
#pragma unroll //unroll the loop
    for(int m=0; m<(2*K+1); m++) {

        PP = modulo((r_cc_points + m -K ),(cc*N));
        rtemp[m] = Uj[PP]; //2
    }
    // Second pass: compute each weight and accumulate the staged samples.
#pragma unroll
    for(int m=0; m<(2*K+1); m++) {
        P = (K*K-(cc_points-(r_cc_points+m-K))*(cc_points-(r_cc_points+m-K)));
        // 1
        if(P>0.)  phi_cap_s = (1./pi_double)*((sinh(alfa*sqrt(P)))/sqrt(P));
        else if(P<0.)  phi_cap_s = (1./pi_double)*((sin(alfa*sqrt(-P)))/sqrt(-P));
        else phi_cap_s = alfa/pi_double;

        temp.x = temp.x+phi_cap_s*rtemp[m].x; //3
        temp.y = temp.y+phi_cap_s*rtemp[m].y;
    }

    result[i] = temp;
}

// Kernel-body fragment: PP, P, temp, alfa, cc_points, r_cc_points and the
// arrays rtemp[] / phi_cap_s[] (2*K+1 entries each) are expected to be
// declared by the surrounding code.
#pragma unroll //unroll the loop
    for(int m=0; m<(2*K+1); m++) {
        //stage memory first
        PP = modulo((r_cc_points + m -K ),(cc*N));
        rtemp[m] = Uj[PP]; //2

        P = (K*K-(cc_points-(r_cc_points+m-K))*(cc_points-(r_cc_points+m-K)));
        // 1
        if(P>0.)  phi_cap_s[m] = (1./pi_double)*((sinh(alfa*sqrt(P)))/sqrt(P));
        else if(P<0.)  phi_cap_s[m] = (1./pi_double)*((sin(alfa*sqrt(-P)))/sqrt(-P));
        else phi_cap_s[m] = alfa/pi_double;

    }
    // Combine the staged weights and samples only after both are stored.
#pragma unroll
    for(int m=0; m<(2*K+1); m++) {
        temp.x = temp.x+phi_cap_s[m]*rtemp[m].x; //3
        temp.y = temp.y+phi_cap_s[m]*rtemp[m].y;
    }

// P = K*K - d*d with d = cc_points - (r_cc_points + (m - K)); d is spelled out twice here.
P = (K*K-(cc_points-(r_cc_points+(double)(m-K)))*(cc_points-(r_cc_points+(double)(m-K))));

// Difference between cc_points and r_cc_points; it does not involve m.
const double cc_diff = cc_points-r_cc_points;
// Per-m distance term, squared on the next line.
const double dist = cc_diff - (double)(m-K);
// P = K^2 - dist^2
P = (K*K - dist*dist);

/**
 * Evaluates, for each of M points, a weighted sum of (2*K+1) neighbouring
 * samples of Uj and stores it in result.
 *
 * Samples and weights are staged in two per-thread arrays in a first loop
 * and combined in a second one.
 *
 * Launch layout: 1D grid of 1D blocks; each thread with global index i < M
 * produces result[i]; other threads do nothing.
 *
 * cc, K and pi_double are constants defined outside this kernel.
 *
 * @param Uj      complex samples, indexed by modulo(..., cc*N)
 * @param points  M point coordinates
 * @param result  M output values
 * @param N       used only to form the wrap-around period cc*N
 * @param M       number of points / outputs
 */
__global__ void interpolation(cufftDoubleComplex *Uj, double *points, cufftDoubleComplex *result, int N, int M)
{
    const int i = threadIdx.x + blockDim.x * blockIdx.x;

    cufftDoubleComplex rtemp[(2*K+1)];   // staged samples of Uj
    double phi_cap_s[2*K+1];             // staged window weights

    if(i<M) {
        const double alfa = (2.-1./cc)*pi_double-0.01;

        const double cc_points   = cc*points[i];
        const double r_cc_points = rint(cc_points);
        // Offset between the point and r_cc_points; the same for every m.
        const double cc_diff     = cc_points-r_cc_points;

        #pragma unroll
        for(int m=0; m<(2*K+1); m++) {
            // Signed position inside the window. Uses the constant K, the
            // same one that bounds this loop.
            const int shift = m-K;

            const int PP = modulo(((int)r_cc_points + shift),(cc*N));
            rtemp[m] = Uj[PP];

            // Named "offset" rather than "exp" so the math function exp()
            // is not shadowed.
            const double offset = cc_diff - (double)(shift);
            const double P = (K*K - offset*offset);

            // rsqrt(x) = 1/sqrt(x): alfa/tempd == alfa*sqrt(|P|), and the
            // trailing "*tempd" replaces the division by sqrt(|P|).
            if(P<0.) {
                const double tempd = rsqrt(-P);
                phi_cap_s[m] = (1./pi_double)*((sin(alfa/tempd))*tempd);
            } else if(P>0.) {
                const double tempd = rsqrt(P);
                phi_cap_s[m] = (1./pi_double)*((sinh(alfa/tempd))*tempd);
            } else {
                phi_cap_s[m] = alfa/pi_double;
            }
        }

        cufftDoubleComplex temp = make_cuDoubleComplex(0.,0.);

        #pragma unroll
        for(int m=0; m<(2*K+1); m++) {
            temp.x = temp.x+phi_cap_s[m]*rtemp[m].x;
            temp.y = temp.y+phi_cap_s[m]*rtemp[m].y;
        }

        result[i] = temp;
    }
}

/**
 * Non-negative remainder of val modulo _mod, valid only when _mod is a
 * power of two.
 *
 * @param val   value to reduce (may be negative)
 * @param _mod  modulus; must be a positive power of two
 * @return      val mod _mod, in [0, _mod)
 */
__device__ int modulo(int val, int _mod) {
    // as modulo is always the power of 2, (_mod - 1) is a mask of the low bits.
    // In two's complement the masked value is already the non-negative
    // remainder for negative val as well (e.g. -1 & 127 == 127), so no
    // "_mod - p" correction must be applied: it would map -1 to 1 and
    // -_mod to _mod, which is outside [0, _mod).
    return val & (_mod - 1);
}