C代码的优化_C_Performance_Optimization_Compiler Optimization_Bit Shift

C代码的优化

c performance optimization

C代码的优化,c,performance,optimization,compiler-optimization,bit-shift,C,Performance,Optimization,Compiler Optimization,Bit Shift,对于名为“高性能计算”课程的作业，我需要优化以下代码片段： int foobar(int a, int b, int N) { int i, j, k, x, y; x = 0; y = 0; k = 256; for (i = 0; i <= N; i++) { for (j = i + 1; j <= N; j++) { x = x + 4*(2*i+j)*(i+2*k); i

对于名为“高性能计算”课程的作业，我需要优化以下代码片段：

/*
 * Reference (unoptimized) routine that the rest of the discussion works on.
 * Adds 4*(2*i + j)*(i + 2*k), with k fixed at 256, for every pair
 * 0 <= i < j <= N and returns that sum.
 * Parameters a and b are accepted but never read in the body.
 */
int foobar(int a, int b, int N)
{
    int i, j, k, x, y;
    x = 0;
    y = 0;      /* updated in the loop below, but never read or returned */
    k = 256;    /* never modified after this assignment */
    for (i = 0; i <= N; i++) {
        for (j = i + 1; j <= N; j++) {      /* j always starts above i */
            x = x + 4*(2*i+j)*(i+2*k);
            if (i > j){                     /* cannot hold: j >= i + 1 here */
               y = y + 8*(i-j);
            }else{
               y = y + 8*(j-i);
            }
        }
    }
    return x;   /* only x leaves the function */
}

我还可以看到其他一些东西。您不需要 `y`，因此可以删除其声明和初始化。

此外，为 `a` 和 `b` 传入的值实际上并未使用，因此您可以直接把它们当作局部变量，而不必另外声明 `x` 和用来保存 `i+2*k` 的临时变量。

另外，您可以注意到，`i+2*k`（即 `i+512`）从 512 开始，每次外层迭代递增 1，因此不必每次都把 `i` 加到 512 上。

/*
 * Same sum as the reference routine: 4*(2*i + j)*(i + 512) over all pairs
 * 0 <= i < j <= N.  The incoming values of a and b are discarded; the two
 * parameters are recycled as working storage.
 */
int foobar(int a, int b, int N) {
    int i, j;

    a = 0;      /* accumulator */
    b = 512;    /* tracks i + 512, bumped once per outer iteration */

    for (i = 0; i <= N; ++i) {
        for (j = i + 1; j <= N; ++j) {
            /* (i<<3) + (j<<2) is 8*i + 4*j, i.e. 4*(2*i + j) */
            a += ((i << 3) + (j << 2)) * b;
        }
        ++b;
    }

    return a;
}

简单地扫描第一个例程，您注意到的第一件事是，涉及“y”的表达式完全未使用，可以删除（就像您所做的那样）。这进一步允许消除if/else（正如您所做的那样）

剩下的是两个 `for` 循环和凌乱的表达式。下一步是把表达式中不依赖于 `j` 的部分提取到内层循环之外。您已经提取了一个这样的表达式，但是 `(i<<3)` 仍留在内层循环中，而它同样不依赖于 `j`。

`y` 不会影响代码的最终结果——删除：
/*
 * The reference routine with every statement that touches y disabled.
 * The value returned (x) is computed exactly as before; y is still declared
 * but no longer assigned or read.
 */
int foobar(int a, int b, int N)
{
    int i, j, k, x, y;
    x = 0;
    //y = 0;
    k = 256;
    for (i = 0; i <= N; i++) {
        for (j = i + 1; j <= N; j++) {
            x = x + 4*(2*i+j)*(i+2*k);
            // The branch below only ever updated y, so it is disabled with it.
            //if (i > j){
            //   y = y + 8*(i-j);
            //}else{
            //   y = y + 8*(j-i);
            //}
        }
    }
    return x;
}

内部表达式可以转换为：`x += 8*i*i + 4096*i + 4*i*j + 2048*j`。
利用 `j` 从 `i+1` 到 `N` 的求和（共 `N-i` 项，且这些 `j` 之和为 `(N-i)*(N+i+1)/2`），可以把它们全部推到外层循环：`x += 8*i*i*(N-i) + 4096*i*(N-i) + 2*i*(N-i)*(N+i+1) + 1024*(N-i)*(N+i+1)`

您可以展开上面的表达式，然后应用平方和与立方和公式，得到一个闭合形式的表达式，它应该比双重嵌套循环运行得更快。我将其作为练习留给您。这样一来，`i` 和 `j` 也将被消除。

`a` 和 `b` 也应尽可能删除，因为 `a` 和 `b` 作为参数提供，但从未在代码中使用。
平方和与立方和公式：

$$\sum_{x=1}^{n} x^{2} = \frac{n(n+1)(2n+1)}{6}$$

$$\sum_{x=1}^{n} x^{3} = \frac{n^{2}(n+1)^{2}}{4}$$
最初：
/* Starting point of the derivation: the loop nest exactly as posed.
   x, y, k, N, i and j come from the enclosing function, which is not part
   of this excerpt. */
for (i = 0; i <= N; i++) {
    for (j = i + 1; j <= N; j++) {
        x = x + 4*(2*i+j)*(i+2*k);
        if (i > j){                 /* never taken: j starts at i + 1 */
           y = y + 8*(i-j);
        }else{
           y = y + 8*(j-i);
        }
    }
}

拆分i
，j
，k
：
/* 4*(2*i+j)*(i+2*k) multiplied out and grouped by power of j:
   8*i*i + 16*i*k  +  (4*i + 8*k)*j.  The y branch is dropped here. */
for (i = 0; i <= N; i++) {
    for (j = i + 1; j <= N; j++) {
        x = x + 8*i*i + 16*i*k ;                // multiple of  1  (no j)
        x = x + (4*i + 8*k)*j ;                 // multiple of  j
    }
}

重写：
/* Inner loop summed away and the result regrouped by power of i.
   NOTE: this is algebra written in C syntax.  (-1/2) stands for the
   fraction minus one half; evaluated as C integer division, -1/2 is 0,
   so these four lines must be simplified to integer coefficients before
   they can be compiled as-is. */
for (i = 0; i <= N; i++) {
    x = x +         ( 8*k*(N*N+N)/2 ) ;                                   /* i^0 */
    x = x +   i   * ( 16*k*N + 4*(N*N+N)/2 + 8*k*(-1/2) ) ;               /* i^1 */
    x = x +  i*i  * ( 8*N + 16*k*(-1) + 4*(-1/2) + 8*k*(-1/2) );          /* i^2 */
    x = x + i*i*i * ( 8*(-1) + 4*(-1/2) ) ;                               /* i^3 */
}

上面两次消除循环都用到了以下求和公式：

$$\sum_{i=0}^{n} 1 = n+1$$

$$\sum_{i=0}^{n} i = \frac{n(n+1)}{2}$$

$$\sum_{i=0}^{n} i^{2} = \frac{n(n+1)(2n+1)}{6}$$

$$\sum_{i=0}^{n} i^{3} = \frac{n^{2}(n+1)^{2}}{4}$$
此函数等效于以下公式，其中仅包含4个整数乘法，以及1个整数除法：
/* Closed form of the double loop.  Matches the loops for N >= 0 (and N == -1);
   for N < -1 the loops run zero times and give 0, while this expression does not.
   Assumes x and N are int as in foobar, so the products must stay within int. */
x = N * (N + 1) * (N * (7 * N + 8187) - 2050) / 6;

为了得到这个结果，我只需将嵌套循环计算的总和输入到 Wolfram Alpha：`sum (sum (8*i*i+4096*i+4*i*j+2048*j), j=i+1..N), i=0..N`
这是解决方案的直接链接。在编码之前先思考一下。有时候，你的大脑比任何编译器都能更好地优化代码。
好的……下面是我的解决方案，以及解释我做了什么和如何做的内联注释
/*
 * Strength-reduced version of the double loop
 *     for i in 0..N, for j in i+1..N:  x += 4*(2*i + j)*(i + 2*256)
 * The unused arguments of the reference routine are gone.
 * Returns the accumulated sum; N <= 0 yields 0.
 */
int foobar(int N)
{
    int total = 0;
    int twice_i = 0;    /* kept equal to 2*i by adding 2 per outer iteration */

    /* Stop at i == N - 1: for i == N the inner loop would be empty. */
    for (int i = 0; i < N; ++i, twice_i += 2) {
        /*
         * Per-row factor, fixed while j varies:
         *   4*(i + 2*256) = 4*i + 8*256 = 2*(2*i) + 2048
         */
        const int scale = 2 * twice_i + 2048;

        /*
         * The summand is now (2*i + j)*scale = (2*i)*scale + j*scale.
         * The first product does not depend on j and occurs once for each
         * of the (N - i) inner iterations, so add it in one go.
         */
        total += (twice_i * scale) * (N - i);

        /*
         * j*scale is built incrementally: seed with i*scale, then add
         * scale once per j instead of multiplying each time.
         */
        int running = i * scale;
        for (int j = i + 1; j <= N; ++j) {
            running += scale;
            total += running;
        }
    }

    return total;
}

intfoobar（intn）
{//我们消除了未使用的参数
int x=0，i=0，i2=0，j，k，z；
//由于
//最后一次迭代没有做任何有用的事情。我们也保留了
//a的“2*i”（在整个代码中使用）轨迹
//第二个变量“i2”，我们在每个
//迭代，本质上是将乘法转换为加法。
而（i
{
int i, j, x = 0;                 // 删除无用的变量和运算，以节省栈空间和机器周期
for (i = N; i--; )               // 不做不必要的比较
    for (j = N + 1; --j > i; )
        x += (((i << 1) + j) * (i + 512) << 2);   // 用移位代替乘法，节省机器周期

常量传播、代数化简、复制传播、公共子表达式消除、死代码消除、循环不变量外提，以及用移位代替乘法（因为移位开销更低）——有趣的是：这些恰恰是现代优化编译器自己就能完成的“简单”优化，而且通常比程序员做得更好；因此，程序员通常更应关注“高层”（例如算法层面）的优化，或者更微妙的问题（例如与缓存相关、通常需要性能分析才能发现的问题）。

想让编译器做全部优化吗？用 -O3 运行 gcc 应该会有一些效果，不过我不确定使用优化编译器在多大程度上违背了本练习的目的。

不幸的是，大量“HPC 编程”课程看起来像是直接从 20 世纪 70 年代传送过来的。我经历过，也做过。不幸的是……
/* Step 1: the loop nest with the dead y branch removed. */
for (i = 0; i <= N; i++) {
    for (j = i + 1; j <= N; j++) {
        x = x + 4*(2*i+j)*(i+2*k);
    }
}

/* Step 2: summand multiplied out and grouped by power of j. */
for (i = 0; i <= N; i++) {
    for (j = i + 1; j <= N; j++) {
        x = x + 8*i*i + 16*i*k ;                // multiple of  1  (no j)
        x = x + (4*i + 8*k)*j ;                 // multiple of  j
    }
}

/* Step 3: inner loop summed away.  There are (N-i) values of j, and they
   add up to (N*N+N)/2 - (i*i+i)/2. */
for (i = 0; i <= N; i++) {
    x = x + (8*i*i + 16*i*k) * (N-i) ;
    x = x + (4*i + 8*k) * ((N*N+N)/2 - (i*i+i)/2) ;
}

/* Step 4: regrouped by power of i.  Algebra only: (-1/2) means the fraction
   minus one half; as C integer division it would evaluate to 0. */
for (i = 0; i <= N; i++) {
    x = x +         ( 8*k*(N*N+N)/2 ) ;
    x = x +   i   * ( 16*k*N + 4*(N*N+N)/2 + 8*k*(-1/2) ) ;
    x = x +  i*i  * ( 8*N + 16*k*(-1) + 4*(-1/2) + 8*k*(-1/2) );
    x = x + i*i*i * ( 8*(-1) + 4*(-1/2) ) ;
}

/* Step 5: the same coefficients reduced to integers. */
for (i = 0; i <= N; i++) {
    x = x + 4*k*(N*N+N) ;                            // multiple of 1
    x = x +   i   * ( 16*k*N + 2*(N*N+N) - 4*k ) ;   // multiple of i
    x = x +  i*i  * ( 8*N - 20*k - 2 ) ;             // multiple of i^2
    x = x + i*i*i * ( -10 ) ;                        // multiple of i^3
}

/* Step 6: outer loop summed away using the sums of 1, i, i^2 and i^3
   over i = 0..N. */
x = x + ( 4*k*(N*N+N) )              * (N+1) ;
x = x + ( 16*k*N + 2*(N*N+N) - 4*k ) * ((N*(N+1))/2) ;
x = x + ( 8*N - 20*k - 2 )           * ((N*(N+1)*(2*N+1))/6);
x = x + (-10)                        * ((N*N*(N+1)*(N+1))/4) ;

/* Step 7: everything collected with k = 256.  Matches the loops for N >= 0;
   for N < -1 the loops give 0 while this expression does not. */
x = N * (N + 1) * (N * (7 * N + 8187) - 2050) / 6;

sum (sum (8*i*i+4096*i+4*i*j+2048*j), j=i+1..N), i=0..N

/*
 * Strength-reduced version of the double loop
 *     for i in 0..N, for j in i+1..N:  x += 4*(2*i + j)*(i + 2*256)
 * The unused arguments of the reference routine are gone.
 * Returns the accumulated sum; N <= 0 yields 0.
 */
int foobar(int N)
{
    int total = 0;
    int twice_i = 0;    /* kept equal to 2*i by adding 2 per outer iteration */

    /* Stop at i == N - 1: for i == N the inner loop would be empty. */
    for (int i = 0; i < N; ++i, twice_i += 2) {
        /*
         * Per-row factor, fixed while j varies:
         *   4*(i + 2*256) = 4*i + 8*256 = 2*(2*i) + 2048
         */
        const int scale = 2 * twice_i + 2048;

        /*
         * The summand is now (2*i + j)*scale = (2*i)*scale + j*scale.
         * The first product does not depend on j and occurs once for each
         * of the (N - i) inner iterations, so add it in one go.
         */
        total += (twice_i * scale) * (N - i);

        /*
         * j*scale is built incrementally: seed with i*scale, then add
         * scale once per j instead of multiplying each time.
         */
        int running = i * scale;
        for (int j = i + 1; j <= N; ++j) {
            running += scale;
            total += running;
        }
    }

    return total;
}

/*
 * Sum of 4*(2*i + j)*(i + 512) over all pairs 0 <= i < j <= N, with the
 * per-row factor hoisted and j*factor built up by repeated addition.
 * Returns 0 when N <= 0.
 */
int foobar(int N)
{
    int sum = 0;
    int i = 0;
    int dbl = 0;                            /* 2*i */

    while (i < N) {
        const int coef = 2 * dbl + 2048;    /* 4*(i + 512) */
        int acc = i * coef;                 /* becomes j*coef inside the loop */
        int j;

        sum += (dbl * coef) * (N - i);      /* the j-independent part, N - i times */

        for (j = i + 1; j <= N; ++j) {
            acc += coef;
            sum += acc;
        }

        ++i;
        dbl += 2;
    }

    return sum;
}

/* Function-body excerpt: adds 4*(2*i + j)*(i + 512) for every pair
   0 <= i < j <= N, walking both indices downwards.  N comes from the
   enclosing function, which is not part of this excerpt. */
int i, j, x = 0;

/* `i-- > 0` instead of a bare `i--`: with a negative N the bare form is
   non-zero on every test and keeps decrementing until signed overflow,
   whereas the reference loops simply produce 0.  For N >= 0 the iteration
   sequence is unchanged (i = N-1 .. 0). */
for (i = N; i-- > 0; ) {
    for (j = N + 1; --j > i; ) {    /* j = N .. i+1 */
        /* i and j are non-negative here, so <<1 and <<2 equal *2 and *4 */
        x += (((i << 1) + j) * (i + 512) << 2);
    }
}

return x;