C代码的优化
对于名为“高性能计算”课程的作业,我需要优化以下代码片段:C代码的优化,c,performance,optimization,compiler-optimization,bit-shift,C,Performance,Optimization,Compiler Optimization,Bit Shift,对于名为“高性能计算”课程的作业,我需要优化以下代码片段: int foobar(int a, int b, int N) { int i, j, k, x, y; x = 0; y = 0; k = 256; for (i = 0; i <= N; i++) { for (j = i + 1; j <= N; j++) { x = x + 4*(2*i+j)*(i+2*k); i
/*
 * foobar - reference (unoptimized) form of the exercise.
 *
 * Returns the sum, over all pairs 0 <= i < j <= N, of
 * 4*(2*i + j)*(i + 2*k) with k fixed at 256.  The loops do not run when
 * N <= 0, so the result is 0 in that case.
 *
 * a, b: accepted but never read.
 * y:    updated on every inner iteration but never read, so it has no
 *       influence on the returned value.
 */
int foobar(int a, int b, int N)
{
    int x = 0;
    int y = 0;
    const int k = 256;

    for (int i = 0; i <= N; i++) {
        for (int j = i + 1; j <= N; j++) {
            x += 4 * (2 * i + j) * (i + 2 * k);
            /* j starts at i + 1, so the else branch is the one taken. */
            if (i > j) {
                y += 8 * (i - j);
            } else {
                y += 8 * (j - i);
            }
        }
    }
    return x;
}
我还注意到其他几点。您不需要 `y`,因此可以删除它的声明和初始化。
此外,传入的 `a` 和 `b` 的值实际上并未被使用,因此可以把它们当作局部变量来用,代替 `x` 和 `t`。
另外,与其每次循环都把 `i` 加到 512 上,不如注意到 `t` 从 512 开始、每次外层迭代递增 1:
/*
 * foobar - variant that reuses the two parameters as working variables.
 *
 * a: incoming value is overwritten; used as the running sum and returned.
 * b: incoming value is overwritten; holds i + 512 by starting at 512 and
 *    being incremented together with i.
 * N: inclusive upper bound of both loops; nothing is added when N <= 0.
 */
int foobar(int a, int b, int N)
{
    int i = 0;

    a = 0;
    b = 512;
    while (i <= N) {
        int j = i + 1;

        while (j <= N) {
            /* (i<<3) + (j<<2) is 8*i + 4*j, i.e. 4*(2*i + j). */
            a += ((i << 3) + (j << 2)) * b;
            ++j;
        }
        ++i;
        ++b;
    }
    return a;
}
简单地扫描第一个例程,您注意到的第一件事是,涉及“y”的表达式完全未使用,可以删除(就像您所做的那样)。这进一步允许消除if/else(正如您所做的那样)
剩下的是两个 `for` 循环和一个凌乱的表达式。下一步是把表达式中不依赖于 `j` 的部分提取到内层循环之外。您已经提出了一个这样的表达式,但是 `(i<<3)`(即 `i*8`)仍留在内层循环中,同样可以提出来。
`y` 不会影响代码的最终结果,因此将其删除:
/*
 * foobar - the exercise's double loop with the dead `y` computation removed.
 *
 * y was only ever written, never read, so dropping its declaration and its
 * if/else update leaves the returned value unchanged.
 *
 * a, b: not read by this function.
 * N:    inclusive upper bound of both loops; the result is 0 when N <= 0.
 * Returns the sum over 0 <= i < j <= N of 4*(2*i + j)*(i + 2*k), k = 256.
 */
int foobar(int a, int b, int N)
{
    int i, j, k, x;

    (void)a;    /* parameters are part of the signature but unused here */
    (void)b;

    x = 0;
    k = 256;
    for (i = 0; i <= N; i++) {
        for (j = i + 1; j <= N; j++) {
            x = x + 4*(2*i+j)*(i+2*k);
        }
    }
    return x;
}
内部表达式可以展开为:`x += 8*i*i + 4096*i + 4*i*j + 2048*j`。利用求和公式把对 `j` 的求和全部移到外层循环:`x += 8*i*i*(N-i) + 4096*i*(N-i) + 2*i*(N-i)*(N+i+1) + 1024*(N-i)*(N+i+1)`
您可以展开上面的表达式,然后应用平方和与立方和公式,得到一个闭合形式的表达式,它应该比双重嵌套循环运行得更快。我把这一步作为练习留给您。这样一来,`i` 和 `j` 也将被消除。
`a` 和 `b` 也应尽可能删除,因为它们虽然作为参数传入,却从未在代码中使用。
平方和与立方和公式:
- $\sum_{x=1}^{n} x^2 = \dfrac{n(n+1)(2n+1)}{6}$
- $\sum_{x=1}^{n} x^3 = \dfrac{n^2(n+1)^2}{4}$
最初:
// Starting point of the derivation: the nested loops as given in the exercise.
// i, j, k, x, y and N are not declared in this fragment.
for (i = 0; i <= N; i++) {
for (j = i + 1; j <= N; j++) {
// The only statement in this fragment that changes x.
x = x + 4*(2*i+j)*(i+2*k);
// y is written here but never read in this fragment.
// j starts at i + 1, so the else branch is the one taken.
if (i > j){
y = y + 8*(i-j);
}else{
y = y + 8*(j-i);
}
}
}
按 `i`、`j`、`k` 拆分各项:
// Same loops with 4*(2*i+j)*(i+2*k) expanded to 8*i*i + 16*i*k + 4*i*j + 8*k*j
// and grouped by power of j; the y update no longer appears.
for (i = 0; i <= N; i++) {
for (j = i + 1; j <= N; j++) {
x = x + 8*i*i + 16*i*k ; // terms that do not contain j
x = x + (4*i + 8*k)*j ; // terms linear in j
}
}
重写:
/*
 * Inner j loop eliminated; the contribution of each i is grouped by power
 * of i, keeping the unsimplified coefficients so the derivation stays visible.
 *
 * The halved terms are written as exact integer divisions.  The mathematical
 * factor -1/2 must not be spelled `(-1/2)` in C: integer division truncates
 * it to 0, which would silently drop the term.
 */
for (i = 0; i <= N; i++) {
    x = x + ( 8*k*(N*N+N)/2 ) ;                        /* constant term       */
    x = x + i * ( 16*k*N + 4*(N*N+N)/2 - 8*k/2 ) ;     /* coefficient of i    */
    x = x + i*i * ( 8*N - 16*k - 4/2 - 8*k/2 ) ;       /* coefficient of i^2  */
    x = x + i*i*i * ( -8 - 4/2 ) ;                     /* coefficient of i^3  */
}
上述两种循环删除均使用公式:
- $\sum_{i=0}^{n} 1 = n+1$
- $\sum_{i=0}^{n} i = \dfrac{n(n+1)}{2}$
- $\sum_{i=0}^{n} i^2 = \dfrac{n(n+1)(2n+1)}{6}$
- $\sum_{i=0}^{n} i^3 = \dfrac{n^2(n+1)^2}{4}$
此函数等效于以下公式,其中仅包含4个整数乘法,以及1个整数除法:
/*
 * Closed form of the double sum.  The product formed before the division is
 * six times the result, so it is evaluated in long long: with a 32-bit int
 * it would overflow from N = 63, well before the result itself stops
 * fitting.  The quotient is converted to the type of x on assignment.
 */
x = (long long)N * (N + 1) * ((long long)N * (7LL * N + 8187) - 2050) / 6;
为了得到这个结果,我只需把嵌套循环所计算的求和式输入 Wolfram Alpha(原回答在此处附有指向计算结果的直接链接)。在编码之前先想一想:有时候,您的大脑比任何编译器都更能优化代码。
好的……下面是我的解决方案,其中的内联注释解释了我做了什么以及是怎么做的。
/*
 * foobar - evaluates
 *     sum over i = 0..N, j = i+1..N of 4*(2*i + j)*(i + 2*256)
 * with every multiplication moved out of the inner loop.
 *
 * The unused arguments of the exercise's function are dropped, so N is the
 * only parameter.  Returns 0 when N <= 0 because the outer loop never runs.
 */
int foobar(int N)
{
    int x = 0;
    int i2 = 0;     /* always equal to 2*i; advanced by addition, not multiplication */

    /* Stop before i == N: for that value the range j = N+1 .. N is empty. */
    for (int i = 0; i < N; i++, i2 += 2) {
        /*
         * Factor hoisted out of the inner loop, where i does not change:
         *     4*(i + 2*256)  ==  4*i + 8*256  ==  2*i2 + 2048
         */
        const int k = (2 * i2) + 2048;

        /*
         * The summand is now (i2 + j)*k == i2*k + j*k.  The i2*k part does
         * not depend on j and would be added once per inner iteration, that
         * is (N - i) times, so it is added in a single step.
         */
        x = x + (i2 * k) * (N - i);

        /*
         * The j*k part: start at i*k and add k on every step, so that z
         * equals j*k inside the loop without multiplying.
         */
        int z = i * k;
        for (int j = i + 1; j <= N; j++) {
            z += k;
            x += z;
        }
    }
    return x;
}
intfoobar(intn)
{//我们消除了未使用的参数
int x=0,i=0,i2=0,j,k,z;
//由于
//最后一次迭代没有做任何有用的事情。我们也保留了
//a的“2*i”(在整个代码中使用)轨迹
//第二个变量“i2”,我们在每个
//迭代,本质上是将乘法转换为加法。
而(i
{
int i,j,x=0;//删除无用变量,操作以保存堆栈和机器周期
for(i=N;i-->)//不要检查不必要的比较条件
对于(j=N+1;--j>i;)
x+=(((i<<1)+j)*(i+512)<<2);
常量传播、代数化简、复制传播、公共子表达式消除、死代码消除、循环不变量外提,以及用移位代替乘法(因为移位开销更低)——有趣的是:这些恰恰是现代优化编译器自己就能完成的“简单”优化,而且通常比程序员做得更好;因此,程序员通常更应关注“高层次”(例如算法层面)的优化(或更微妙的问题,例如与缓存相关的问题,这通常需要性能剖析)。
可以直接告诉编译器您需要全部优化:用 `-O3` 运行 `gcc` 应该会有一些效果,不过我不确定使用优化编译器在多大程度上违背了本练习的初衷。
不幸的是,大量的“HPC 编程”课程看起来像是直接从 20 世纪 70 年代搬过来的。
我也经历过这种情况。不幸的是……
// Nested loops with the y update dropped: only the accumulation into x remains.
// i, j, k, x and N are not declared in this fragment.
for (i = 0; i <= N; i++) {
for (j = i + 1; j <= N; j++) {
x = x + 4*(2*i+j)*(i+2*k);
}
}
// 4*(2*i+j)*(i+2*k) expanded to 8*i*i + 16*i*k + 4*i*j + 8*k*j and grouped
// by power of j.
for (i = 0; i <= N; i++) {
for (j = i + 1; j <= N; j++) {
x = x + 8*i*i + 16*i*k ; // terms that do not contain j
x = x + (4*i + 8*k)*j ; // terms linear in j
}
}
// Inner loop replaced by closed-form sums over j = i+1 .. N:
//   - the j-free terms are added once per j, i.e. (N-i) times;
//   - the sum of j over that range is (N*N+N)/2 - (i*i+i)/2.
for (i = 0; i <= N; i++) {
x = x + (8*i*i + 16*i*k) * (N-i) ;
x = x + (4*i + 8*k) * ((N*N+N)/2 - (i*i+i)/2) ;
}
/*
 * Per-i contribution regrouped by power of i, with the coefficients left
 * unsimplified so the origin of each term stays visible.
 *
 * The halved terms are written as exact integer divisions.  The mathematical
 * factor -1/2 must not be spelled `(-1/2)` in C: integer division truncates
 * it to 0, which would silently drop the term.
 */
for (i = 0; i <= N; i++) {
    x = x + ( 8*k*(N*N+N)/2 ) ;                        /* constant term       */
    x = x + i * ( 16*k*N + 4*(N*N+N)/2 - 8*k/2 ) ;     /* coefficient of i    */
    x = x + i*i * ( 8*N - 16*k - 4/2 - 8*k/2 ) ;       /* coefficient of i^2  */
    x = x + i*i*i * ( -8 - 4/2 ) ;                     /* coefficient of i^3  */
}
// Same polynomial in i, with each coefficient simplified to integer-only terms.
for (i = 0; i <= N; i++) {
x = x + 4*k*(N*N+N) ; // constant term (no i)
x = x + i * ( 16*k*N + 2*(N*N+N) - 4*k ) ; // coefficient of i
x = x + i*i * ( 8*N - 20*k - 2 ) ; // coefficient of i^2
x = x + i*i*i * ( -10 ) ; // coefficient of i^3
}
// Outer loop removed: each coefficient is multiplied by the closed form of
// the matching power sum over i = 0 .. N.
x = x + ( 4*k*(N*N+N) ) * (N+1) ; // sum of 1   = N+1
x = x + ( 16*k*N + 2*(N*N+N) - 4*k ) * ((N*(N+1))/2) ; // sum of i   = N(N+1)/2
x = x + ( 8*N - 20*k - 2 ) * ((N*(N+1)*(2*N+1))/6); // sum of i^2 = N(N+1)(2N+1)/6
x = x + (-10) * ((N*N*(N+1)*(N+1))/4) ; // sum of i^3 = N^2 (N+1)^2 / 4
/*
 * Closed form of the double sum.  The product formed before the division is
 * six times the result, so it is evaluated in long long: with a 32-bit int
 * it would overflow from N = 63, well before the result itself stops
 * fitting.  The quotient is converted to the type of x on assignment.
 */
x = (long long)N * (N + 1) * ((long long)N * (7LL * N + 8187) - 2050) / 6;
sum (sum (8*i*i+4096*i+4*i*j+2048*j), j=i+1..N), i=0..N
/*
 * foobar - strength-reduced evaluation of
 *     sum over i = 0..N, j = i+1..N of 4*(2*i + j)*(i + 2*256).
 *
 * Only N is taken; the exercise's unused arguments are gone.  For N <= 0
 * the outer loop does not execute and 0 is returned.
 */
int foobar(int N)
{
    int x = 0;
    int i2 = 0;     /* tracks 2*i by repeated addition */

    /* i == N is skipped: its inner range j = N+1 .. N contains nothing. */
    for (int i = 0; i < N; i++, i2 += 2) {
        /*
         * 4*(i + 2*256) rewritten in terms of i2:
         *     4*i + 8*256  ==  2*i2 + 2048
         * The value is fixed while j varies, so it is computed once per i.
         */
        const int k = (2 * i2) + 2048;

        /*
         * With that factor, each summand is (i2 + j)*k == i2*k + j*k.
         * i2*k is the same for every j and occurs (N - i) times, so the
         * whole share is added here in one multiplication.
         */
        x = x + (i2 * k) * (N - i);

        /*
         * j*k is produced incrementally: z starts at i*k and grows by k
         * each step, which makes it equal to j*k at the point of use.
         */
        int z = i * k;
        for (int j = i + 1; j <= N; j++) {
            z += k;
            x += z;
        }
    }
    return x;
}
/*
 * foobar - sum over i = 0..N, j = i+1..N of (2*i + j) * (4*i + 2048),
 * accumulated without multiplying inside the inner loop.
 *
 * Returns 0 when N <= 0 (the outer loop body is never entered).
 */
int foobar(int N)
{
    int total = 0;

    for (int i = 0, twice_i = 0; i < N; ++i, twice_i += 2) {
        /* Per-i factor: 2*twice_i + 2048 == 4*i + 2048. */
        const int step = (2 * twice_i) + 2048;
        /* Running value of j*step; it is bumped by step before each use. */
        int term = i * step;

        /* Share of the sum that does not depend on j, taken (N - i) times. */
        total += (twice_i * step) * (N - i);

        for (int j = i + 1; j <= N; ++j) {
            term += step;
            total += term;
        }
    }
    return total;
}
int i, j, x=0; //Remove unuseful variable, operation so save stack and Machine cycle
for (i = N; i--; ) //Don't check unnecessary comparison condition
for (j = N+1; --j>i; )
x += (((i<<1)+j)*(i+512)<<2); //Save Machine cycle ,Use shift instead of Multiply
return x;