GCC 内部函数(intrinsics)对齐问题
我在使用 g++ 内部函数(intrinsics)实现一个简单的非并行平均方法时遇到了一个令人不安的问题。这里是相关的 C++ 部分,其中 p_oNumPoints 是数组 p_pValues 的长度,而 p_pValues 本身是类型为 double* 的函数参数。GCC 内部函数对齐问题,gcc,assembly,alignment,intrinsics,Gcc,Assembly,Alignment,Intrinsics,我在使用 g++ 内部函数(intrinsics)实现一个简单的非并行平均方法时遇到了一个令人不安的问题。这里是相关的 C++ 部分,其中 p_oNumPoints 是数组 p_pValues 的长度,而 p_pValues 本身是类型为 double* 的函数参数。 inline void arithmeticAvg (double &p_rAvg, const unsigned int p_oNumPoints, const double *p_pValues, double &p_rSum) { unsigned int i; doubl
inline void arithmeticAvg (double &p_rAvg, const unsigned int p_oNumPoints, const double *p_pValues, double &p_rSum)
{
unsigned int i;
double oTmp[2] __attribute__((aligned(16))) = {0,0};
register __m128d XMM0 = _mm_setzero_pd();
register __m128d XMM1;
register __m128d XMM2;
const unsigned int oLooplength = (p_oNumPoints / 4);
for (i=0; i < (oLooplength*4); i += 4)
{
XMM1 = _mm_loadu_pd(&p_pValues[i]);
XMM2 = _mm_loadu_pd(&p_pValues[i+2]);
XMM0 = _mm_add_pd(XMM0, XMM1);
XMM0 = _mm_add_pd(XMM0, XMM2);
}
_mm_storeu_pd(oTmp, XMM0);
p_rAvg = oTmp[0] + oTmp[1];
inline void arithmeticAvg (double &p_rAvg, const unsigned int p_oNumPoints, const double *p_pValues, double &p_rSum)
{
unsigned int i;
double oTmp[2] __attribute__((aligned(16))) = {0,0};
register __m128d XMM0 = _mm_setzero_pd();
register __m128d XMM1;
register __m128d XMM2;
const unsigned int oLooplength = (p_oNumPoints / 4);
for (i=0; i < (oLooplength*4); i += 4)
{
XMM1 = _mm_loadu_pd(&p_pValues[i]);
XMM2 = _mm_loadu_pd(&p_pValues[i+2]);
XMM0 = _mm_add_pd(XMM0, XMM1);
XMM0 = _mm_add_pd(XMM0, XMM2);
}
_mm_storeu_pd(oTmp, XMM0);
p_rAvg = oTmp[0] + oTmp[1];
使用选项 -O2 -S -msse4.1 时,gcc 生成了以下汇编代码:
b8b3c29e: xorpd %xmm0,%xmm0
"b8b3c2a2: movapd %xmm0,-0xa8(%ebp)"
b8b3c2aa: mov 0xc(%ebp),%eax
b8b3c2ad: shr $0x2,%eax
b8b3c2b0: mov %eax,-0x10(%ebp)
b8b3c2b3: movl $0x0,-0xc(%ebp)
b8b3c2ba: jmp 0xb8b3c35c <_ZN8precitec4math13arithmeticAvgERdjPKdS1_+252>
b8b3c2bf: mov -0xc(%ebp),%eax
b8b3c2c2: shl $0x3,%eax
b8b3c2c5: add 0x10(%ebp),%eax
b8b3c2c8: mov %eax,-0x14(%ebp)
b8b3c2cb: mov -0x14(%ebp),%eax
b8b3c2ce: movupd (%eax),%xmm0
"b8b3c2d2: movapd %xmm0,-0xb8(%ebp)"
b8b3c2da: mov -0xc(%ebp),%eax
b8b3c2dd: add $0x2,%eax
b8b3c2e0: shl $0x3,%eax
b8b3c2e3: add 0x10(%ebp),%eax
b8b3c2e6: mov %eax,-0x18(%ebp)
b8b3c2e9: mov -0x18(%ebp),%eax
b8b3c2ec: movupd (%eax),%xmm0
b8b3c2f0: movapd %xmm0,-0xc8(%ebp)
b8b3c2f8: movapd -0xa8(%ebp),%xmm0
b8b3c300: movapd %xmm0,-0x28(%ebp)
b8b3c305: movapd -0xb8(%ebp),%xmm0
b8b3c30d: movapd %xmm0,-0x38(%ebp)
b8b3c312: movapd -0x38(%ebp),%xmm0
b8b3c317: movapd -0x28(%ebp),%xmm1
b8b3c31c: addpd %xmm1,%xmm0
b8b3c320: movapd %xmm0,-0xa8(%ebp)
b8b3c328: movapd -0xa8(%ebp),%xmm0
b8b3c330: movapd %xmm0,-0x48(%ebp)
b8b3c335: movapd -0xc8(%ebp),%xmm0
b8b3c33d: movapd %xmm0,-0x58(%ebp)
b8b3c342: movapd -0x58(%ebp),%xmm0
b8b3c347: movapd -0x48(%ebp),%xmm1
b8b3c34c: addpd %xmm1,%xmm0
b8b3c350: movapd %xmm0,-0xa8(%ebp)
b8b3c29e:xorpd%xmm0,%xmm0
b8b3c2a2:movapd%xmm0,-0xa8(%ebp)
b8b3c2aa:mov 0xc(%ebp),%eax
b8b3c2ad:shr$0x2,%eax
b8b3c2b0:mov%eax,-0x10(%ebp)
b8b3c2b3:movl$0x0,-0xc(%ebp)
b8b3c2ba:jmp 0xb8b3c35c
b8b3c2bf:mov-0xc(%ebp),%eax
b8b3c2c2:shl$0x3,%eax
b8b3c2c5:add 0x10(%ebp),%eax
b8b3c2c8:mov%eax,-0x14(%ebp)
b8b3c2cb:mov-0x14(%ebp),%eax
b8b3c2ce:movupd(%eax),%xmm0
b8b3c2d2:movapd%xmm0,-0xb8(%ebp)
b8b3c2da:mov-0xc(%ebp),%eax
b8b3c2dd:add $0x2,%eax
b8b3c2e0:shl$0x3,%eax
b8b3c2e3:add 0x10(%ebp),%eax
b8b3c2e6:mov%eax,-0x18(%ebp)
b8b3c2e9:mov-0x18(%ebp),%eax
b8b3c2ec:movupd(%eax),%xmm0
b8b3c2f0:movapd%xmm0,-0xc8(%ebp)
b8b3c2f8:movapd-0xa8(%ebp),%xmm0
b8b3c300:movapd%xmm0,-0x28(%ebp)
b8b3c305:movapd-0xb8(%ebp),%xmm0
b8b3c30d:movapd%xmm0,-0x38(%ebp)
b8b3c312:movapd-0x38(%ebp),%xmm0
b8b3c317:movapd-0x28(%ebp),%xmm1
b8b3c31c:addpd %xmm1,%xmm0
b8b3c320:movapd%xmm0,-0xa8(%ebp)
b8b3c328:movapd-0xa8(%ebp),%xmm0
b8b3c330:movapd%xmm0,-0x48(%ebp)
b8b3c335:movapd-0xc8(%ebp),%xmm0
b8b3c33d:movapd%xmm0,-0x58(%ebp)
b8b3c342:movapd-0x58(%ebp),%xmm0
b8b3c347:movapd-0x48(%ebp),%xmm1
b8b3c34c:addpd %xmm1,%xmm0
b8b3c350:movapd%xmm0,-0xa8(%ebp)
除了循环部分被编译得非常糟糕之外(我尝试过使用 asm("xmm") 属性,但没有成功),程序还会在上面带引号的那些行处反复崩溃。调试时发现,崩溃发生在执行 _mm_setzero_pd() 这条语句时,我认为这是指令重新排序造成的。
VS 2010 将同一段代码编译成了完全不同的指令序列,并且不会出现任何崩溃。
有人知道问题出在哪里,以及该如何解决吗?
谢谢你的帮助,
G. 评论:请给出 gcc --version 的输出。我刚用 4.7.2 试过,生成的汇编代码简洁且正确。——你好,版本是 4.6.1,我必须使用它,因为我们使用的是 Momentics 工具链(toolchain)…… 注释2:最让我困惑的是,未对齐版本的指令被完全忽略了…… 注释3:我补充了函数定义。——gcc 4.4.5 也能正常工作。顺便说一下,gcc 提供了向量扩展(vector extensions),如果可能的话,你也许应该使用它们而不是内部函数。