C++ NEON 增加了运行时间_C++_Neon

C++ NEON 增加了运行时间

c++

C++ NEON 增加了运行时间,c++,neon,C++,Neon,我目前正在尝试优化我的一些图像处理代码，以使用 NEON 指令。假设我有非常大的浮点数组，我想把第一个数组的每个值乘以第二个数组的三个连续值。（第二个数组是第一个的三倍大。） float* l_ptrGauss_pf32 = […]; float* l_ptrLaplace_pf32 = […]; // 三倍大 for( uint64_t k=0; k …

我目前正在尝试优化我的一些图像处理代码，以使用 NEON 指令。

假设我有非常大的浮点数组，我想把第一个数组的每个值乘以第二个数组的三个连续值。（第二个是三倍大。）

float* l_ptrGauss_pf32 = […];
float* l_ptrLaplace_pf32 = […]; // 三倍大
for( uint64_t k=0; k …（此处原始代码被截断）


因此，当我用NEON Intrinsic替换上述代码时，运行时间大约要长10%
// Scales an interleaved 3-channel float buffer by a per-pixel weight:
// gauss[k] multiplies laplace[3k], laplace[3k+1] and laplace[3k+2].
//
// The three channels are loaded with vld3q_f32, which de-interleaves 12
// consecutive floats so that lane i of every channel register belongs to
// pixel i. That lines each lane up with lane i of the weight vector.
// Three plain vld1q_f32 loads would instead pair g0..g3 with l0..l3,
// l4..l7 and l8..l11 lane by lane, applying the wrong weight to most values.
float32x4_t   l_gaussElem_f32x4;
float32x4x3_t l_laplElems_f32x4x3;

// Each iteration consumes 4 weights and 12 interleaved values.
// NOTE(review): when l_lastPixelInBlock_ui64 is not a multiple of 4, the
// remaining (l_lastPixelInBlock_ui64 % 4) pixels are not processed here;
// confirm that the caller pads the buffers or handles the tail separately.
for( uint64_t k=0; k<(l_lastPixelInBlock_ui64/4); ++k)
{
    // Weights of pixels 0..3.
    l_gaussElem_f32x4   = vld1q_f32(l_ptrGauss_pf32);
    // val[c] holds channel c of pixels 0..3.
    l_laplElems_f32x4x3 = vld3q_f32(l_ptrLaplace_pf32);

    l_laplElems_f32x4x3.val[0] = vmulq_f32(l_gaussElem_f32x4, l_laplElems_f32x4x3.val[0]);
    l_laplElems_f32x4x3.val[1] = vmulq_f32(l_gaussElem_f32x4, l_laplElems_f32x4x3.val[1]);
    l_laplElems_f32x4x3.val[2] = vmulq_f32(l_gaussElem_f32x4, l_laplElems_f32x4x3.val[2]);

    // Re-interleave and write the 12 results back in place.
    vst3q_f32(l_ptrLaplace_pf32, l_laplElems_f32x4x3);

    l_ptrLaplace_pf32 += 12;
    l_ptrGauss_pf32   += 4;
}

float32x4_t l_gaussElem_f32x4;
float32x4_t l_laplElem1_f32x4;
float32x4_t l_laplElem2_f32x4;
float32x4_t l_laplElem3_f32x4;
for( uint64_t k=0; k …（此处原始代码被截断）

您的代码包含相对较多的向量加载操作和少量乘法操作。因此我建议优化向量加载。有两个步骤：

为数组使用对齐的内存
使用预取

为此，我建议使用下面的函数：
/// Loads four consecutive floats into a NEON register.
///
/// Before loading, a prefetch hint is issued for the address 256 floats
/// past `p`, and the compiler is told that `p` is 16-byte aligned.
///
/// @param p Address of the first of four floats to load. The caller must
///          guarantee 16-byte alignment, because the compiler is allowed to
///          rely on it.
/// @return A vector holding p[0], p[1], p[2], p[3].
inline float32x4_t Load(const float * p)
{
    // Hint the cache hierarchy to start fetching data ahead of the current position.
    __builtin_prefetch(p + 256);
    // Promise the compiler that the address is aligned. The pointer stays
    // const: vld1q_f32 accepts a const pointer, so nothing needs casting away.
    const float * const aligned = static_cast<const float *>(__builtin_assume_aligned(p, 16));
    return vld1q_f32(aligned);
}

你检查过编译器生成的代码吗？Clang 确实会使用 NEON 指令（前提是你“允许”它这样做），而不必借助内部函数（intrinsics），而且通常比手写内部函数做得更好（因为它在如何安排指令和操作方面更聪明，而内部函数往往被视为“必须完全照此执行”）。也可能是由于手动展开，循环的展开或其他重新排列变得不同。这是针对 armv7 还是 arm64 的？另外说句题外话：对于 k 和 l_lastPixelInBlock_ui64，可能最好使用 size_t（因为它在 armv7 上是 32 位的）。此外，指针是否可能互为别名？如果不会，将其标记为 restrict 应该会有所帮助。
/// Loads four consecutive floats into a NEON register.
///
/// Before loading, a prefetch hint is issued for the address 256 floats
/// past `p`, and the compiler is told that `p` is 16-byte aligned.
///
/// @param p Address of the first of four floats to load. The caller must
///          guarantee 16-byte alignment, because the compiler is allowed to
///          rely on it.
/// @return A vector holding p[0], p[1], p[2], p[3].
inline float32x4_t Load(const float * p)
{
    // Hint the cache hierarchy to start fetching data ahead of the current position.
    __builtin_prefetch(p + 256);
    // Promise the compiler that the address is aligned. The pointer stays
    // const: vld1q_f32 accepts a const pointer, so nothing needs casting away.
    const float * const aligned = static_cast<const float *>(__builtin_assume_aligned(p, 16));
    return vld1q_f32(aligned);
}