小米5S上的ARM NEON汇编性能问题

小米5S上的ARM NEON汇编性能问题,arm,neon,Arm,Neon,考虑以下代码,即第一个代码段: void run_new(const float* src, float* dst, size_t IH, size_t IW, size_t OH, size_t OW, size_t N) { rep(n, N) { const float* src_ptr = src + IW * IH * n; float* outptr = dst; const float* r0

考虑以下代码,即第一个代码段:

/*
 * run_new: for every n in [0, N) and every h in [0, OH), runs an inline
 * AArch64 NEON loop of (OW >> 2) iterations. Each iteration:
 *   - loads 4 floats from outptr,
 *   - loads 8 floats from r0 and 8 floats from r1, post-incrementing each
 *     pointer by 32 bytes inside the load instruction itself,
 *   - accumulates four fused multiply-adds against the broadcast lanes of
 *     k0123 into the 4 loaded output floats,
 *   - stores the 4 floats back and advances outptr by 16 bytes.
 *
 * outptr, r0 and r1 are read-write asm operands, so their advanced values
 * carry over from one h iteration to the next.
 *
 * `rep` is a macro defined outside this snippet; presumably rep(i, n) is a
 * counted loop of i over [0, n) -- confirm against its definition.
 *
 * NOTE(review): the asm loop tests its exit condition only after the body has
 * run, so width == 0 (OW < 4) does not terminate after zero iterations.
 */
void run_new(const float* src, float* dst,
        size_t IH, size_t IW, size_t OH, size_t OW,
        size_t N) {
    rep(n, N) {
        const float* src_ptr = src + IW * IH * n;  // input offset for index n: IW * IH * n floats
        float* outptr = dst;  // output pointer restarts at dst for every n

        const float* r0 = src_ptr;
        const float* r1 = src_ptr + IW;  // r1 starts IW floats after r0

        float32x4_t k0123 = vdupq_n_f32(3.f);  // all four lanes hold 3.0f
        rep(h, OH) {
            size_t width = OW >> 2;  // asm loop trip count: one iteration per 4 output floats

            // Operands: %0 = width, %1 = outptr, %2 = r0, %3 = r1, %4 = k0123.
            asm volatile(
                    "dup v21.4s, %4.s[0] \n"  // v21 = lane 0 of k0123 in all lanes
                    "dup v22.4s, %4.s[1] \n"  // v22 = lane 1 of k0123 in all lanes
                    "dup v23.4s, %4.s[2] \n"  // v23 = lane 2 of k0123 in all lanes
                    "dup v24.4s, %4.s[3] \n"  // v24 = lane 3 of k0123 in all lanes
                    "mov x3, xzr \n"  // x3 = iteration counter, starts at 0
                    "0:           \n"
                    "ldr q0, [%1] \n"  // v0 = 4 floats currently at outptr
                    "ld1 {v1.4s, v2.4s}, [%2], #32 \n"  // v1, v2 = 8 floats at r0; r0 += 32 bytes

                    "add x3, x3, #0x1 \n"  // ++counter
                    "cmp %0, x3 \n"  // compare width with counter; flags are consumed by bne below

                    "ld1 {v3.4s, v4.4s}, [%3], #32 \n"  // v3, v4 = 8 floats at r1; r1 += 32 bytes
                    "fmla v0.4s, v1.4s, v21.4s \n"  // src[i] * k[i]
                    "fmla v0.4s, v2.4s, v22.4s \n"  // v0 += v2 * v22

                    "fmla v0.4s, v3.4s, v23.4s \n"  // v0 += v3 * v23
                    "fmla v0.4s, v4.4s, v24.4s \n"  // v0 += v4 * v24

                    "str q0, [%1], #16 \n"  // store v0 at outptr; outptr += 16 bytes
                    "bne 0b \n"  // repeat while counter != width
                    : "+r"(width), "+r"(outptr), "+r"(r0), "+r"(r1)
                    : "w"(k0123)
                          : "cc", "memory", "x3", "v0", "v1", "v2", "v3", "v4", "v21", "v22", "v23", "v24");
        }

    }
}
第二个代码段:

 /*
  * run_origin: same arithmetic as the loop described in its own comments
  * below, but the two source pointers are NOT advanced by the load
  * instructions. Instead a byte offset is kept in x4, the effective address
  * is formed in x19 before each 8-float load, and r0/r1 are advanced by the
  * accumulated offset once, after the loop.
  *
  * For every n in [0, N) and every h in [0, OH) the asm loop runs (OW >> 2)
  * iterations; each one loads 4 floats from outptr, 8 floats from r0 + x4 and
  * 8 floats from r1 + x4, applies four fused multiply-adds against the
  * broadcast lanes of k0123, stores the 4 floats back and advances outptr by
  * 16 bytes.
  *
  * `rep` is a macro defined outside this snippet; presumably rep(i, n) is a
  * counted loop of i over [0, n) -- confirm against its definition.
  *
  * NOTE(review): the asm loop tests its exit condition only after the body has
  * run, so width == 0 (OW < 4) does not terminate after zero iterations.
  */
 void run_origin(const float* src, float* dst,
        size_t IH, size_t IW, size_t OH, size_t OW,
        size_t N) {

    rep(n, N) {
        const float* src_ptr = src + IW * IH * n;  // input offset for index n: IW * IH * n floats
        float* outptr = dst;  // output pointer restarts at dst for every n

        const float* r0 = src_ptr;
        const float* r1 = src_ptr + IW;  // r1 starts IW floats after r0

        float32x4_t k0123 = vdupq_n_f32(3.f);  // all four lanes hold 3.0f
        rep(h, OH) {
            size_t width = OW >> 2;  // asm loop trip count: one iteration per 4 output floats

            // Operands: %0 = width, %1 = outptr, %2 = r0, %3 = r1, %4 = k0123.
            asm volatile(
                    "dup v21.4s, %4.s[0] \n"  // v21 = lane 0 of k0123 in all lanes
                    "dup v22.4s, %4.s[1] \n"  // v22 = lane 1 of k0123 in all lanes
                    "dup v23.4s, %4.s[2] \n"  // v23 = lane 2 of k0123 in all lanes
                    "dup v24.4s, %4.s[3] \n"  // v24 = lane 3 of k0123 in all lanes
                    "mov x3, xzr \n"  // x3 = iteration counter, starts at 0
                    "mov x4, xzr \n"  // x4 = byte offset into both source rows, starts at 0
                    "0:           \n"
                    "add x19, %2, x4 \n"  // x19 = r0 + byte offset
                    "ldr q0, [%1] \n"  // load dst 0, 1, 2, 3
                    "ld1 {v1.4s, v2.4s}, [x19]\n"  // v1, v2 = 8 floats at r0 + x4 (no writeback)

                    "add x3, x3, #0x1 \n"  // ++counter
                    "cmp %0, x3 \n"  // compare width with counter; flags are consumed by bne below

                    "add x19, %3, x4 \n"  // x19 = r1 + byte offset
                    "ld1 {v3.4s, v4.4s}, [x19]\n"  // v3, v4 = 8 floats at r1 + x4 (no writeback)
                    "fmla v0.4s, v1.4s, v21.4s \n"  // src[i] * k[i]
                    "fmla v0.4s, v2.4s, v22.4s \n"  // v0 += v2 * v22

                    "fmla v0.4s, v3.4s, v23.4s \n"  // v0 += v3 * v23
                    "fmla v0.4s, v4.4s, v24.4s \n"  // v0 += v4 * v24

                    "add x4, x4, #0x20 \n"  // byte offset += 32 (8 floats)
                    "str q0, [%1], #16 \n"  // store v0 at outptr; outptr += 16 bytes
                    "bne 0b \n"  // repeat while counter != width
                    "add %2, %2, x4 \n"  // after the loop: r0 += total byte offset
                    "add %3, %3, x4 \n"  // after the loop: r1 += total byte offset
                    : "+r"(width), "+r"(outptr), "+r"(r0), "+r"(r1)
                    : "w"(k0123)
                          : "cc", "memory", "x3", "x4", "x19", "v0", "v1", "v2", "v3", "v4", "v21", "v22", "v23", "v24");

        }

    }
}
所有的代码都在(原文此处的链接已丢失)

我在 xiaomi5s、xiaomi6 和 redmi 上测试了这两段代码的性能,具体结果如下:

N:12 IH:224 IW:224 OH:112 OW:112

  • origin:325.35058 mflops --- new:4275.63483 mflops --- 加速比:13.14162(小米5s)
  • origin:3082.00078 mflops --- new:3063.45047 mflops --- 加速比:0.99398(小米6)
  • origin:1761.05058 mflops --- new:1814.37185 mflops --- 加速比:1.03028(redmi)
  • 以下是小米5S上的测试结果

    N:12 IH:48-256 IW:224

  • N:12 IH:48 IW:224 OH:24 OW:112 origin:3721.16633 mflops --- new:4935.31729 mflops --- 加速比:1.32628
  • N:12 IH:80 IW:224 OH:40 OW:112 origin:1185.58378 mflops --- new:3852.38266 mflops --- 加速比:3.24936
  • N:12 IH:112 IW:224 OH:56 OW:112 origin:1021.83468 mflops --- new:3503.70672 mflops --- 加速比:3.42884
  • N:12 IH:144 IW:224 OH:72 OW:112 origin:797.61461 mflops --- new:4167.12780 mflops --- 加速比:5.22449
  • N:12 IH:176 IW:224 OH:88 OW:112 origin:465.55073 mflops --- new:4084.54206 mflops --- 加速比:8.77357
  • N:12 IH:208 IW:224 OH:104 OW:112 origin:373.99237 mflops --- new:4255.78687 mflops --- 加速比:11.37934
  • N:12 IH:240 IW:224 OH:120 OW:112 origin:341.57406 mflops --- new:4290.58840 mflops --- 加速比:12.56122
    N:12 IH:224 IW:48-256

  • N:12 IH:224 IW:48 OH:112 OW:24 origin:3660.35916 mflops --- new:4729.61877 mflops --- 加速比:1.29212
  • N:12 IH:224 IW:80 OH:112 OW:40 origin:2918.48755 mflops --- new:4748.17285 mflops --- 加速比:1.62693
  • N:12 IH:224 IW:112 OH:112 OW:56 origin:951.03852 mflops --- new:4051.84318 mflops --- 加速比:4.26044
  • N:12 IH:224 IW:144 OH:112 OW:72 origin:1186.74405 mflops --- new:4160.18572 mflops --- 加速比:3.50555
  • N:12 IH:224 IW:176 OH:112 OW:88 origin:533.47286 mflops --- new:4199.36622 mflops --- 加速比:7.87175
  • N:12 IH:224 IW:208 OH:112 OW:104 origin:447.30682 mflops --- new:4092.22256 mflops --- 加速比:9.14858
  • N:12 IH:224 IW:240 OH:112 OW:120 origin:442.58206 mflops --- new:4200.13672 mflops --- 加速比:9.49007
    N(原文写作 IC):2-12 IH:224 IW:224

  • N:2 IH:224 IW:224 OH:112 OW:112 origin:3794.45684 mflops --- new:5236.48508 mflops --- 加速比:1.38004
  • N:3 IH:224 IW:224 OH:112 OW:112 origin:3790.20521 mflops --- new:5150.30622 mflops --- 加速比:1.35885
  • N:4 IH:224 IW:224 OH:112 OW:112 origin:2117.55521 mflops --- new:4329.34274 mflops --- 加速比:2.04450
  • N:5 IH:224 IW:224 OH:112 OW:112 origin:1290.43541 mflops --- new:3915.65607 mflops --- 加速比:3.03437
  • N:6 IH:224 IW:224 OH:112 OW:112 origin:1038.86926 mflops --- new:3747.69392 mflops --- 加速比:3.60747
  • N:7 IH:224 IW:224 OH:112 OW:112 origin:845.26878 mflops --- new:4025.81237 mflops --- 加速比:4.76276
  • N:8 IH:224 IW:224 OH:112 OW:112 origin:658.23150 mflops --- new:3971.62335 mflops --- 加速比:6.03378
  • N:9 IH:224 IW:224 OH:112 OW:112 origin:527.99489 mflops --- new:4163.94501 mflops --- 加速比:7.88634
  • N:10 IH:224 IW:224 OH:112 OW:112 origin:416.75353 mflops --- new:4119.03296 mflops --- 加速比:9.88362
  • N:11 IH:224 IW:224 OH:112 OW:112 origin:378.38875 mflops --- new:4203.33717 mflops --- 加速比:11.10852
  • N:12 IH:224 IW:224 OH:112 OW:112 origin:350.36924 mflops --- new:4202.19842 mflops --- 加速比:11.99363
  • 我被小米5s上的性能测试结果搞糊涂了:为什么 run_origin(即上面的第二个代码段)在小米5s上的性能这么差?

    我猜这可能是 NEON 流水线停顿(pipeline stall)造成的:例如
    ld1 {v3.4s, v4.4s}, [x19]
    需要等待通用寄存器 x19,而 x19 是由
    add x19, %3, x4
    计算出来的。但我不是很确定。

    补充详情:

  • 小米5S cpu:高通Snapdragon 821
  • 小米6 cpu:高通Snapdragon 835
  • redmi cpu:MediaTek Helio X20
  • 编译选项(clang版本:5.0.0):
    clang++-std=c++11-Ofast


  • 我将
    ldr q0, [%2]
    更改为
    ld1 v0.4s, [%2]
    ,但结果基本相同,
    run_origin
    的性能可能会稍快一些,大约 1%-3%
  • N:12 IH:224 IW:224 OH:112 OW:112

    origin:342.96631 mflops --- asm:4288.51646 mflops --- 加速比:12.50419


  • 我将 fmla v0.4s, v1.4s, v21.4s 更改为 smlsl2 v0.2d, v1.4s, v21.4s,但结果相同。 N:12 IH:224 IW:224 OH:112 OW:112

  • origin:348.03699 mflops --- asm:4245.18804 mflops --- 加速比:12.19752


  • 我将 fmla v0.4s, v1.4s, v21.4s 更改为 fadd v0.4s, v1.4s, v21.4s,此时 run_origin 会变快一些
  • N:12 IH:224 IW:224 OH:112 OW:112


    origin:743.95433 mflops --- asm:4756.65769 mflops --- 加速比:6.39375

    我对NEON64的熟悉程度不如对NEON32的熟悉,但在您的代码中有几件事我不会做:

    • 为什么要使用 VFP 指令 “ldr”?在 VFP 和 NEON 之间切换可能会花费大量的周期,特别是当这些指令是内存访问指令时。两者共享寄存器并不意味着它们是同一个执行单元。请把它改成 ld1 {... .4s} 的形式。

    • 您想要32位还是64位?选择x3或w3,并坚持下去

    • 你确定要用fmla进行融合乘法运算吗?也许是,也许不是,但请注意