小米5S上的ARM NEON汇编性能问题
考虑以下代码,即第一个代码段:小米5S上的ARM NEON汇编性能问题,arm,neon,Arm,Neon,考虑以下代码,即第一个代码段: void run_new(const float* src, float* dst, size_t IH, size_t IW, size_t OH, size_t OW, size_t N) { rep(n, N) { const float* src_ptr = src + IW * IH * n; float* outptr = dst; const float* r0
// Benchmark kernel, variant 1: AArch64 NEON inline assembly that walks the two
// source rows with post-incrementing loads.
//
// For every n in [0, N) (via the project `rep` macro, defined elsewhere --
// presumably a plain counting loop; confirm) and every h in [0, OH), the asm
// loop processes OW / 4 groups. Per group it reads 4 floats from the output,
// 8 floats from r0 and 8 floats from r1, accumulates each 4-float half scaled
// by one lane of k0123 into the output vector, and stores it back.
//
// Parameters:
//   src     input buffer; batch n starts at src + IW * IH * n
//   dst     output buffer; read and accumulated into in place
//   IH, IW  used only to compute the per-batch offset and the second row (r1)
//   OH, OW  number of outer row iterations and output width in floats
//   N       number of batches
//
// NOTE(review): outptr is reset to dst for every n, so every batch accumulates
// into the same output region. r0/r1/outptr are declared outside the h loop
// and are "+r" operands, so they keep advancing across rows instead of being
// recomputed per row -- presumably intentional for a benchmark; confirm.
// NOTE(review): the loop is bottom-tested and x3 is already 1 at the first
// compare, so width == 0 (OW < 4) does not exit the loop as one would expect.
// The OW & 3 remainder columns are not handled in this block.
void run_new(const float* src, float* dst,
size_t IH, size_t IW, size_t OH, size_t OW,
size_t N) {
rep(n, N) {
// Start of batch n in the source buffer.
const float* src_ptr = src + IW * IH * n;
float* outptr = dst;
// r0 / r1: first and second source row of this batch.
const float* r0 = src_ptr;
const float* r1 = src_ptr + IW;
// All four coefficient lanes are 3.0f.
float32x4_t k0123 = vdupq_n_f32(3.f);
rep(h, OH) {
// Number of 4-float output groups handled by one run of the asm loop.
size_t width = OW >> 2;
asm volatile(
// Broadcast each lane of k0123 into its own vector register (v21..v24).
"dup v21.4s, %4.s[0] \n"
"dup v22.4s, %4.s[1] \n"
"dup v23.4s, %4.s[2] \n"
"dup v24.4s, %4.s[3] \n"
// x3 = iteration counter, starts at 0.
"mov x3, xzr \n"
"0: \n"
// v0 = current 4 output floats (accumulator).
"ldr q0, [%1] \n"
// v1, v2 = 8 floats from r0; r0 advances by 32 bytes.
"ld1 {v1.4s, v2.4s}, [%2], #32 \n"
// Counter increment and compare are issued early; the result is consumed
// by the bne at the bottom of the loop.
"add x3, x3, #0x1 \n"
"cmp %0, x3 \n"
// v3, v4 = 8 floats from r1; r1 advances by 32 bytes.
"ld1 {v3.4s, v4.4s}, [%3], #32 \n"
"fmla v0.4s, v1.4s, v21.4s \n" // src[i] * k[i]
"fmla v0.4s, v2.4s, v22.4s \n"
"fmla v0.4s, v3.4s, v23.4s \n"
"fmla v0.4s, v4.4s, v24.4s \n"
// Store the accumulated 4 floats; outptr advances by 16 bytes.
"str q0, [%1], #16 \n"
// Loop until x3 == width.
"bne 0b \n"
// %0 = width, %1 = outptr, %2 = r0, %3 = r1 (read-write), %4 = k0123 (input).
: "+r"(width), "+r"(outptr), "+r"(r0), "+r"(r1)
: "w"(k0123)
: "cc", "memory", "x3", "v0", "v1", "v2", "v3", "v4", "v21", "v22", "v23", "v24");
}
}
}
第二个代码段:
// Benchmark kernel, variant 2: same arithmetic as the post-increment variant,
// but the source rows are addressed as base + running byte offset.
//
// Inside the asm loop x4 holds the byte offset into both source rows and x19
// is a scratch register holding the effective address for each ld1. r0 and r1
// themselves are only updated once, after the loop, by adding the final x4.
//
// Parameters:
//   src     input buffer; batch n starts at src + IW * IH * n
//   dst     output buffer; read and accumulated into in place
//   IH, IW  used only to compute the per-batch offset and the second row (r1)
//   OH, OW  number of outer row iterations and output width in floats
//   N       number of batches
//
// NOTE(review): outptr is reset to dst for every n, and r0/r1/outptr are not
// recomputed per row h -- presumably intentional for a benchmark; confirm.
// NOTE(review): the loop is bottom-tested and x3 is already 1 at the first
// compare, so width == 0 (OW < 4) does not exit the loop as one would expect.
// The OW & 3 remainder columns are not handled in this block.
void run_origin(const float* src, float* dst,
size_t IH, size_t IW, size_t OH, size_t OW,
size_t N) {
rep(n, N) {
// Start of batch n in the source buffer.
const float* src_ptr = src + IW * IH * n;
float* outptr = dst;
// r0 / r1: first and second source row of this batch.
const float* r0 = src_ptr;
const float* r1 = src_ptr + IW;
// All four coefficient lanes are 3.0f.
float32x4_t k0123 = vdupq_n_f32(3.f);
rep(h, OH) {
// Number of 4-float output groups handled by one run of the asm loop.
size_t width = OW >> 2;
asm volatile(
// Broadcast each lane of k0123 into its own vector register (v21..v24).
"dup v21.4s, %4.s[0] \n"
"dup v22.4s, %4.s[1] \n"
"dup v23.4s, %4.s[2] \n"
"dup v24.4s, %4.s[3] \n"
// x3 = iteration counter, x4 = byte offset into the source rows.
"mov x3, xzr \n"
"mov x4, xzr \n"
"0: \n"
// x19 = r0 + offset: address of the next 8 floats of the first row.
"add x19, %2, x4 \n"
"ldr q0, [%1] \n" // load dst 0, 1, 2, 3
"ld1 {v1.4s, v2.4s}, [x19]\n" // 1, 2, 4, 6
// Counter increment and compare are issued early; the result is consumed
// by the bne at the bottom of the loop.
"add x3, x3, #0x1 \n"
"cmp %0, x3 \n"
// x19 = r1 + offset: address of the next 8 floats of the second row.
"add x19, %3, x4 \n"
"ld1 {v3.4s, v4.4s}, [x19]\n"
"fmla v0.4s, v1.4s, v21.4s \n" // src[i] * k[i]
"fmla v0.4s, v2.4s, v22.4s \n"
"fmla v0.4s, v3.4s, v23.4s \n"
"fmla v0.4s, v4.4s, v24.4s \n"
// Advance the shared byte offset by 32 (8 floats).
"add x4, x4, #0x20 \n"
// Store the accumulated 4 floats; outptr advances by 16 bytes.
"str q0, [%1], #16 \n"
// Loop until x3 == width.
"bne 0b \n"
// Write the total consumed byte count back into r0 and r1.
"add %2, %2, x4 \n"
"add %3, %3, x4 \n"
// %0 = width, %1 = outptr, %2 = r0, %3 = r1 (read-write), %4 = k0123 (input).
: "+r"(width), "+r"(outptr), "+r"(r0), "+r"(r1)
: "w"(k0123)
: "cc", "memory", "x3", "x4", "x19", "v0", "v1", "v2", "v3", "v4", "v21", "v22", "v23", "v24");
}
}
}
所有的代码都在
我在 xiaomi5s、xiaomi6 和 redmi 上测试了这两个代码段的性能,具体性能如下:
N:12 IH:224 IW:224 OH:112 OW:112
小米5s上的性能测试结果让我很困惑:为什么第一个代码段在小米5s上的性能这么差?
我猜这可能是由于NEON流水线发生了停顿(pipeline stall):当NEON指令需要等待通用寄存器时,例如 ld1 {v3.4s, v4.4s}, [x19] 要等待 x19,而 x19 是由 add x19, %3, x4 计算出来的。但我不是很确定。
补充详情:
编译命令:clang++ -std=c++11 -Ofast
我把 ldr q0, [%2] 更改为 ld1 {v0.4s}, [%2],但结果是相同的,run_origin 的性能可能会稍快一些,大约 1%-3%。
性能数据:origin:743.95433 mflops --- asm:4756.65769 mflops --- 加速比:6.39375
我对NEON64的熟悉程度不如对NEON32的熟悉,但在您的代码中有几件事我不会做:
- 为什么要使用VFP指令“ldr”?在VFP和NEON之间切换可能会花费大量的周期,特别是当这些指令是内存访问指令时。两者共用同一组寄存器,并不意味着它们是同一个执行单元。请将其改为 LD1 …… .4s 形式的NEON加载指令。
- 您想要32位还是64位?在 x3 和 w3 之间选定一个,并始终保持一致。
- 你确定要用 fmla 进行融合乘加(fused multiply-add)运算吗?也许是,也许不是,但请留意这一点。