尝试将SSE2的FAST角点得分代码移植到ARM NEON

尝试将SSE2快速拐角得分代码转换为ARM Neon,arm,sse,neon,computer-vision,Arm,Sse,Neon,Computer Vision,我试图使用ARM Neon指令移植一些SSE2代码(快速角点检测器分数计算)。代码乍一看非常简单,但由于某些原因,结果不同。问题是,有时差异可能非常显著,有时相差2或3个值。如果有人能解释为什么会这样,那就太好了 这是密码 原SSE2: __m128i q0 = _mm_set1_epi16(-1000), q1 = _mm_set1_epi16(1000); for( k = 0; k < 16; k += 8 ) { __m128i v0 = _mm_loadu_si128((

我试图使用ARM NEON指令移植一些SSE2代码(FAST角点检测器的得分计算)。代码乍一看非常简单,但由于某些原因,两者的结果并不相同。问题是,差异有时非常显著,有时只相差2或3个值。如果有人能解释为什么会这样,那就太好了

这是代码

原SSE2:

/*
 * FAST corner score, SSE2 version.
 *
 * `d`, `k` and `threshold` are declared outside this snippet.
 * NOTE(review): d is presumably a pointer to 16-bit signed values (the NEON
 * port casts it to const int16_t*) -- confirm at its declaration. Under that
 * assumption the loads below touch d[0] .. d[24].
 *
 * For each of the 16 start positions p (8 lanes per iteration of k):
 *   lo = min(d[p+1] .. d[p+8]),  hi = max(d[p+1] .. d[p+8])
 * q0 keeps the largest  of min(lo, d[p]) and min(lo, d[p+9]) seen so far,
 * q1 keeps the smallest of max(hi, d[p]) and max(hi, d[p+9]) seen so far.
 * q0 starts at -1000 and q1 at 1000, so those values bound the result.
 */
__m128i q0 = _mm_set1_epi16(-1000), q1 = _mm_set1_epi16(1000);
for( k = 0; k < 16; k += 8 )
{
    __m128i lo = _mm_loadu_si128((__m128i*)(d+k+1));
    __m128i hi = lo;
    __m128i edge;
    int i;

    /* Running lane-wise min/max over the eight inner samples d+k+1 .. d+k+8. */
    for( i = 2; i <= 8; i++ )
    {
        __m128i inner = _mm_loadu_si128((__m128i*)(d+k+i));
        lo = _mm_min_epi16(lo, inner);
        hi = _mm_max_epi16(hi, inner);
    }

    /* Extend the window by one sample on the left (d+k) ... */
    edge = _mm_loadu_si128((__m128i*)(d+k));
    q0 = _mm_max_epi16(q0, _mm_min_epi16(lo, edge));
    q1 = _mm_min_epi16(q1, _mm_max_epi16(hi, edge));

    /* ... and by one sample on the right (d+k+9). */
    edge = _mm_loadu_si128((__m128i*)(d+k+9));
    q0 = _mm_max_epi16(q0, _mm_min_epi16(lo, edge));
    q1 = _mm_min_epi16(q1, _mm_max_epi16(hi, edge));
}

/* Merge both accumulators: every lane becomes max(q0, -q1). */
q0 = _mm_max_epi16(q0, _mm_sub_epi16(_mm_setzero_si128(), q1));

/* Horizontal max of the 8 lanes, ending up in lane 0: 8 -> 4 -> 2 -> 1. */
q0 = _mm_max_epi16(q0, _mm_unpackhi_epi64(q0, q0));
q0 = _mm_max_epi16(q0, _mm_srli_si128(q0, 4));
q0 = _mm_max_epi16(q0, _mm_srli_si128(q0, 2));

/* The low 32 bits hold lanes 0 and 1; the cast to short keeps lane 0 only. */
threshold = (short)_mm_cvtsi128_si32(q0) - 1;
int16x8_t q0 = vdupq_n_s16(-1000), q1 = vdupq_n_s16(1000);
int16x8_t zero = vdupq_n_s16(0);
for( k = 0; k < 16; k += 8 )
{
    int16x8_t v0 = vld1q_s16((const int16_t*)(d+k+1));
    int16x8_t v1 = vld1q_s16((const int16_t*)(d+k+2));
    int16x8_t a = vminq_s16(v0, v1);
    int16x8_t b = vmaxq_s16(v0, v1);
    v0 = vld1q_s16((const int16_t*)(d+k+3));
    a = vminq_s16(a, v0);
    b = vmaxq_s16(b, v0);
    v0 = vld1q_s16((const int16_t*)(d+k+4));
    a = vminq_s16(a, v0);
    b = vmaxq_s16(b, v0);
    v0 = vld1q_s16((const int16_t*)(d+k+5));
    a = vminq_s16(a, v0);
    b = vmaxq_s16(b, v0);
    v0 = vld1q_s16((const int16_t*)(d+k+6));
    a = vminq_s16(a, v0);
    b = vmaxq_s16(b, v0);
    v0 = vld1q_s16((const int16_t*)(d+k+7));
    a = vminq_s16(a, v0);
    b = vmaxq_s16(b, v0);
    v0 = vld1q_s16((const int16_t*)(d+k+8));
    a = vminq_s16(a, v0);
    b = vmaxq_s16(b, v0);
    v0 = vld1q_s16((const int16_t*)(d+k));
    q0 = vmaxq_s16(q0, vminq_s16(a, v0));
    q1 = vminq_s16(q1, vmaxq_s16(b, v0));
    v0 = vld1q_s16((const int16_t*)(d+k+9));
    q0 = vmaxq_s16(q0, vminq_s16(a, v0));
    q1 = vminq_s16(q1, vmaxq_s16(b, v0));
}
q0 = vmaxq_s16(q0, vsubq_s16(zero, q1));
// first mistake it produce wrong result
//q0 = vmaxq_s16(q0, vzipq_s16(q0, q0).val[1]);
// may be someone knows faster/better way?
int16x4_t a_hi = vget_high_s16(q0);
q1 = vcombine_s16(a_hi, a_hi);
q0 = vmaxq_s16(q0, q1);

// this is _mm_srli_si128(q0, 4)
q1 = vextq_s16(q0, zero, 2);

q0 = vmaxq_s16(q0, q1);

// this is _mm_srli_si128(q0, 2)
q1 = vextq_s16(q0, zero, 1);
q0 = vmaxq_s16(q0, q1);

// read the result
int16_t __attribute__ ((aligned (16))) x[8];
vst1q_s16(x, q0);

threshold = x[0] - 1;
\uuuum128i q0=\umm\uset1\uepi16(-1000),q1=\umm\uset1\uepi16(1000);
对于(k=0;k<16;k+=8)
{
__m128i v0=_mm_loadu_si128((u m128i*)(d+k+1));
__m128i v1=_mm_loadu_si128(__m128i*)(d+k+2));
__m128i a=_mm_min_epi16(v0,v1);
__m128i b=_mm_max_epi16(v0,v1);
v0=_mm_loadu_si128((_m128i*)(d+k+3));
a=_mm_min_epi16(a,v0);
b=毫米最大值epi16(b,v0);
v0=毫米荷载si128(((m128i*)(d+k+4));
a=_mm_min_epi16(a,v0);
b=毫米最大值epi16(b,v0);
v0=毫米荷载si128(((m128i*)(d+k+5));
a=_mm_min_epi16(a,v0);
b=毫米最大值epi16(b,v0);
v0=毫米荷载si128(((m128i*)(d+k+6));
a=_mm_min_epi16(a,v0);
b=毫米最大值epi16(b,v0);
v0=毫米荷载si128(((m128i*)(d+k+7));
a=_mm_min_epi16(a,v0);
b=毫米最大值epi16(b,v0);
v0=毫米荷载si128(((m128i*)(d+k+8));
a=_mm_min_epi16(a,v0);
b=毫米最大值epi16(b,v0);
v0=毫米荷载si128(((m128i*)(d+k));
q0=_mm_max_epi16(q0,_mm_min_epi16(a,v0));
q1=最小值epi16(q1,最大值epi16(b,v0));
v0=毫米荷载si128(((m128i*)(d+k+9));
q0=_mm_max_epi16(q0,_mm_min_epi16(a,v0));
q1=最小值epi16(q1,最大值epi16(b,v0));
}
q0=_mm_max_epi16(q0,_mm_sub_epi16(_mm_setzero_si128(),q1));
q0=_-mm_-max_-epi16(q0,_-mm_-epi64(q0,q0));
q0=_-mm_-max_-epi16(q0,_-mm_-srli_-si128(q0,4));
q0=_mm_max_epi16(q0,_mm_srli_si128(q0,2));
阈值=(短)_mm_cvtsi128_si32(q0)-1;
这是可以正常工作的ARM NEON代码:

/*
 * FAST corner score, SSE2 version.
 *
 * `d`, `k` and `threshold` are declared outside this snippet.
 * NOTE(review): d is presumably a pointer to 16-bit signed values (the NEON
 * port casts it to const int16_t*) -- confirm at its declaration. Under that
 * assumption the loads below touch d[0] .. d[24].
 *
 * For each of the 16 start positions p (8 lanes per iteration of k):
 *   lo = min(d[p+1] .. d[p+8]),  hi = max(d[p+1] .. d[p+8])
 * q0 keeps the largest  of min(lo, d[p]) and min(lo, d[p+9]) seen so far,
 * q1 keeps the smallest of max(hi, d[p]) and max(hi, d[p+9]) seen so far.
 * q0 starts at -1000 and q1 at 1000, so those values bound the result.
 */
__m128i q0 = _mm_set1_epi16(-1000), q1 = _mm_set1_epi16(1000);
for( k = 0; k < 16; k += 8 )
{
    __m128i lo = _mm_loadu_si128((__m128i*)(d+k+1));
    __m128i hi = lo;
    __m128i edge;
    int i;

    /* Running lane-wise min/max over the eight inner samples d+k+1 .. d+k+8. */
    for( i = 2; i <= 8; i++ )
    {
        __m128i inner = _mm_loadu_si128((__m128i*)(d+k+i));
        lo = _mm_min_epi16(lo, inner);
        hi = _mm_max_epi16(hi, inner);
    }

    /* Extend the window by one sample on the left (d+k) ... */
    edge = _mm_loadu_si128((__m128i*)(d+k));
    q0 = _mm_max_epi16(q0, _mm_min_epi16(lo, edge));
    q1 = _mm_min_epi16(q1, _mm_max_epi16(hi, edge));

    /* ... and by one sample on the right (d+k+9). */
    edge = _mm_loadu_si128((__m128i*)(d+k+9));
    q0 = _mm_max_epi16(q0, _mm_min_epi16(lo, edge));
    q1 = _mm_min_epi16(q1, _mm_max_epi16(hi, edge));
}

/* Merge both accumulators: every lane becomes max(q0, -q1). */
q0 = _mm_max_epi16(q0, _mm_sub_epi16(_mm_setzero_si128(), q1));

/* Horizontal max of the 8 lanes, ending up in lane 0: 8 -> 4 -> 2 -> 1. */
q0 = _mm_max_epi16(q0, _mm_unpackhi_epi64(q0, q0));
q0 = _mm_max_epi16(q0, _mm_srli_si128(q0, 4));
q0 = _mm_max_epi16(q0, _mm_srli_si128(q0, 2));

/* The low 32 bits hold lanes 0 and 1; the cast to short keeps lane 0 only. */
threshold = (short)_mm_cvtsi128_si32(q0) - 1;
int16x8_t q0 = vdupq_n_s16(-1000), q1 = vdupq_n_s16(1000);
int16x8_t zero = vdupq_n_s16(0);
for( k = 0; k < 16; k += 8 )
{
    int16x8_t v0 = vld1q_s16((const int16_t*)(d+k+1));
    int16x8_t v1 = vld1q_s16((const int16_t*)(d+k+2));
    int16x8_t a = vminq_s16(v0, v1);
    int16x8_t b = vmaxq_s16(v0, v1);
    v0 = vld1q_s16((const int16_t*)(d+k+3));
    a = vminq_s16(a, v0);
    b = vmaxq_s16(b, v0);
    v0 = vld1q_s16((const int16_t*)(d+k+4));
    a = vminq_s16(a, v0);
    b = vmaxq_s16(b, v0);
    v0 = vld1q_s16((const int16_t*)(d+k+5));
    a = vminq_s16(a, v0);
    b = vmaxq_s16(b, v0);
    v0 = vld1q_s16((const int16_t*)(d+k+6));
    a = vminq_s16(a, v0);
    b = vmaxq_s16(b, v0);
    v0 = vld1q_s16((const int16_t*)(d+k+7));
    a = vminq_s16(a, v0);
    b = vmaxq_s16(b, v0);
    v0 = vld1q_s16((const int16_t*)(d+k+8));
    a = vminq_s16(a, v0);
    b = vmaxq_s16(b, v0);
    v0 = vld1q_s16((const int16_t*)(d+k));
    q0 = vmaxq_s16(q0, vminq_s16(a, v0));
    q1 = vminq_s16(q1, vmaxq_s16(b, v0));
    v0 = vld1q_s16((const int16_t*)(d+k+9));
    q0 = vmaxq_s16(q0, vminq_s16(a, v0));
    q1 = vminq_s16(q1, vmaxq_s16(b, v0));
}
q0 = vmaxq_s16(q0, vsubq_s16(zero, q1));
// first mistake it produce wrong result
//q0 = vmaxq_s16(q0, vzipq_s16(q0, q0).val[1]);
// may be someone knows faster/better way?
int16x4_t a_hi = vget_high_s16(q0);
q1 = vcombine_s16(a_hi, a_hi);
q0 = vmaxq_s16(q0, q1);

// this is _mm_srli_si128(q0, 4)
q1 = vextq_s16(q0, zero, 2);

q0 = vmaxq_s16(q0, q1);

// this is _mm_srli_si128(q0, 2)
q1 = vextq_s16(q0, zero, 1);
q0 = vmaxq_s16(q0, q1);

// read the result
int16_t __attribute__ ((aligned (16))) x[8];
vst1q_s16(x, q0);

threshold = x[0] - 1;
int16x8_t q0=vdupq_n_s16(-1000),q1=vdupq_n_s16(1000);
int16x8_t zero=vdupq_n_s16(0);
对于(k=0;k<16;k+=8)
{
int16x8_t v0=vld1q_s16((常数int16_t*)(d+k+1));
int16x8_t v1=vld1q_s16((常数int16_t*)(d+k+2));
int16x8_t a=vminq_s16(v0,v1);
int16x8_t b=vmaxq_s16(v0,v1);
v0=vld1q_s16((常数int16_t*)(d+k+3));
a=vminq_s16(a,v0);
b=vmaxq_s16(b,v0);
v0=vld1q_s16((常数int16_t*)(d+k+4));
a=vminq_s16(a,v0);
b=vmaxq_s16(b,v0);
v0=vld1q_s16((常数int16_t*)(d+k+5));
a=vminq_s16(a,v0);
b=vmaxq_s16(b,v0);
v0=vld1q_s16((常数int16_t*)(d+k+6));
a=vminq_s16(a,v0);
b=vmaxq_s16(b,v0);
v0=vld1q_s16((常数int16_t*)(d+k+7));
a=vminq_s16(a,v0);
b=vmaxq_s16(b,v0);
v0=vld1q_s16((常数int16_t*)(d+k+8));
a=vminq_s16(a,v0);
b=vmaxq_s16(b,v0);
v0=vld1q_s16((常数int16_t*)(d+k));
q0=vmaxq_s16(q0,vminq_s16(a,v0));
q1=vminq_s16(q1,vmaxq_s16(b,v0));
v0=vld1q_s16((常数int16_t*)(d+k+9));
q0=vmaxq_s16(q0,vminq_s16(a,v0));
q1=vminq_s16(q1,vmaxq_s16(b,v0));
}
q0=vmaxq_s16(q0,vsubq_s16(零,q1));
//第一个错误会产生错误的结果
//q0=vmaxq_s16(q0,vzipq_s16(q0,q0).val[1]);
//也许有人知道更快/更好的方法?
int16x4_t a_hi=vget_high_s16(q0);
q1=vcombine_s16(a_hi,a_hi);
q0=vmaxq_s16(q0,q1);
//这是_mm_srli_si128(q0,4)
q1=vextq_s16(q0,0,2);
q0=vmaxq_s16(q0,q1);
//这是_mm_srli_si128(q0,2)
q1=vextq_s16(q0,0,1);
q0=vmaxq_s16(q0,q1);
//阅读结果
int16_t__属性__((对齐(16)))x[8];
vst1q_s16(x,q0);
阈值=x[0]-1;
这段代码看起来是正确的,希望对有需要的人有所帮助……
FAST角点检测器是目前最快的角点检测器。有了这样的小优化,它在手机上可能会运行得更快。

在你的代码片段中加一些注释会有所帮助。你可以使用vextq和一个全零寄存器来模拟_mm_srli_si128这个内建函数(intrinsic)。嗯,我现在对_mm_srli_si128没有问题。问题是,与原始版本相比,整段代码有时会返回错误的结果……构建一个结果不同的测试用例,找出是哪条指令导致结果不同,然后修复问题。vmin和vmax的行为与SSE中的对应指令一样,所以很可能只是代码中有一处小的输入错误。我认为问题出在SSE版本中最后把结果加载为整数的那一步。但如果我试着从我的数据中取出整数,得到的会是非常大的数……顺便说一句,你能看看我补充的NEON代码吗?这是你刚才说的做法吗?谢谢!