使用ARM NEON执行所需时间比C代码长
我把Brisk中一个用于调整图像大小的函数从SSE intrinsics转换成了ARM NEON intrinsics,以便在ARM架构上执行它。Brisk在支持SSE时使用SSE函数,否则使用OpenCV函数。SSE版本当然更快。 我在ARM NEON中一步一步地转换了SSE函数,但是当我测量执行时间时,与OpenCV resize函数的执行时间相比,结果是我的函数慢了0.2ms,而不是0.4ms。代码如下: SSE版本: } ARM NEON版本:使用ARM NEON执行所需时间比C代码长,arm,sse,neon,intrinsics,Arm,Sse,Neon,Intrinsics,我把Brisk中一个用于调整图像大小的函数从SSE intrinsics转换成了ARM NEON intrinsics,以便在ARM架构上执行它。Brisk在支持SSE时使用SSE函数,否则使用OpenCV函数。SSE版本当然更快。 我在ARM NEON中一步一步地转换了SSE函数,但是当我测量执行时间时,与OpenCV resize函数的执行时间相比,结果是我的函数慢了0.2ms,而不是0.4ms。代码如下: SSE版本: } ARM NEON版本: void halfsample(const cv::Ma
/**
 * @brief Half-sample an image: every destination pixel is the average of a
 *        2x2 block of source pixels.
 *
 * The bulk of each row pair is processed with NEON, 32 source columns
 * (two 16-byte blocks) per iteration:
 *   1. vertical rounding average of the upper and lower row (vrhaddq_u8),
 *   2. split into even-indexed and odd-indexed columns,
 *   3. horizontal rounding average of the even and odd columns.
 * If a row contains an odd number of 16-byte blocks, the last block is
 * averaged vertically with NEON and horizontally with scalar code
 * (truncating division by 2). Any remaining (cols % 16) source columns are
 * handled by a scalar 2x2 average (truncating division by 4).
 *
 * No diagnostic output is produced here; this routine is timing-sensitive.
 *
 * @param srcimg Source image. The pointer arithmetic treats the data as one
 *               byte per pixel with a row stride of exactly `cols` bytes
 *               (assumes a continuous 8-bit single-channel cv::Mat -- TODO
 *               confirm against callers).
 * @param dstimg Destination image, written in place. Assumed to be already
 *               allocated with cols/2 x rows/2 bytes; this is not checked.
 *
 * @note The code relies on implicit conversions between NEON vector types
 *       (int32x4_t <-> uint8x16_t etc.), e.g. GCC's -flax-vector-conversions.
 */
void halfsample(const cv::Mat& srcimg, cv::Mat& dstimg){
    // Number of destination pixels per row produced by the scalar tail.
    // May be zero even when cols%16 != 0 (cols%16 == 1).
    const unsigned int leftoverCols = (srcimg.cols%16)/2;

    // Keeps the low byte of every 16-bit lane, i.e. the even-indexed columns.
    const int32x4_t mask = vdupq_n_s32(0x00FF00FF);
    // Shifted-in filler for the byte-wise shift below.
    const int32x4_t zero = vdupq_n_s32(0);

    // Row pointers: p1 walks the upper row, p2 the lower row of each pair.
    int32_t* p1=(int32_t*)srcimg.data;
    int32_t* p2=(int32_t*)(srcimg.data+srcimg.cols);
    int32_t* p_dest=(int32_t*)dstimg.data;

    // Sizes expressed in 16-byte blocks.
    const unsigned int size = (srcimg.cols*srcimg.rows)/16;   // whole image
    const unsigned int hsize = srcimg.cols/16;                // one row
    const int32_t* const p_end=p1+size*4;
    const unsigned int end=hsize/2;           // 32-column iterations per row
    const bool half_end=(hsize%2)!=0;         // one 16-byte block left over?

    unsigned int row=0;
    while(p2<p_end){
        for(unsigned int i=0; i<end; i++){
            // Vertical average of the first 16 columns.
            int32x4_t upper=vld1q_s32(p1);
            int32x4_t lower=vld1q_s32(p2);
            const int32x4_t result1=vrhaddq_u8(upper, lower);
            p1=p1+4;
            p2=p2+4;

            // Vertical average of the next 16 columns.
            upper=vld1q_s32(p1);
            lower=vld1q_s32(p2);
            const int32x4_t result2=vrhaddq_u8(upper, lower);
            p1=p1+4;
            p2=p2+4;

            // Shift down by one byte so that the odd-indexed columns land in
            // the positions selected by the mask.
            const int32x4_t result1_shifted = vextq_u8(result1, zero, 1);
            const int32x4_t result2_shifted = vextq_u8(result2, zero, 1);

            // Pack the even columns and the odd columns into 16 bytes each.
            int32x4_t result = vcombine_u8(vqmovn_u16(vandq_u32(result1, mask)),
                                           vqmovn_u16(vandq_u32(result2, mask)));
            const int32x4_t result_shifted =
                vcombine_u8(vqmovn_u16(vandq_u32(result1_shifted, mask)),
                            vqmovn_u16(vandq_u32(result2_shifted, mask)));

            // Horizontal average, then store 16 destination pixels.
            result=vrhaddq_u8(result, result_shifted);
            vst1q_s32(p_dest, result);
            p_dest=p_dest+4;
        }

        // Byte-granular write cursor for the partial-block paths below.
        unsigned char* p_dest_char=(unsigned char*)p_dest;

        if(half_end){
            // One 16-byte block remains: vertical average with NEON, then a
            // scalar pairwise horizontal average producing 8 pixels.
            const int32x4_t upper=vld1q_s32(p1);
            const int32x4_t lower=vld1q_s32(p2);
            int32x4_t result1=vrhaddq_u8(upper, lower);
            p1=p1+4;
            p2=p2+4;

            const UCHAR_ALIAS* result=(UCHAR_ALIAS*)&result1;
            for(unsigned int j=0; j<8; j++){
                *(p_dest_char++)=(*(result+2*j)+*(result+2*j+1))/2;
            }
        }

        // Scalar tail for the last (cols % 16) source columns; runs zero
        // times when the row width is a multiple of 16. Destination pixel c
        // covers source columns 2*c and 2*c+1 of both rows.
        const unsigned char* p1_src_char=(const unsigned char*)p1;
        const unsigned char* p2_src_char=(const unsigned char*)p2;
        for(unsigned int c=0; c<leftoverCols; c++){
            const unsigned int tmp = p1_src_char[2*c]+p1_src_char[2*c+1]+
                                     p2_src_char[2*c]+p2_src_char[2*c+1];
            *(p_dest_char++)=(unsigned char)(tmp/4);
        }

        // Done with this row pair: re-derive all pointers from the row index
        // so that partial blocks cannot accumulate drift.
        row++;
        p_dest=(int32_t*)(dstimg.data+row*dstimg.cols);
        p1=(int32_t*)(srcimg.data+2*row*srcimg.cols);
        p2=(int32_t*)(srcimg.data+(2*row+1)*srcimg.cols);
    }
}
ARM版本和SSE版本函数的输出完全相同。问题在于执行时间。您应该知道,无论是intrinsics还是内联汇编代码,都不可能像手工编写的原生汇编代码那样完美无瑕。 更糟糕的是,有时编译器——特别是像GCC这样的开源编译器——会插入一些不必要的指令,导致流水线停顿,这会耗费远远超过十个周期的时间。当这种情况发生在最内层循环中时,对性能来说是致命的
为什么不贴出代码的反汇编?遇到intrinsics问题的人应该先看一看反汇编,并尽早停止使用intrinsics。您应该先从改进代码入手。第一个noleftover判断的if和else分支完全相同。您给result1赋值了两次,而第一次的结果从未被使用。不要使用short类型,因为ARM的字长是32位。这其中的大部分可能会被编译器优化掉,但无论如何,您应该把它们清理掉,然后使用像DS-5 Streamline这样的性能分析器——这将使您的任务更容易。有社区版/免费版可用。@auselen 是的,在leftover部分我把代码重复写了两次,我已经编辑过了。关于result1,我的做法与原始代码完全相同,但我仍然不清楚为什么要执行该操作,我只是按原样对其进行了转换。我会像您所说的那样尝试使用性能分析器。您的代码无法编译。所以我想您是被某些ifdef误导了。@auselen 我确定它是可以编译的。我使用了这些参数:-mfloat-abi=softfp -mfpu=neon -flax-vector-conversions。使用DS-5 Streamline做profiling的问题是,我使用的是没有图形界面的beagleboard,我不知道是否可以在上面使用DS-5。您可以在beaglebone上采集数据,然后在PC主机上查看,很简单。这不是答案。等OP贴出他的反汇编后我会给出答案:我要如何获得反汇编?这是我第一次使用NEON。
void halfsample(const cv::Mat& srcimg, cv::Mat& dstimg){
const unsigned short leftoverCols = ((srcimg.cols%16)/2);// take care with border...
const bool noleftover = (srcimg.cols%16)==0; // note: leftoverCols can be zero but this still false...
// make sure the destination image is of the right size:
//assert(srcimg.cols/2==dstimg.cols);
//assert(srcimg.rows/2==dstimg.rows);
//int32x4_t zero = vdupq_n_s8(0);
// mask needed later:
//register __m128i mask = _mm_set_epi32 (0x00FF00FF, 0x00FF00FF, 0x00FF00FF, 0x00FF00FF);
int32x4_t mask = vdupq_n_s32(0x00FF00FF);
// to be added in order to make successive averaging correct:
int32x4_t ones = vdupq_n_s32(0x11111111);
print128_numhex(mask);
// data pointers:
int32_t* p1=(int32_t*)srcimg.data;
int32_t* p2=(int32_t*)(srcimg.data+srcimg.cols);
int32_t* p_dest=(int32_t*)dstimg.data;
unsigned char* p_dest_char;//=(unsigned char*)p_dest;
int k=0;
// size:
const unsigned int size = (srcimg.cols*srcimg.rows)/16;
const unsigned int hsize = srcimg.cols/16;
int32_t* p_end=p1+size*4;
unsigned int row=0;
const unsigned int end=hsize/2;
bool half_end;
if(hsize%2==0)
half_end=false;
else
half_end=true;
while(p2<p_end){
k++;
for(unsigned int i=0; i<end;i++){
// load the two blocks of memory:
int32x4_t upper;
int32x4_t lower;
if(noleftover){
upper=vld1q_s32(p1);
lower=vld1q_s32(p2);
}
else{
upper=vld1q_s32(p1);
lower=vld1q_s32(p2);
}
int32x4_t result1=vaddq_s32(upper, ones);
result1=vrhaddq_u8(upper, lower);
// increment the pointers:
p1=p1+4;
p2=p2+4;
// load the two blocks of memory:
upper=vld1q_s32(p1);
lower=vld1q_s32(p2);
int32x4_t result2=vaddq_s32(upper, ones);
result2=vrhaddq_u8(upper, lower);
// calculate the shifted versions:
int32x4_t result1_shifted = vextq_u8(result1,vmovq_n_u8(0),1);
int32x4_t result2_shifted = vextq_u8(result2,vmovq_n_u8(0),1);
// pack:
int32x4_t result= vcombine_u8(vqmovn_u16(vandq_u32(result1, mask)),
vqmovn_u16(vandq_u32 (result2, mask)));
int32x4_t result_shifted = vcombine_u8(vqmovn_u16(vandq_u32 (result1_shifted, mask)),
vqmovn_u16(vandq_u32(result2_shifted, mask)));
// average for the second time:
result=vrhaddq_u8(result,result_shifted);
// store to memory
vst1q_s32(p_dest, result);
// increment the pointers:
p1=p1+4;
p2=p2+4;
p_dest=p_dest+4;
//p_dest_char=(unsigned char*)p_dest;
}
// if we are not at the end of the row, do the rest:
if(half_end){
std::cout<<"entra in half_end" << std::endl;
// load the two blocks of memory:
int32x4_t upper;
int32x4_t lower;
if(noleftover){
upper=vld1q_s32(p1);
lower=vld1q_s32(p2);
}
else{
upper=vld1q_s32(p1);
lower=vld1q_s32(p2);
}
int32x4_t result1=vqaddq_s32(upper, ones);
result1=vrhaddq_u8(upper, lower);
// increment the pointers:
p1=p1+4;
p2=p2+4;
// compute horizontal pairwise average and store
p_dest_char=(unsigned char*)p_dest;
const UCHAR_ALIAS* result=(UCHAR_ALIAS*)&result1;
for(unsigned int j=0; j<8; j++){
*(p_dest_char++)=(*(result+2*j)+*(result+2*j+1))/2;
}
//p_dest_char=(unsigned char*)p_dest;
}
else{
p_dest_char=(unsigned char*)p_dest;
}
if(noleftover){
row++;
p_dest=(int32_t*)(dstimg.data+row*dstimg.cols);
p1=(int32_t*)(srcimg.data+2*row*srcimg.cols);
//p2=(__m128i*)(srcimg.data+(2*row+1)*srcimg.cols);
//p1+=hsize;
p2=p1+hsize*4;
}
else{
const unsigned char* p1_src_char=(unsigned char*)(p1);
const unsigned char* p2_src_char=(unsigned char*)(p2);
for(unsigned int k=0; k<leftoverCols; k++){
unsigned short tmp = p1_src_char[k]+p1_src_char[k+1]+
p2_src_char[k]+p2_src_char[k+1];
*(p_dest_char++)=(unsigned char)(tmp/4);
}
// done with the two rows:
row++;
p_dest=(int32_t*)(dstimg.data+row*dstimg.cols);
p1=(int32_t*)(srcimg.data+2*row*srcimg.cols);
p2=(int32_t*)(srcimg.data+(2*row+1)*srcimg.cols);
}
}