C 图像处理中的边界检查_C_Arm_Simd_Neon

C 图像处理中的边界检查

c arm

C 图像处理中的边界检查,c,arm,simd,neon,C,Arm,Simd,Neon,在处理图像处理中的任何过滤器时，我希望注意边界条件。我正在外推边界并创建新边界。例如，我有4x3输入： //Input int image[4][3] = 1 2 3 4 2 4 6 8 3 6 9 12 //Output int extensionimage[6][5] = 1 1 2 3 4 4 1 1 2 3 4 4 2 2 4 6 8 8 3 3 6 9 12 12 3 3 6 9 12 12 我的代码： #include <stdio.h> #include

在处理图像处理中的任何过滤器时，我希望注意边界条件。我正在外推边界并创建新边界。例如，我有4x3输入：

//Input
int image[3][4] = 
1 2 3 4 
2 4 6 8 
3 6 9 12

//Output
int extensionimage[5][6] =
1 1 2 3 4 4
1 1 2 3 4 4 
2 2 4 6 8 8
3 3 6 9 12 12
3 3 6 9 12 12

我的代码：

#include <stdio.h> 
#include <string.h> 
#include <stdlib.h> 

void padd_border(int *img,int *extension,int width,int height);

/*
 * Demo driver: builds a 4x3 test image whose pixel at column i, row j is
 * (i+1)*(j+1), prints it, and produces a copy padded with a one-pixel
 * replicated border via padd_border().
 *
 * Returns EXIT_SUCCESS on success, EXIT_FAILURE if either buffer cannot be
 * allocated.
 */
int main(void)
{
    const int width = 4, height = 3;
    const size_t img_count = (size_t)width * (size_t)height;
    const size_t ext_count = (size_t)(width + 2) * (size_t)(height + 2);
    int status = EXIT_FAILURE;

    /* The casts keep this translation unit compilable as C++ as well as C. */
    int *img = (int *)malloc(img_count * sizeof *img);
    /* calloc zero-initialises the padded buffer, so no separate memset is needed. */
    int *extension = (int *)calloc(ext_count, sizeof *extension);

    /* Check both allocations; all exits go through the shared cleanup below. */
    if (img == NULL || extension == NULL) {
        goto cleanup;
    }

    for (int j = 0; j < height; j++) {
        for (int i = 0; i < width; i++) {
            img[j * width + i] = (i + 1) * (j + 1);
            printf("%d\t", img[j * width + i]);
        }
        /* One output line per image row. */
        printf("\n");
    }

    //Padd the input for border conditions
    padd_border(img, extension, width, height);
    //HERE using "extension" input for dummy functionality

    status = EXIT_SUCCESS;

cleanup:
    /* free(NULL) is a no-op, so a partially failed allocation is handled too. */
    free(extension);
    free(img);

    return status;
}

/*
 * Copy `image` into `extension`, surrounding it with a one-pixel border that
 * replicates the nearest edge pixel.
 *
 * image      source pixels, row-major, width * height ints.
 * extension  destination, row-major, (width + 2) * (height + 2) ints; must not
 *            overlap `image`.
 * width      number of columns in `image`.
 * height     number of rows in `image`.
 *
 * If either pointer is NULL or either dimension is not positive the function
 * returns without touching `extension`.
 */
void padd_border(int *image, int *extension, int width, int height)
{
    /* A non-positive size would turn into a huge size_t inside memcpy. */
    if (image == NULL || extension == NULL || width <= 0 || height <= 0) {
        return;
    }

    /* Work in size_t so large images do not overflow int offset arithmetic. */
    const size_t src_stride = (size_t)width;
    const size_t dst_stride = src_stride + 2;
    const size_t rows = (size_t)height;

    /* Interior rows: left border pixel, the source row, right border pixel. */
    for (size_t r = 0; r < rows; ++r) {
        const int *src = image + src_stride * r;
        int *dst = extension + dst_stride * (r + 1);

        dst[0] = src[0];
        memcpy(dst + 1, src, src_stride * sizeof *src);
        dst[dst_stride - 1] = src[src_stride - 1];
    }

    /* Top border row duplicates the first padded row (corners included). */
    memcpy(extension, extension + dst_stride, dst_stride * sizeof *extension);

    /* Bottom border row duplicates the last padded row (corners included). */
    memcpy(extension + dst_stride * (rows + 1),
           extension + dst_stride * rows,
           dst_stride * sizeof *extension);
}

我要执行此操作以获取目标：

 -1*v_m1_m1 + 0*v_m1_0 + 1*v_m1_p1
 -1*v_0_m1  + 0*v_0_0  + 1*v_0_p1       ->V_OUT
 -1*v_p1_m1 + 0*v_p1_0 + 1*v_p1_p1

更改边界代码后，我得到以下值：

    221 221 221 221    221 220 221 223   230 233 234 235
    221 221 221 221    221 220 221 223   230 233 234 235
    71  71  71  71     71  73  70  70    92  130 141 143

在标量代码中，如果我想计算 (i, j) = (0, 0) 处的值 221，带边界扩展的 3x3 邻域如下：

 221 221 220
 221 221 220
 71  71  73

但使用 NEON 进行矢量化之后，我得到了错误的答案：

v_m1_m1.0  v_m1_0.1  v_m1_p1.2
v_0_m1.0   v_0_0.1   v_0_p1.2
v_p1_m1.0  v_p1_0.1  v_p1_p1.2


221 221 230 
221 221 230
71  71  92

我的伪代码：

for i = 0 to nrows - 1
        // init row pointers
        p_row_m1 = src + src_width * MAX(i-1, 0);           // pointing to minus1 row
        p_row_0  = src + src_width * i;                     // pointing to current row
        p_row_p1 = src + src_width * MIN(i+1, nrows-1);     // pointing to plus1 row (row index clamped to the last row)

        v_m1_m1 = vdupq_n_u32(p_row_m1[0]);   // fill left vector from src[i-1][0]
        v_0_m1  = vdupq_n_u32(p_row_0[0]);    // fill left vector from src[i][0]
        v_p1_m1 = vdupq_n_u32(p_row_p1[0]);   // fill left vector from src[i+1][0]

        v_m1_0 = vld1q_u32(&p_row_m1[0]);   // load center vector from src[i-1][0..7]
        v_0_0  = vld1q_u32(&p_row_0[0]);    // load center vector from src[i][0..7]
        v_p1_0 = vld1q_u32(&p_row_p1[0]);   // load center vector from src[i+1][0..7]

        for j = 0 to (ncols - 4) step 4         // assuming 4 elements per SIMD vector

            v_m1_p1  = vld1q_u32(&p_row_m1[j+4]);   // load right vector from src[i-1][j+4..j+7]
            v_0_p1   = vld1q_u32(&p_row_0[j+4]);    // load right vector from src[i][j+4..j+7]
            v_p1_p1  = vld1q_u32(&p_row_p1[j+4]);   // load right vector from src[i+1][j+4..j+7]
    //
    // you now have a 3x3 arrangement of vectors on which
    // you can perform a neighbourhood operation and generate
    // 16 output pixels for the current iteration:
    //
    //    v_m1_m1  v_m1_0  v_m1_p1
    //    v_0_m1   v_0_0   v_0_p1
    //    v_p1_m1  v_p1_0  v_p1_p1
    //
    //               |
    //               V
    //
    //              v_out
    vst1q_s32(&image_out[i][j], v_out)      // store output vector at image_out[i][j..j+3] (the NEON store takes the pointer first)
    // shuffle vectors so that we can use them on next iteration
    v_m1_m1 = v_m1_0
    v_m1_0  = v_m1_p1

    v_0_m1  = v_0_0 
    v_0_0   = v_0_p1

    v_p1_m1 = v_p1_0
    v_p1_0  = v_p1_p1

  end_for
  // for final iteration we need to handle right edge pixels...
  v_m1_p1 = vdupq_n_u32(p_row_m1[ncols-1])     // fill right vector from image[i-1][ncols-1]
  v_0_p1  = vdupq_n_u32(p_row_0[ncols-1])       // fill right vector from image[i][ncols-1]
  v_p1_p1 = vdupq_n_u32(p_row_p1[ncols-1])     // fill right vector from image[i+1][ncols-1]
  // calculate v_out as above
  vst1q_s32(&image_out[i][j], v_out)        // store output vector at image_out[i][ncols-4..ncols-1] (the NEON store takes the pointer first)
end_for

下面是一些伪代码，用于使用具有复制边缘像素的SIMD执行3x3邻域操作。输入图像是

image[nrows][ncols]

，输出图像是

image_out[nrows][ncols]

for i = 0 to nrows - 1
  // init row pointers
  p_row_m1 = &image[max(i-1, 0)][0]         // pointer to start of row i-1
  p_row_0 = &image[i][0]                    // pointer to start of row i
  p_row_p1 = &image[min(i+1, nrows-1)][0]   // pointer to start of row i+1 (clamped to the last row)
  v_m1_m1 = init_vec(p_row_m1[0])           // fill left vector from image[i-1][0]
  v_0_m1 = init_vec(p_row_0[0])             // fill left vector from image[i][0]
  v_p1_m1 = init_vec(p_row_p1[0])           // fill left vector from image[i+1][0]
  v_m1_0 = load_vec(&p_row_m1[0])           // load centre vector from image[i-1][0..15]
  v_0_0 = load_vec(&p_row_0[0])             // load centre vector from image[i][0..15]
  v_p1_0 = load_vec(&p_row_p1[0])           // load centre vector from image[i+1][0..15]
  for j = 0 to (ncols - 16) step 16         // assuming 16 elements per SIMD vector
    v_m1_p1 = load_vec(&p_row_m1[j+16])     // load right vector from image[i-1][0..15]
    v_0_p1 = load_vec(&p_row_0[j+16])       // load right vector from image[i][0..15]
    v_p1_p1 = load_vec(&p_row_p1[j+16])     // load right vector from image[i+1][0..15]
    //
    // you now have a 3x3 arrangement of vectors on which
    // you can perform a neighbourhood operation and generate
    // 16 output pixels for the current iteration:
    //
    //    v_m1_m1  v_m1_0  v_m1_p1
    //    v_0_m1   v_0_0   v_0_p1
    //    v_p1_m1  v_p1_0  v_p1_p1
    //
    //               |
    //               V
    //
    //              v_out
    //
    store_vec(v_out, &image_out[i][j])      // store output vector at image_out[i][j..j+15]
    // shuffle vectors so that we can use them on next iteration
    v_m1_m1 = v_m1_0
    v_m1_0  = v_m1_p1
    v_0_m1  = v_0_0 
    v_0_0   = v_0_p1
    v_p1_m1 = v_p1_0
    v_p1_0  = v_p1_p1
  end_for
  // for final iteration we need to handle right edge pixels...
  v_m1_p1 = init_vec(p_row_m1[ncols-1])     // fill right vector from image[i-1][ncols-1]
  v_0_p1 = init_vec(p_row_0[ncols-1])       // fill right vector from image[i][ncols-1]
  v_p1_p1 = init_vec(p_row_p1[ncols-1])     // fill right vector from image[i+1][ncols-1]
  // calculate v_out as above
  store_vec(v_out, &image_out[i][j])        // store output vector at image_out[i][ncols_16..ncols-1]
end_for

请注意，这假设每个向量16个像素，并且

ncols

是16的倍数。

在Y轴上执行此操作相当容易（只需从裁剪后的Y索引加载相邻的边界向量），但在X轴上有点麻烦。对于X轴，在进入X轴循环之前，通常需要先初始化对应负索引的X邻域向量。如果我有时间，我会发布一个详细的答案和一些例子。@PaulR，我从你的回复中了解到以下几点：（1）我必须使用扩展缓冲区，但我无法承担创建这块额外内存的开销。（2）使用当前的实现逻辑（根据我的代码），我可以按照您的建议进行操作。不，您不需要额外的缓冲区——我提到的邻域向量只是 NEON（128位）变量。Paul，如果您不介意的话，可以解释一下吗！请注意，StackOverflow 并不是一个讨论论坛——请不要把新问题附加到现有问题上。我为你回答了最初的问题——如果你对答案感到满意，那么你应该投票和/或接受它，然后继续前进。如果你有更多的问题，那么你应该另开一个或多个新问题（我很乐意看看，如果可以的话，试着回答）。我不明白这个逻辑。为什么它指向同一行的第一个元素？？当 i=0 时，p_row_p1 = image[min(i+1, ncols-1)]；image[min(1, ncols-1)] = image[1]，而不是 image[1][0]！！因为希望为第一行复制边缘像素（当 i=0 时），所以前两个行指针指向同一行。当 i=1 时，它们指向不同的行。我编辑了行指针初始化，使其更清晰。我认为 image[width*1] 才会指向第二行，而不是 image[1]！！我在几分钟前编辑了它，以使行指针初始化的意图更清晰——逻辑是相同的，但希望现在不再那么令人困惑。我可以这样写：p_row_m1 = image + ncols * max(i-1, 0)；p_row_0 = image + ncols * i；p_row_p1 = image + ncols * min(i+1, ncols-1)

for i = 0 to nrows - 1
  // init row pointers
  p_row_m1 = &image[max(i-1, 0)][0]         // pointer to start of row i-1
  p_row_0 = &image[i][0]                    // pointer to start of row i
  p_row_p1 = &image[min(i+1, nrows-1)][0]   // pointer to start of row i+1 (clamped to the last row)
  v_m1_m1 = init_vec(p_row_m1[0])           // fill left vector from image[i-1][0]
  v_0_m1 = init_vec(p_row_0[0])             // fill left vector from image[i][0]
  v_p1_m1 = init_vec(p_row_p1[0])           // fill left vector from image[i+1][0]
  v_m1_0 = load_vec(&p_row_m1[0])           // load centre vector from image[i-1][0..15]
  v_0_0 = load_vec(&p_row_0[0])             // load centre vector from image[i][0..15]
  v_p1_0 = load_vec(&p_row_p1[0])           // load centre vector from image[i+1][0..15]
  for j = 0 to (ncols - 16) step 16         // assuming 16 elements per SIMD vector
    v_m1_p1 = load_vec(&p_row_m1[j+16])     // load right vector from image[i-1][0..15]
    v_0_p1 = load_vec(&p_row_0[j+16])       // load right vector from image[i][0..15]
    v_p1_p1 = load_vec(&p_row_p1[j+16])     // load right vector from image[i+1][0..15]
    //
    // you now have a 3x3 arrangement of vectors on which
    // you can perform a neighbourhood operation and generate
    // 16 output pixels for the current iteration:
    //
    //    v_m1_m1  v_m1_0  v_m1_p1
    //    v_0_m1   v_0_0   v_0_p1
    //    v_p1_m1  v_p1_0  v_p1_p1
    //
    //               |
    //               V
    //
    //              v_out
    //
    store_vec(v_out, &image_out[i][j])      // store output vector at image_out[i][j..j+15]
    // shuffle vectors so that we can use them on next iteration
    v_m1_m1 = v_m1_0
    v_m1_0  = v_m1_p1
    v_0_m1  = v_0_0 
    v_0_0   = v_0_p1
    v_p1_m1 = v_p1_0
    v_p1_0  = v_p1_p1
  end_for
  // for final iteration we need to handle right edge pixels...
  v_m1_p1 = init_vec(p_row_m1[ncols-1])     // fill right vector from image[i-1][ncols-1]
  v_0_p1 = init_vec(p_row_0[ncols-1])       // fill right vector from image[i][ncols-1]
  v_p1_p1 = init_vec(p_row_p1[ncols-1])     // fill right vector from image[i+1][ncols-1]
  // calculate v_out as above
  store_vec(v_out, &image_out[i][j])        // store output vector at image_out[i][ncols_16..ncols-1]
end_for