将ARM 32位neon转换为ARM 64位neon_Arm_64 Bit_Neon

将ARM 32位neon转换为ARM 64位neon

arm

将ARM 32位neon转换为ARM 64位neon,arm,64-bit,neon,Arm,64 Bit,Neon,我有以下32位neon代码，可以简单地提取图像： extractY8ImageARM(unsigned char *from, unsigned char *to, int left, int top, int width, int height, int stride) from: pointer to the original image to: pointer to the destination extracted image left, top: position where to e

我有以下32位neon代码，可以简单地提取图像：

extractY8ImageARM(unsigned char *from, unsigned char *to, int left, int top, int width, int height, int stride)
from: pointer to the original image
to: pointer to the destination extracted image
left, top: position where to extract in the original image
width, height: size of the extracted image
stride: width of the original image

下面是汇编代码：

.text
.arch armv7-a
.fpu neon
.type extractY8ImageARM, STT_FUNC
.global extractY8ImageARM

//-----------------------------------------------------------------------
// extractY8ImageARM(unsigned char *from, unsigned char *to,
//                   int left, int top, int width, int height, int stride)
//
// Copies a width x height window of bytes, whose top-left corner is at
// (left, top) in a source image with `stride` bytes per row, into a
// densely packed destination (width bytes per row).
//
// ABI:   AAPCS (32-bit ARM), NEON
// In:    r0 = from, r1 = to, r2 = left, r3 = top
//        on entry: [sp] = width, [sp, #4] = height, [sp, #8] = stride
// Out:   none (r0 is not preserved)
// Clobb: r0-r3, r12, d0-d3, flags; r4-r7 are saved and restored
// Notes: Returns without touching memory when width <= 0 or height <= 0.
//        Any positive width is handled: each row is copied in 32-byte
//        chunks followed by a 16 / 8 / 1-byte tail, so exactly `width`
//        bytes are read and written per row.
//-----------------------------------------------------------------------

// Offsets of the stack-passed arguments once r4-r7 have been pushed.
.equ EXTRACT_SAVED_BYTES, 4 * 4
.equ EXTRACT_ARG_WIDTH,   EXTRACT_SAVED_BYTES + 0
.equ EXTRACT_ARG_HEIGHT,  EXTRACT_SAVED_BYTES + 4
.equ EXTRACT_ARG_STRIDE,  EXTRACT_SAVED_BYTES + 8

extractY8ImageARM:
from        .req r0
to          .req r1
left        .req r2
top         .req r3
width       .req r4
height      .req r5
stride      .req r6
rowskip     .req r6             // reuses stride once the start address is known
remaining   .req r7
pixel       .req r12

    // Only the callee-saved registers this routine uses need saving.
    push    {r4-r7}

    ldr     width,  [sp, #EXTRACT_ARG_WIDTH]
    ldr     height, [sp, #EXTRACT_ARG_HEIGHT]
    ldr     stride, [sp, #EXTRACT_ARG_STRIDE]

    // Nothing to copy for an empty or negative-sized window.
    cmp     width, #0
    ble     .Lextract_done
    cmp     height, #0
    ble     .Lextract_done

    // from += left + top * stride
    add     from, from, left
    mla     from, top, stride, from

    // Each row advances `from` by exactly width bytes, so the distance to
    // the start of the next source row is loop-invariant.
    sub     rowskip, stride, width

.Lextract_row:
    // remaining = width - 32; it stays >= 0 while a full chunk is left.
    subs    remaining, width, #32
    blt     .Lextract_tail16

.Lextract_copy32:
    pld     [from]
    vld1.u8 { d0, d1, d2, d3 }, [from]!

    pld     [to]
    vst1.u8 { d0, d1, d2, d3 }, [to]!

    subs    remaining, remaining, #32
    bge     .Lextract_copy32

    // Here remaining is in [-32, -1]; its low five bits equal the number
    // of bytes still to copy in this row (0..31).
.Lextract_tail16:
    tst     remaining, #16
    beq     .Lextract_tail8
    vld1.u8 { d0, d1 }, [from]!
    vst1.u8 { d0, d1 }, [to]!

.Lextract_tail8:
    tst     remaining, #8
    beq     .Lextract_tail1
    vld1.u8 { d0 }, [from]!
    vst1.u8 { d0 }, [to]!

.Lextract_tail1:
    ands    remaining, remaining, #7
    beq     .Lextract_row_done

.Lextract_byte:
    ldrb    pixel, [from], #1
    strb    pixel, [to], #1
    subs    remaining, remaining, #1
    bne     .Lextract_byte

.Lextract_row_done:
    // Move to the first byte of the window on the next source row.
    add     from, from, rowskip

    subs    height, height, #1
    bgt     .Lextract_row

.Lextract_done:
    pop     {r4-r7}
    bx      lr

.unreq from
.unreq to
.unreq left
.unreq top
.unreq width
.unreq height
.unreq stride
.unreq rowskip
.unreq remaining
.unreq pixel

我需要将它移植到64位neon。谁能帮我翻译一下吗？我已经阅读了这份白皮书，所以我或多或少地理解了其中的差异

我的代码很简单，因此它可以作为一个很好的例子，说明在 64 位 NEON 汇编文件中如何传递参数以及加载/存储数据。我希望避免使用 intrinsics（内建函数）。

整个代码如下所示：

.text
.arch armv8-a
.type extractY8ImageARM, STT_FUNC
.global extractY8ImageARM

//-----------------------------------------------------------------------
// extractY8ImageARM(unsigned char *from, unsigned char *to,
//                   int left, int top, int width, int height, int stride)
//
// Copies a width x height window of bytes, whose top-left corner is at
// (left, top) in a source image with `stride` bytes per row, into a
// densely packed destination (width bytes per row).
//
// ABI:   AAPCS64
// In:    x0 = from, x1 = to,
//        w2 = left, w3 = top, w4 = width, w5 = height, w6 = stride
// Out:   none (x0 is not preserved)
// Clobb: x0, x1, x4, x5, x6, x9, x10, v0-v3, flags
// Stack: not used (leaf function, volatile registers only)
// Notes: Returns without touching memory when width <= 0 or height <= 0.
//        Any positive width is handled: each row is copied in 64-byte
//        chunks followed by a 32 / 16 / 8 / 1-byte tail, so exactly
//        `width` bytes are read and written per row.
//-----------------------------------------------------------------------

extractY8ImageARM:
from        .req x0
to          .req x1
left_w      .req w2
top_w       .req w3
width       .req x4
width_w     .req w4
height_w    .req w5
rowskip     .req x6
stride_w    .req w6
remaining   .req x9
pixel_w     .req w10

    // Nothing to copy for an empty or negative-sized window.
    cmp     width_w, #0
    b.le    .Lextract_done
    cmp     height_w, #0
    b.le    .Lextract_done

    // The int arguments arrive in w registers; AAPCS64 leaves the upper
    // 32 bits of those x registers unspecified, so every value is
    // sign-extended before it takes part in 64-bit address arithmetic.

    // from += left + top * stride
    add     from, from, left_w, sxtw
    smaddl  from, top_w, stride_w, from

    // Each row advances `from` by exactly width bytes, so the distance to
    // the start of the next source row is loop-invariant.
    sxtw    width, width_w
    sxtw    rowskip, stride_w
    sub     rowskip, rowskip, width

.Lextract_row:
    // remaining = width - 64; it stays >= 0 while a full chunk is left.
    subs    remaining, width, #64
    b.lt    .Lextract_tail32

.Lextract_copy64:
    ld1     {v0.16b, v1.16b, v2.16b, v3.16b}, [from], #64
    st1     {v0.16b, v1.16b, v2.16b, v3.16b}, [to], #64

    subs    remaining, remaining, #64
    b.ge    .Lextract_copy64

    // Here remaining is in [-64, -1]; its low six bits equal the number
    // of bytes still to copy in this row (0..63).
.Lextract_tail32:
    tbz     remaining, #5, .Lextract_tail16
    ld1     {v0.16b, v1.16b}, [from], #32
    st1     {v0.16b, v1.16b}, [to], #32

.Lextract_tail16:
    tbz     remaining, #4, .Lextract_tail8
    ld1     {v0.16b}, [from], #16
    st1     {v0.16b}, [to], #16

.Lextract_tail8:
    tbz     remaining, #3, .Lextract_tail1
    ld1     {v0.8b}, [from], #8
    st1     {v0.8b}, [to], #8

.Lextract_tail1:
    ands    remaining, remaining, #7
    b.eq    .Lextract_row_done

.Lextract_byte:
    ldrb    pixel_w, [from], #1
    strb    pixel_w, [to], #1
    subs    remaining, remaining, #1
    b.ne    .Lextract_byte

.Lextract_row_done:
    // Move to the first byte of the window on the next source row.
    add     from, from, rowskip

    subs    height_w, height_w, #1
    b.gt    .Lextract_row

.Lextract_done:
    ret


.unreq from
.unreq to
.unreq left_w
.unreq top_w
.unreq width
.unreq width_w
.unreq height_w
.unreq rowskip
.unreq stride_w
.unreq remaining
.unreq pixel_w

请看我的回答：使用 intrinsics（内建函数）确实是个好主意。你的 NEON 代码优化得并不好；如果改用 intrinsics，同一份代码可以同时移植到 ARM32 和 ARM64。如果你坚持手写 ARM64 汇编，可以参考我 GitHub 项目中的完整示例。——即使使用 intrinsics，从 32 位 NEON 到 64 位 NEON 的改动也太多。ld1/st1 指令我已经弄明白了，谢谢，这很有用；但我不确定 push/pop 应该如何翻译。——我很想知道为什么说这段 32 位 NEON 代码没有优化：它只是一个使用 vld1 和 vst1 的循环，还能做得更多或有所不同吗？——请注意，宽度需要能被 16 整除，而不是 32。