Arrays 如何在 ARM 汇编中初始化和处理数组

Arrays 如何在 ARM 汇编中初始化和处理数组,arrays,assembly,arm,neon,Arrays,Assembly,Arm,Neon,我想将此C程序转换为ARM NEON汇编: int main() { int str1[]={1,2,3,4,5,6,7,8,9,10}; int str2[]={11,12,3,4,8,1,4,5,8,3}; int str3[10],i; for(i=0;i<10;i++) { str3[i] = str1[i]+str2[i]; } } int main() { int str1[]={1,2,3,4,5,6,7,8

我想将此 C 程序转换为 ARM NEON 汇编:

/* Fills str3 with the element-by-element sum of the two fixed
 * 10-element tables str1 and str2. Nothing is printed or returned
 * explicitly; this is the scalar reference version of the loop. */
int main()
{
    int str1[] = { 1,  2, 3, 4, 5, 6, 7, 8, 9, 10};
    int str2[] = {11, 12, 3, 4, 8, 1, 4, 5, 8,  3};
    int str3[10];
    int i = 0;

    /* Walk all 10 slots, one scalar addition per slot. */
    while (i < 10)
    {
        str3[i] = str2[i] + str1[i];
        ++i;
    }
}
int main()
{
int str1[]={1,2,3,4,5,6,7,8,9,10};
int str2[]={11,12,3,4,8,1,4,5,8,3};
int str3[10],i;

for(i=0;i<10;i++)
{
str3[i] = str1[i]+str2[i];
}
}

这个数组大小(10)对于SIMD优化不是一个非常有用的大小,但为了便于论证,您可以使用64位NEON内在函数(intrinsics)执行类似操作:

#include <arm_neon.h>

/*
 * Adds str1 and str2 element by element into str3 using 64-bit NEON
 * vectors. Every iteration loads, adds and stores two 32-bit lanes, so
 * the loop covers the arrays exactly only because the element count (10)
 * is even.
 *
 * Returns 0.
 */
int main()
{
    int str1[]={1,2,3,4,5,6,7,8,9,10};
    int str2[]={11,12,3,4,8,1,4,5,8,3};
    int str3[10];
    int i;

    for (i = 0; i < 10; i += 2)
    {
        /* arm_neon.h spells the 2 x int32 vector type int32x2_t
         * (with the underscore); "int32x2t" does not compile. */
        int32x2_t v1, v2, v3;

        v1 = vld1_s32(&str1[i]);    /* v1 = { str1[i], str1[i+1] } */
        v2 = vld1_s32(&str2[i]);    /* v2 = { str2[i], str2[i+1] } */
        v3 = vadd_s32(v1, v2);      /* lane-wise 32-bit addition   */
        vst1_s32(&str3[i], v3);     /* str3[i], str3[i+1] = sums   */
    }

    return 0;
}
#include <arm_neon.h>

/*
 * Adds str1 and str2 element by element into str3 using 64-bit NEON
 * vectors, two 32-bit lanes per iteration (the element count, 10, is
 * even, so the loop covers the arrays exactly).
 *
 * Returns 0.
 */
int main()
{
    int str1[]={1,2,3,4,5,6,7,8,9,10};
    int str2[]={11,12,3,4,8,1,4,5,8,3};
    int str3[10];
    int i;

    for (i = 0; i < 10; i += 2)
    {
        /* The 2 x int32 vector type is spelled int32x2_t in arm_neon.h. */
        int32x2_t v1, v2, v3;

        v1 = vld1_s32(&str1[i]);    /* v1 = { str1[i], str1[i+1] } */
        v2 = vld1_s32(&str2[i]);    /* v2 = { str2[i], str2[i+1] } */
        v3 = vadd_s32(v1, v2);      /* lane-wise 32-bit addition   */
        vst1_s32(&str3[i], v3);     /* str3[i], str3[i+1] = sums   */
    }

    return 0;
}
让我们取巧一下,借助 gcc 看看它会生成什么代码(neon-add.c):

这样可以得到统一语法(unified syntax)的 ARM 汇编:

    .syntax unified
    .arch armv7-a
    .eabi_attribute 27, 3
    .eabi_attribute 28, 1
    .fpu neon
    .eabi_attribute 20, 1
    .eabi_attribute 21, 1
    .eabi_attribute 23, 3
    .eabi_attribute 24, 1
    .eabi_attribute 25, 1
    .eabi_attribute 26, 2
    .eabi_attribute 30, 4
    .eabi_attribute 34, 1
    .eabi_attribute 18, 4
    .thumb
    .file   "neon-add.c"
    .text
    .align  1
    .thumb
    .thumb_func
    .type   print, %function
    @ print: calls printf once for each of 10 consecutive 32-bit words.
    @   In:  r0 = address of the first word.
    @   Each call passes r0 = format string address (loaded from .L4,
    @   which holds .LC2), r1 = element index, r2 = element value.
    @   r4 (index) and r5 (base address) are saved on entry and restored
    @   on exit together with the return address.
print:
    @ args = 0, pretend = 0, frame = 0
    @ frame_needed = 0, uses_anonymous_args = 0
    @ Save r4, r5 and lr; r3 is pushed as well (presumably only to keep
    @ the stack 8-byte aligned - its value is not used afterwards).
    push    {r3, r4, r5, lr}
    @ r5 = base address of the words to print.
    mov r5, r0
    @ r4 = element index, starting at 0.
    movs    r4, #0
.L2:
    @ Second printf argument: the current index.
    mov r1, r4
    @ Third printf argument: the word at base + index * 4.
    ldr r2, [r5, r4, lsl #2]
    @ First printf argument: address of the format string.
    ldr r0, .L4
    @ Advance the index before the call; r4 is read again after it.
    adds    r4, r4, #1
    bl  printf
    @ Repeat until 10 elements have been printed.
    cmp r4, #10
    bne .L2
    @ Restore the saved registers and return by popping lr into pc.
    pop {r3, r4, r5, pc}
.L5:
    .align  2
.L4:
    @ Literal: address of the format string .LC2.
    .word   .LC2
    .size   print, .-print
    .section    .text.startup,"ax",%progbits
    .align  1
    .global main
    .thumb
    .thumb_func
    .type   main, %function
    @ main: copies the two 10-word constant tables that start at
    @ .LANCHOR0 onto the stack, adds them two words at a time with NEON,
    @ passes the result to print and returns 0.
    @
    @ Stack layout after the "sub sp":
    @   sp+0  .. sp+39   copy of the first table  (.LC0)
    @   sp+40 .. sp+79   copy of the second table (.LC1)
    @   sp+80 .. sp+119  result words
main:
    @ args = 0, pretend = 0, frame = 120
    @ frame_needed = 0, uses_anonymous_args = 0
    push    {r4, r5, lr}
    @ Reserve the local area (12 bytes pushed + 124 = 136 bytes in total);
    @ released by the matching "add sp" before returning.
    sub sp, sp, #124
    @ r4 = .LANCHOR0, the start of the constant tables.
    ldr r4, .L9
    @ r5 = destination of the first table (sp+0).
    mov r5, sp
    @ Copy the first table: 4 + 4 + 2 words.
    ldmia   r4!, {r0, r1, r2, r3}
    stmia   r5!, {r0, r1, r2, r3}
    ldmia   r4!, {r0, r1, r2, r3}
    stmia   r5!, {r0, r1, r2, r3}
    @ The last two words are loaded without writeback, so r4 is stepped
    @ past them by hand; it then points at the second table.
    ldmia   r4, {r0, r1}
    adds    r4, r4, #8
    stmia   r5, {r0, r1}
    @ r5 = destination of the second table (sp+40).
    add r5, sp, #40
    @ Copy the second table: 4 + 4 + 2 words.
    ldmia   r4!, {r0, r1, r2, r3}
    stmia   r5!, {r0, r1, r2, r3}
    ldmia   r4!, {r0, r1, r2, r3}
    stmia   r5!, {r0, r1, r2, r3}
    @ r3 = byte offset into the three arrays, starting at 0 (set between
    @ the copy instructions; the copy does not touch r3 afterwards).
    movs    r3, #0
    ldmia   r4, {r0, r1}
    stmia   r5, {r0, r1}
    @ Enter the loop at the address computation / exit test.
    b   .L7
.L8:
    @ Loop body: one 64-bit vector = two 32-bit words per iteration.
    @ d16 = two words of the first input.
    vld1.32 {d16}, [r2]
    @ d17 = two words of the second input.
    vld1.32 {d17}, [r1]
    @ Lane-wise 32-bit addition.
    vadd.i32    d16, d17, d16
    @ Store both sums into the result area.
    vst1.32 {d16}, [r0]
.L7:
    @ Recompute the three element addresses for the current offset r3:
    @   r1 = sp+40 + r3 (second input), r0 = sp+80 + r3 (result),
    @   r2 = sp+0  + r3 (first input). r4 is only a scratch base here.
    add r4, sp, #40
    add r2, sp, #80
    adds    r1, r3, r4
    add r4, sp, #0
    adds    r0, r3, r2
    adds    r2, r3, r4
    @ Advance the offset by 8 bytes (two words) before testing it. The
    @ addresses above were formed from the old offset, so comparing the
    @ new offset with 48 stops once the old offset reached 40, i.e. after
    @ offsets 0, 8, 16, 24 and 32 have been processed.
    adds    r3, r3, #8
    cmp r3, #48
    bne .L8
    @ Hand the result area to print.
    add r0, sp, #80
    bl  print
    @ Return value 0.
    movs    r0, #0
    add sp, sp, #124
    pop {r4, r5, pc}
.L10:
    .align  2
.L9:
    @ Literal: address of the constant tables.
    .word   .LANCHOR0
    .size   main, .-main
    .section    .rodata
    .align  2
    @ .LANCHOR0 marks the start of the constant tables; main reaches it
    @ through the literal at .L9 and copies 2 x 10 words from here.
.LANCHOR0 = . + 0
    @ First input table, 10 words.
.LC0:
    .word   1
    .word   2
    .word   3
    .word   4
    .word   5
    .word   6
    .word   7
    .word   8
    .word   9
    .word   10
    @ Second input table, 10 words, placed directly after the first.
.LC1:
    .word   11
    .word   12
    .word   3
    .word   4
    .word   8
    .word   1
    .word   4
    .word   5
    .word   8
    .word   3
    .section    .rodata.str1.1,"aMS",%progbits,1
    @ printf format used by print: "%d: %d", then a newline (\012) and
    @ the terminating NUL (\000).
.LC2:
    .ascii  "%d: %d\012\000"
    .ident  "GCC: (crosstool-NG linaro-1.13.1-4.7-2013.02-01-20130221 - Linaro GCC 2013.02) 4.7.3 20130205 (prerelease)"
    .section    .note.GNU-stack,"",%progbits

你可以这样做

在函数中声明数组

@ Ten 32-bit words, aligned to a 4-byte boundary and reachable through
@ the label .STR1. The data directive is ".word"; the alignment is given
@ explicitly as a power of two (2 -> 4 bytes).
.align 2
.STR1:
.word 1,2,3,4,5,6,7,8,9,10

之后像这样使用数组:

ldr     r0, =.STR1          @ r0 = address of the first element of the .STR1 array

非常感谢 Paul 先生……非常感谢。这些就是内在函数(intrinsics)对吧……明白了……
是的,不需要编写原始的 ARM 汇编,至少在开始时不需要——内在函数直接映射到 NEON 指令,而且编译器会为您完成大部分繁琐的管理工作。
内在函数比汇编需要更多的执行周期……就是这样……谢谢。
@jacob:不,通常并非如此——正如我上面所说:每个内在函数都直接映射到一条 NEON 指令。如果编译器不够好,手写汇编有时可以做得稍好一些,但这点可能的额外收益很少值得花费大量的额外时间和精力。
@artless:好的,我只能谈自己的经验:尽管早期支持 NEON 的 gcc 版本生成的代码并不特别好(通常我手写 asm 还能再快 2 倍),但我仍然能够通过用内在函数精心编写的例程获得实质性的性能提升。我有一段时间没有重新验证这一点了,但我预计 gcc 在这方面已有所改进,因为它在其他 SIMD 体系结构(特别是 AltiVec 和 SSE)上经历过类似的演变。
非常感谢 Scott 先生,让我看一下程序。如果在 L2 循环开始之前能有关于各程序部分的说明会更好,无论如何,谢谢您付出了这么多的努力。
你应该表现出一些努力。
    .syntax unified
    .arch armv7-a
    .eabi_attribute 27, 3
    .eabi_attribute 28, 1
    .fpu neon
    .eabi_attribute 20, 1
    .eabi_attribute 21, 1
    .eabi_attribute 23, 3
    .eabi_attribute 24, 1
    .eabi_attribute 25, 1
    .eabi_attribute 26, 2
    .eabi_attribute 30, 4
    .eabi_attribute 34, 1
    .eabi_attribute 18, 4
    .thumb
    .file   "neon-add.c"
    .text
    .align  1
    .thumb
    .thumb_func
    .type   print, %function
    @ print: calls printf once for each of 10 consecutive 32-bit words.
    @   In:  r0 = address of the first word.
    @   Each call passes r0 = format string address (loaded from .L4,
    @   which holds .LC2), r1 = element index, r2 = element value.
    @   r4 (index) and r5 (base address) are saved on entry and restored
    @   on exit together with the return address.
print:
    @ args = 0, pretend = 0, frame = 0
    @ frame_needed = 0, uses_anonymous_args = 0
    @ Save r4, r5 and lr; r3 is pushed as well (presumably only to keep
    @ the stack 8-byte aligned - its value is not used afterwards).
    push    {r3, r4, r5, lr}
    @ r5 = base address of the words to print.
    mov r5, r0
    @ r4 = element index, starting at 0.
    movs    r4, #0
.L2:
    @ Second printf argument: the current index.
    mov r1, r4
    @ Third printf argument: the word at base + index * 4.
    ldr r2, [r5, r4, lsl #2]
    @ First printf argument: address of the format string.
    ldr r0, .L4
    @ Advance the index before the call; r4 is read again after it.
    adds    r4, r4, #1
    bl  printf
    @ Repeat until 10 elements have been printed.
    cmp r4, #10
    bne .L2
    @ Restore the saved registers and return by popping lr into pc.
    pop {r3, r4, r5, pc}
.L5:
    .align  2
.L4:
    @ Literal: address of the format string .LC2.
    .word   .LC2
    .size   print, .-print
    .section    .text.startup,"ax",%progbits
    .align  1
    .global main
    .thumb
    .thumb_func
    .type   main, %function
    @ main: copies the two 10-word constant tables that start at
    @ .LANCHOR0 onto the stack, adds them two words at a time with NEON,
    @ passes the result to print and returns 0.
    @
    @ Stack layout after the "sub sp":
    @   sp+0  .. sp+39   copy of the first table  (.LC0)
    @   sp+40 .. sp+79   copy of the second table (.LC1)
    @   sp+80 .. sp+119  result words
main:
    @ args = 0, pretend = 0, frame = 120
    @ frame_needed = 0, uses_anonymous_args = 0
    push    {r4, r5, lr}
    @ Reserve the local area (12 bytes pushed + 124 = 136 bytes in total);
    @ released by the matching "add sp" before returning.
    sub sp, sp, #124
    @ r4 = .LANCHOR0, the start of the constant tables.
    ldr r4, .L9
    @ r5 = destination of the first table (sp+0).
    mov r5, sp
    @ Copy the first table: 4 + 4 + 2 words.
    ldmia   r4!, {r0, r1, r2, r3}
    stmia   r5!, {r0, r1, r2, r3}
    ldmia   r4!, {r0, r1, r2, r3}
    stmia   r5!, {r0, r1, r2, r3}
    @ The last two words are loaded without writeback, so r4 is stepped
    @ past them by hand; it then points at the second table.
    ldmia   r4, {r0, r1}
    adds    r4, r4, #8
    stmia   r5, {r0, r1}
    @ r5 = destination of the second table (sp+40).
    add r5, sp, #40
    @ Copy the second table: 4 + 4 + 2 words.
    ldmia   r4!, {r0, r1, r2, r3}
    stmia   r5!, {r0, r1, r2, r3}
    ldmia   r4!, {r0, r1, r2, r3}
    stmia   r5!, {r0, r1, r2, r3}
    @ r3 = byte offset into the three arrays, starting at 0 (set between
    @ the copy instructions; the copy does not touch r3 afterwards).
    movs    r3, #0
    ldmia   r4, {r0, r1}
    stmia   r5, {r0, r1}
    @ Enter the loop at the address computation / exit test.
    b   .L7
.L8:
    @ Loop body: one 64-bit vector = two 32-bit words per iteration.
    @ d16 = two words of the first input.
    vld1.32 {d16}, [r2]
    @ d17 = two words of the second input.
    vld1.32 {d17}, [r1]
    @ Lane-wise 32-bit addition.
    vadd.i32    d16, d17, d16
    @ Store both sums into the result area.
    vst1.32 {d16}, [r0]
.L7:
    @ Recompute the three element addresses for the current offset r3:
    @   r1 = sp+40 + r3 (second input), r0 = sp+80 + r3 (result),
    @   r2 = sp+0  + r3 (first input). r4 is only a scratch base here.
    add r4, sp, #40
    add r2, sp, #80
    adds    r1, r3, r4
    add r4, sp, #0
    adds    r0, r3, r2
    adds    r2, r3, r4
    @ Advance the offset by 8 bytes (two words) before testing it. The
    @ addresses above were formed from the old offset, so comparing the
    @ new offset with 48 stops once the old offset reached 40, i.e. after
    @ offsets 0, 8, 16, 24 and 32 have been processed.
    adds    r3, r3, #8
    cmp r3, #48
    bne .L8
    @ Hand the result area to print.
    add r0, sp, #80
    bl  print
    @ Return value 0.
    movs    r0, #0
    add sp, sp, #124
    pop {r4, r5, pc}
.L10:
    .align  2
.L9:
    @ Literal: address of the constant tables.
    .word   .LANCHOR0
    .size   main, .-main
    .section    .rodata
    .align  2
    @ .LANCHOR0 marks the start of the constant tables; main reaches it
    @ through the literal at .L9 and copies 2 x 10 words from here.
.LANCHOR0 = . + 0
    @ First input table, 10 words.
.LC0:
    .word   1
    .word   2
    .word   3
    .word   4
    .word   5
    .word   6
    .word   7
    .word   8
    .word   9
    .word   10
    @ Second input table, 10 words, placed directly after the first.
.LC1:
    .word   11
    .word   12
    .word   3
    .word   4
    .word   8
    .word   1
    .word   4
    .word   5
    .word   8
    .word   3
    .section    .rodata.str1.1,"aMS",%progbits,1
    @ printf format used by print: "%d: %d", then a newline (\012) and
    @ the terminating NUL (\000).
.LC2:
    .ascii  "%d: %d\012\000"
    .ident  "GCC: (crosstool-NG linaro-1.13.1-4.7-2013.02-01-20130221 - Linaro GCC 2013.02) 4.7.3 20130205 (prerelease)"
    .section    .note.GNU-stack,"",%progbits
ldr     r0, =.STR1          @ r0 = address of the first element of the .STR1 array