Arrays 如何在ARM汇编中初始化和处理数组
我想将此C程序转换为ARM NEON汇编:Arrays 如何在ARM汇编中初始化和处理数组,arrays,assembly,arm,neon,Arrays,Assembly,Arm,Neon,我想将此C程序转换为ARM NEON汇编: int main() { int str1[]={1,2,3,4,5,6,7,8,9,10}; int str2[]={11,12,3,4,8,1,4,5,8,3}; int str3[10],i; for(i=0;i<10;i++) { str3[i] = str1[i]+str2[i]; } } int main() { int str1[]={1,2,3,4,5,6,7,8
/*
 * Scalar reference version: adds two fixed 10-element int arrays
 * element by element into a third array.
 */
int main()
{
    int str1[] = { 1, 2, 3, 4, 5, 6, 7, 8, 9, 10 };
    int str2[] = { 11, 12, 3, 4, 8, 1, 4, 5, 8, 3 };
    int str3[10], i;

    i = 0;
    while (i < 10) {
        /* one element per iteration */
        str3[i] = str1[i] + str2[i];
        i++;
    }
}
int main()
{
int str1[]={1,2,3,4,5,6,7,8,9,10};
int str2[]={11,12,3,4,8,1,4,5,8,3};
int str3[10],i;
for(i=0;i<10;i++)
{
str3[i] = str1[i]+str2[i];
}
}
示例中的数组大小(10)对于SIMD优化不是一个非常有用的大小,但为了便于论证,您可以使用64位NEON内部函数(intrinsics)执行类似操作:
#include <arm_neon.h>
/*
 * Adds two fixed 10-element int arrays using 64-bit NEON intrinsics.
 *
 * Each iteration loads two adjacent 32-bit lanes from each input array,
 * adds them lane-wise and stores the two sums into the result array.
 * The element count (10) is a multiple of the lane count (2), so the
 * loop covers the arrays exactly with no scalar tail.
 *
 * Returns 0.
 */
int main()
{
    int str1[] = {1, 2, 3, 4, 5, 6, 7, 8, 9, 10};
    int str2[] = {11, 12, 3, 4, 8, 1, 4, 5, 8, 3};
    int str3[10];
    int i;

    for (i = 0; i < 10; i += 2)
    {
        /* The 64-bit vector of two int32 lanes is spelled int32x2_t
         * (with the underscore) in <arm_neon.h>; "int32x2t" does not exist. */
        int32x2_t v1, v2, v3;

        v1 = vld1_s32(&str1[i]);    /* v1 = { str1[i], str1[i+1] } */
        v2 = vld1_s32(&str2[i]);    /* v2 = { str2[i], str2[i+1] } */
        v3 = vadd_s32(v1, v2);      /* lane-wise 32-bit add */
        vst1_s32(&str3[i], v3);     /* str3[i], str3[i+1] = sums */
    }
    return 0;
}
#include <arm_neon.h>
/*
 * Adds two fixed 10-element int arrays using 64-bit NEON intrinsics
 * (requires <arm_neon.h>).
 *
 * Each iteration loads two adjacent 32-bit lanes from each input array,
 * adds them lane-wise and stores the two sums into the result array.
 * The element count (10) is a multiple of the lane count (2), so the
 * loop covers the arrays exactly with no scalar tail.
 *
 * Returns 0.
 */
int main()
{
    int str1[] = {1, 2, 3, 4, 5, 6, 7, 8, 9, 10};
    int str2[] = {11, 12, 3, 4, 8, 1, 4, 5, 8, 3};
    int str3[10];
    int i;

    for (i = 0; i < 10; i += 2)
    {
        /* The 64-bit vector of two int32 lanes is spelled int32x2_t
         * (with the underscore); "int32x2t" does not exist. */
        int32x2_t v1, v2, v3;

        v1 = vld1_s32(&str1[i]);    /* v1 = { str1[i], str1[i+1] } */
        v2 = vld1_s32(&str2[i]);    /* v2 = { str2[i], str2[i+1] } */
        v3 = vadd_s32(v1, v2);      /* lane-wise 32-bit add */
        vst1_s32(&str3[i], v3);     /* str3[i], str3[i+1] = sums */
    }
    return 0;
}
让我们取个巧,在gcc中使用NEON内部函数(intrinsics),看看gcc生成了什么代码(neon-add.c):
您将得到统一语法(unified syntax)的ARM汇编:
@ Compiler-emitted file header for neon-add.c.
@ Selects unified ARM/Thumb syntax, the ARMv7-A architecture and the NEON FPU,
@ records EABI build attributes for the linker, and switches to Thumb encoding
@ in the .text section.
@ NOTE(review): the individual .eabi_attribute tag numbers are defined by the
@ ARM EABI build-attributes addendum; their meanings are not spelled out here.
.syntax unified
.arch armv7-a
.eabi_attribute 27, 3
.eabi_attribute 28, 1
.fpu neon
.eabi_attribute 20, 1
.eabi_attribute 21, 1
.eabi_attribute 23, 3
.eabi_attribute 24, 1
.eabi_attribute 25, 1
.eabi_attribute 26, 2
.eabi_attribute 30, 4
.eabi_attribute 34, 1
.eabi_attribute 18, 4
.thumb
.file "neon-add.c"
.text
.align 1
.thumb
.thumb_func
.type print, %function
@ print: calls printf once for each of 10 consecutive 32-bit words,
@ passing the format string .LC2, the element index and the element value.
@   In:   r0 = address of the first word
@   Uses: r4 = running index, r5 = base address (both saved and restored here)
@ NOTE(review): r3 is pushed as well although it is not used afterwards;
@ presumably this only keeps the stack 8-byte aligned at the call - confirm.
print:
@ args = 0, pretend = 0, frame = 0
@ frame_needed = 0, uses_anonymous_args = 0
push {r3, r4, r5, lr}
mov r5, r0 @ r5 = base address, kept across the printf calls
movs r4, #0 @ r4 = index, starts at 0
.L2:
mov r1, r4 @ 2nd printf argument: current index
ldr r2, [r5, r4, lsl #2] @ 3rd printf argument: word at base + index*4
ldr r0, .L4 @ 1st printf argument: address of .LC2 (from the literal below)
adds r4, r4, #1 @ advance the index before the call
bl printf
cmp r4, #10 @ loop until 10 elements have been printed
bne .L2
pop {r3, r4, r5, pc} @ restore registers; loading pc returns to the caller
.L5:
.align 2
.L4:
.word .LC2 @ literal: address of the format string
.size print, .-print
.section .text.startup,"ax",%progbits
.align 1
.global main
.thumb
.thumb_func
.type main, %function
@ main: copies the two 10-word constant tables that start at .LANCHOR0 onto
@ the stack, adds them two words at a time with NEON into a third stack
@ array, passes that array to print, and returns 0.
@ Stack layout after "sub sp, sp, #124":
@   sp+0  .. sp+39    copy of the first table  (.LC0)
@   sp+40 .. sp+79    copy of the second table (.LC1)
@   sp+80 .. sp+119   result array
@ NOTE(review): 124 bytes are reserved for a 120-byte frame; the extra
@ 4 bytes are presumably stack-alignment padding - confirm.
main:
@ args = 0, pretend = 0, frame = 120
@ frame_needed = 0, uses_anonymous_args = 0
push {r4, r5, lr}
sub sp, sp, #124
ldr r4, .L9 @ r4 = .LANCHOR0, start of the constant tables
mov r5, sp @ r5 = destination of the first table (sp+0)
ldmia r4!, {r0, r1, r2, r3} @ first table, words 0-3
stmia r5!, {r0, r1, r2, r3}
ldmia r4!, {r0, r1, r2, r3} @ first table, words 4-7
stmia r5!, {r0, r1, r2, r3}
ldmia r4, {r0, r1} @ first table, words 8-9 (no writeback on this load)
adds r4, r4, #8 @ step r4 past them by hand: r4 = start of the second table
stmia r5, {r0, r1}
add r5, sp, #40 @ r5 = destination of the second table (sp+40)
ldmia r4!, {r0, r1, r2, r3} @ second table, words 0-3
stmia r5!, {r0, r1, r2, r3}
ldmia r4!, {r0, r1, r2, r3} @ second table, words 4-7
stmia r5!, {r0, r1, r2, r3}
movs r3, #0 @ r3 = byte offset into the arrays, starts at 0
ldmia r4, {r0, r1} @ second table, words 8-9
stmia r5, {r0, r1}
b .L7 @ enter the loop at the address computation
.L8:
@ Loop body: add one pair of words. r2/r1/r0 were set up at .L7.
vld1.32 {d16}, [r2] @ d16 = two words of the first array
vld1.32 {d17}, [r1] @ d17 = two words of the second array
vadd.i32 d16, d17, d16 @ lane-wise 32-bit add
vst1.32 {d16}, [r0] @ store both sums into the result array
.L7:
@ Recompute the three element addresses from the current offset in r3.
add r4, sp, #40
add r2, sp, #80
adds r1, r3, r4 @ r1 = (sp+40) + offset: second array
add r4, sp, #0
adds r0, r3, r2 @ r0 = (sp+80) + offset: result array
adds r2, r3, r4 @ r2 = (sp+0)  + offset: first array
adds r3, r3, #8 @ next offset: 8 bytes = two words
cmp r3, #48 @ the incremented offset is 48 only after the pair at offset 32 was added
bne .L8 @ so the body runs for offsets 0, 8, 16, 24, 32 (5 iterations)
add r0, sp, #80 @ r0 = result array, argument for print
bl print
movs r0, #0 @ return value 0
add sp, sp, #124
pop {r4, r5, pc} @ restore registers; loading pc returns to the caller
.L10:
.align 2
.L9:
.word .LANCHOR0 @ literal: address of the constant tables
.size main, .-main
@ Read-only data: the two 10-word input tables, followed (in a separate
@ mergeable-string section) by the printf format string.
.section .rodata
.align 2
@ .LANCHOR0 has the same address as .LC0; main loads it through the literal .L9.
.LANCHOR0 = . + 0
@ First input table (10 words).
.LC0:
.word 1
.word 2
.word 3
.word 4
.word 5
.word 6
.word 7
.word 8
.word 9
.word 10
@ Second input table (10 words); it follows the first one directly, which is
@ why main can keep advancing the same source pointer.
.LC1:
.word 11
.word 12
.word 3
.word 4
.word 8
.word 1
.word 4
.word 5
.word 8
.word 3
.section .rodata.str1.1,"aMS",%progbits,1
@ Format string used by print: "%d: %d", newline (\012), NUL terminator (\000).
.LC2:
.ascii "%d: %d\012\000"
.ident "GCC: (crosstool-NG linaro-1.13.1-4.7-2013.02-01-20130221 - Linaro GCC 2013.02) 4.7.3 20130205 (prerelease)"
.section .note.GNU-stack,"",%progbits
你可以这样做
在函数中声明数组
@ Table of ten 32-bit words, addressed through the label .STR1.
@ The data directive is ".word"; ".单词" was a machine-translation artifact
@ and is not accepted by the assembler.
.align
.STR1:
.word 1,2,3,4,5,6,7,8,9,10
以后像这样使用数组:
LDR R0,=.STR1 @ load the start address of the array (.STR1) into R0
非常感谢Paul先生…非常感谢。这用的是内部函数(intrinsics),对吧…明白了…
是的,不需要编写原始的ARM汇编,至少在开始时不需要——内部函数直接映射到NEON指令,并且您还能享受编译器替您完成大部分内务管理的好处。
内部函数比汇编需要更多的执行周期…是这样吗…谢谢
@jacob:不,这通常不是真的——正如我上面所说的:每个内部函数都直接映射到一条NEON指令。如果编译器不是很好,使用手工编码的汇编有时可以做得稍微好一些,但这点可能的额外好处很少值得花费大量的额外时间和精力。
@artless:好的,我只有自己的经验:尽管早期支持NEON的gcc版本并没有生成特别好的代码(通常我可以通过手工编写asm再提高2倍),我仍然能够通过使用内部函数精心编写的例程获得实质性的性能改进。我有一段时间没有重新研究这一点了,但我预计gcc在这方面已有所改进,因为它在其他SIMD体系结构(特别是AltiVec和SSE)上经历了类似的演变。
非常感谢Scott先生,让我看一下程序。如果在L2循环开始之前有关于程序各部分的说明会更好,无论如何,谢谢你付出了这么多的努力。
你应该表现出一些努力。
@ Compiler-emitted file header for neon-add.c.
@ Selects unified ARM/Thumb syntax, the ARMv7-A architecture and the NEON FPU,
@ records EABI build attributes for the linker, and switches to Thumb encoding
@ in the .text section.
@ NOTE(review): the individual .eabi_attribute tag numbers are defined by the
@ ARM EABI build-attributes addendum; their meanings are not spelled out here.
.syntax unified
.arch armv7-a
.eabi_attribute 27, 3
.eabi_attribute 28, 1
.fpu neon
.eabi_attribute 20, 1
.eabi_attribute 21, 1
.eabi_attribute 23, 3
.eabi_attribute 24, 1
.eabi_attribute 25, 1
.eabi_attribute 26, 2
.eabi_attribute 30, 4
.eabi_attribute 34, 1
.eabi_attribute 18, 4
.thumb
.file "neon-add.c"
.text
.align 1
.thumb
.thumb_func
.type print, %function
@ print: calls printf once for each of 10 consecutive 32-bit words,
@ passing the format string .LC2, the element index and the element value.
@   In:   r0 = address of the first word
@   Uses: r4 = running index, r5 = base address (both saved and restored here)
@ NOTE(review): r3 is pushed as well although it is not used afterwards;
@ presumably this only keeps the stack 8-byte aligned at the call - confirm.
print:
@ args = 0, pretend = 0, frame = 0
@ frame_needed = 0, uses_anonymous_args = 0
push {r3, r4, r5, lr}
mov r5, r0 @ r5 = base address, kept across the printf calls
movs r4, #0 @ r4 = index, starts at 0
.L2:
mov r1, r4 @ 2nd printf argument: current index
ldr r2, [r5, r4, lsl #2] @ 3rd printf argument: word at base + index*4
ldr r0, .L4 @ 1st printf argument: address of .LC2 (from the literal below)
adds r4, r4, #1 @ advance the index before the call
bl printf
cmp r4, #10 @ loop until 10 elements have been printed
bne .L2
pop {r3, r4, r5, pc} @ restore registers; loading pc returns to the caller
.L5:
.align 2
.L4:
.word .LC2 @ literal: address of the format string
.size print, .-print
.section .text.startup,"ax",%progbits
.align 1
.global main
.thumb
.thumb_func
.type main, %function
@ main: copies the two 10-word constant tables that start at .LANCHOR0 onto
@ the stack, adds them two words at a time with NEON into a third stack
@ array, passes that array to print, and returns 0.
@ Stack layout after "sub sp, sp, #124":
@   sp+0  .. sp+39    copy of the first table  (.LC0)
@   sp+40 .. sp+79    copy of the second table (.LC1)
@   sp+80 .. sp+119   result array
@ NOTE(review): 124 bytes are reserved for a 120-byte frame; the extra
@ 4 bytes are presumably stack-alignment padding - confirm.
main:
@ args = 0, pretend = 0, frame = 120
@ frame_needed = 0, uses_anonymous_args = 0
push {r4, r5, lr}
sub sp, sp, #124
ldr r4, .L9 @ r4 = .LANCHOR0, start of the constant tables
mov r5, sp @ r5 = destination of the first table (sp+0)
ldmia r4!, {r0, r1, r2, r3} @ first table, words 0-3
stmia r5!, {r0, r1, r2, r3}
ldmia r4!, {r0, r1, r2, r3} @ first table, words 4-7
stmia r5!, {r0, r1, r2, r3}
ldmia r4, {r0, r1} @ first table, words 8-9 (no writeback on this load)
adds r4, r4, #8 @ step r4 past them by hand: r4 = start of the second table
stmia r5, {r0, r1}
add r5, sp, #40 @ r5 = destination of the second table (sp+40)
ldmia r4!, {r0, r1, r2, r3} @ second table, words 0-3
stmia r5!, {r0, r1, r2, r3}
ldmia r4!, {r0, r1, r2, r3} @ second table, words 4-7
stmia r5!, {r0, r1, r2, r3}
movs r3, #0 @ r3 = byte offset into the arrays, starts at 0
ldmia r4, {r0, r1} @ second table, words 8-9
stmia r5, {r0, r1}
b .L7 @ enter the loop at the address computation
.L8:
@ Loop body: add one pair of words. r2/r1/r0 were set up at .L7.
vld1.32 {d16}, [r2] @ d16 = two words of the first array
vld1.32 {d17}, [r1] @ d17 = two words of the second array
vadd.i32 d16, d17, d16 @ lane-wise 32-bit add
vst1.32 {d16}, [r0] @ store both sums into the result array
.L7:
@ Recompute the three element addresses from the current offset in r3.
add r4, sp, #40
add r2, sp, #80
adds r1, r3, r4 @ r1 = (sp+40) + offset: second array
add r4, sp, #0
adds r0, r3, r2 @ r0 = (sp+80) + offset: result array
adds r2, r3, r4 @ r2 = (sp+0)  + offset: first array
adds r3, r3, #8 @ next offset: 8 bytes = two words
cmp r3, #48 @ the incremented offset is 48 only after the pair at offset 32 was added
bne .L8 @ so the body runs for offsets 0, 8, 16, 24, 32 (5 iterations)
add r0, sp, #80 @ r0 = result array, argument for print
bl print
movs r0, #0 @ return value 0
add sp, sp, #124
pop {r4, r5, pc} @ restore registers; loading pc returns to the caller
.L10:
.align 2
.L9:
.word .LANCHOR0 @ literal: address of the constant tables
.size main, .-main
@ Read-only data: the two 10-word input tables, followed (in a separate
@ mergeable-string section) by the printf format string.
.section .rodata
.align 2
@ .LANCHOR0 has the same address as .LC0; main loads it through the literal .L9.
.LANCHOR0 = . + 0
@ First input table (10 words).
.LC0:
.word 1
.word 2
.word 3
.word 4
.word 5
.word 6
.word 7
.word 8
.word 9
.word 10
@ Second input table (10 words); it follows the first one directly, which is
@ why main can keep advancing the same source pointer.
.LC1:
.word 11
.word 12
.word 3
.word 4
.word 8
.word 1
.word 4
.word 5
.word 8
.word 3
.section .rodata.str1.1,"aMS",%progbits,1
@ Format string used by print: "%d: %d", newline (\012), NUL terminator (\000).
.LC2:
.ascii "%d: %d\012\000"
.ident "GCC: (crosstool-NG linaro-1.13.1-4.7-2013.02-01-20130221 - Linaro GCC 2013.02) 4.7.3 20130205 (prerelease)"
.section .note.GNU-stack,"",%progbits
LDR R0,=.STR1 @ load the start address of the array (.STR1) into R0