使用clang从C代码生成SIMD代码

使用clang从C代码生成SIMD代码,c,clang,llvm,vectorization,simd,C,Clang,Llvm,Vectorization,Simd,我试图从一个简单的c程序中获取SIMD代码: #include <stdio.h> const int N=20000; int main() { // input int a[N], b[N]; for(int i=0; i<N; i++){ a[i]= i %500; } for(int i=0; i<N; i++){ b[i]= i %200; }

我试图从一个简单的
c
程序中获取SIMD代码:

#include <stdio.h>

// Number of elements in each of the three arrays used by main().
// NOTE(review): in C (unlike C++) a `const int` object is not an integer
// constant expression, so arrays sized by N are variable-length arrays --
// confirm against the C standard if this matters for your compiler.
const int N=20000;

// Fills two input arrays with periodic values, adds them element-wise into
// a third array, and prints every sum on its own line. Always returns 0.
int main()
{
    // Inputs: a[i] = i mod 500, b[i] = i mod 200.
    int a[N], b[N]; 
    for(int i=0; i<N; i++){
        a[i]= i %500;
    }
    
    for(int i=0; i<N; i++){
        b[i]= i %200;
    }
    
    
    // Output: element-wise sum of the two inputs.
    int c[N]; 

    // Independent per-element add over contiguous int arrays.
    for(int i=0;i<N;i++) 
    { 
         c[i]=a[i]+b[i]; 
    } 

    // Printing every element keeps the computed sums observable.
    for(int i=0;i<N;i++) 
    { 
        printf("%d\n",c[i]);
    }  

    return 0;
}
现在,我使用命令行标志
-force-vector-width
设置向量化SIMD宽度,并生成汇编代码:

clang -S  -fno-vectorize  sum_vec.c -o sum_scalar.s
clang -S  -mllvm -force-vector-width=8  sum_vec.c -o sum_simd.s
然而,两次生成的代码都是标量的。如何才能生成SIMD代码?

你能发布你的代码吗穆伊洛

使用
gcc
8.3.1和
cc -O3 -S -o gvec.s -fverbose-asm fix1.c
[我将您的
const int
更改为
enum
]:

    .file   "fix1.c"
# GNU C17 (GCC) version 8.3.1 20190223 (Red Hat 8.3.1-2) (x86_64-redhat-linux)
#   compiled by GNU C version 8.3.1 20190223 (Red Hat 8.3.1-2), GMP version 6.1.2, MPFR version 3.1.6-p2, MPC version 1.1.0, isl version none
# GGC heuristics: --param ggc-min-expand=100 --param ggc-min-heapsize=131072
# options passed:  fix1.c -mtune=generic -march=x86-64
# -auxbase-strip gvec.s -O3 -fverbose-asm
# options enabled:  -faggressive-loop-optimizations -falign-labels
# -fasynchronous-unwind-tables -fauto-inc-dec -fbranch-count-reg
# -fcaller-saves -fchkp-check-incomplete-type -fchkp-check-read
# -fchkp-check-write -fchkp-instrument-calls -fchkp-narrow-bounds
# -fchkp-optimize -fchkp-store-bounds -fchkp-use-static-bounds
# -fchkp-use-static-const-bounds -fchkp-use-wrappers -fcode-hoisting
# -fcombine-stack-adjustments -fcommon -fcompare-elim -fcprop-registers
# -fcrossjumping -fcse-follow-jumps -fdefer-pop
# -fdelete-null-pointer-checks -fdevirtualize -fdevirtualize-speculatively
# -fdwarf2-cfi-asm -fearly-inlining -feliminate-unused-debug-types
# -fexpensive-optimizations -fforward-propagate -ffp-int-builtin-inexact
# -ffunction-cse -fgcse -fgcse-after-reload -fgcse-lm -fgnu-runtime
# -fgnu-unique -fguess-branch-probability -fhoist-adjacent-loads -fident
# -fif-conversion -fif-conversion2 -findirect-inlining -finline
# -finline-atomics -finline-functions -finline-functions-called-once
# -finline-small-functions -fipa-bit-cp -fipa-cp -fipa-cp-clone -fipa-icf
# -fipa-icf-functions -fipa-icf-variables -fipa-profile -fipa-pure-const
# -fipa-ra -fipa-reference -fipa-sra -fipa-vrp -fira-hoist-pressure
# -fira-share-save-slots -fira-share-spill-slots
# -fisolate-erroneous-paths-dereference -fivopts -fkeep-static-consts
# -fleading-underscore -flifetime-dse -floop-interchange
# -floop-unroll-and-jam -flra-remat -flto-odr-type-merging -fmath-errno
# -fmerge-constants -fmerge-debug-strings -fmove-loop-invariants
# -fomit-frame-pointer -foptimize-sibling-calls -foptimize-strlen
# -fpartial-inlining -fpeel-loops -fpeephole -fpeephole2 -fplt
# -fpredictive-commoning -fprefetch-loop-arrays -free -freg-struct-return
# -freorder-blocks -freorder-blocks-and-partition -freorder-functions
# -frerun-cse-after-loop -fsched-critical-path-heuristic
# -fsched-dep-count-heuristic -fsched-group-heuristic -fsched-interblock
# -fsched-last-insn-heuristic -fsched-rank-heuristic -fsched-spec
# -fsched-spec-insn-heuristic -fsched-stalled-insns-dep -fschedule-fusion
# -fschedule-insns2 -fsemantic-interposition -fshow-column -fshrink-wrap
# -fshrink-wrap-separate -fsigned-zeros -fsplit-ivs-in-unroller
# -fsplit-loops -fsplit-paths -fsplit-wide-types -fssa-backprop
# -fssa-phiopt -fstdarg-opt -fstore-merging -fstrict-aliasing
# -fstrict-volatile-bitfields -fsync-libcalls -fthread-jumps
# -ftoplevel-reorder -ftrapping-math -ftree-bit-ccp -ftree-builtin-call-dce
# -ftree-ccp -ftree-ch -ftree-coalesce-vars -ftree-copy-prop -ftree-cselim
# -ftree-dce -ftree-dominator-opts -ftree-dse -ftree-forwprop -ftree-fre
# -ftree-loop-distribute-patterns -ftree-loop-distribution
# -ftree-loop-if-convert -ftree-loop-im -ftree-loop-ivcanon
# -ftree-loop-optimize -ftree-loop-vectorize -ftree-parallelize-loops=
# -ftree-partial-pre -ftree-phiprop -ftree-pre -ftree-pta -ftree-reassoc
# -ftree-scev-cprop -ftree-sink -ftree-slp-vectorize -ftree-slsr -ftree-sra
# -ftree-switch-conversion -ftree-tail-merge -ftree-ter -ftree-vrp
# -funit-at-a-time -funswitch-loops -funwind-tables -fverbose-asm
# -fzero-initialized-in-bss -m128bit-long-double -m64 -m80387
# -malign-stringops -mavx256-split-unaligned-load
# -mavx256-split-unaligned-store -mfancy-math-387 -mfp-ret-in-387 -mfxsr
# -mglibc -mieee-fp -mlong-double-80 -mmmx -mno-sse4 -mpush-args -mred-zone
# -msse -msse2 -mstv -mtls-direct-seg-refs -mvzeroupper

    .text
    .section    .rodata.str1.1,"aMS",@progbits,1
.LC4:
    .string "%d\n"
    .section    .text.startup,"ax",@progbits
    .p2align 4,,15
    .globl  main
    .type   main, @function
# -----------------------------------------------------------------------
# main -- compiler output (GCC 8.3.1, -O3) for fix1.c; AT&T syntax, x86-64.
#
# Stack frame (offsets from %rsp after the prologue):
#        0 ..     15   scratch slot written in .L4
#       16 ..  80015   a[]  (20000 x 4 bytes)
#    80016 .. 160015   b[]
#   160016 .. 240015   c[]
#
# Every loop except the printf loop handles four 32-bit lanes per
# iteration in 128-bit xmm registers.  The two remainder loops avoid a
# divide: q = (i * magic) >> (32 + k), then i - q * divisor, with the
# multiplication by the divisor expanded into shifts and adds.
#
# %rbx and %rbp are saved because they hold the cursor and end pointer
# of the printf loop, where they must survive the calls.
# Returns 0 in %eax.
# -----------------------------------------------------------------------
main:
.LFB11:
    .cfi_startproc
    pushq   %rbp    #
    .cfi_def_cfa_offset 16
    .cfi_offset 6, -16
# fix1.c:12:        a[i] = i % 500;
# xmm6 = all-zero vector, used below to build sign masks.
    pxor    %xmm6, %xmm6    # tmp120
# fix1.c:7: {
    pushq   %rbx    #
    .cfi_def_cfa_offset 24
    .cfi_offset 3, -24
# fix1.c:12:        a[i] = i % 500;
    movdqa  %xmm6, %xmm7    # tmp120, tmp124
# fix1.c:7: {
# Reserve the three arrays plus padding: 240024 bytes, two 8-byte pushes
# and the return address add up to the CFA offset of 240048 recorded below.
    subq    $240024, %rsp   #,
    .cfi_def_cfa_offset 240048
# xmm5 = magic multiplier for the divide-by-500 (.LC2).
    movdqa  .LC2(%rip), %xmm5   #, tmp200
# fix1.c:7: {
# xmm3 = starting lane indices (.LC0); xmm2 = per-iteration step (.LC1).
    movdqa  .LC0(%rip), %xmm3   #, vect_vec_iv_.9
    movdqa  .LC1(%rip), %xmm2   #, tmp199
# rax = store cursor starting at a[]; rdx = one past the end of a[].
    leaq    16(%rsp), %rax  #, ivtmp.49
    leaq    80016(%rsp), %rdx   #, _47
# fix1.c:12:        a[i] = i % 500;
# xmm7 = per-lane mask of (0 > magic), i.e. the sign extension of the magic.
    pcmpgtd %xmm5, %xmm7    # tmp200, tmp124
# fix1.c:7: {
# xmm4 = index vector for the first loop; xmm3 is kept for the second loop.
    movdqa  %xmm3, %xmm4    # vect_vec_iv_.9, vect_vec_iv_.16
    .p2align 4,,10
    .p2align 3
# --- Loop 1: a[i] = i % 500, four elements per iteration ---------------
# Invariants: xmm4 = current indices, xmm2 = step, xmm5 = magic,
#             xmm6 = 0, xmm7 = sign mask of magic.
.L2:
# fix1.c:12:        a[i] = i % 500;
# Signed 32x32->64 multiply built from unsigned pmuludq:
#   x*m = lo(x)*lo(m) + ((sign(x)*m + sign(m)*x) << 32)
# First for lanes 0/1 (punpckldq), then for lanes 2/3 (punpckhdq).
    movdqa  %xmm4, %xmm1    # vect_vec_iv_.16, tmp117
    movdqa  %xmm6, %xmm0    # tmp120, tmp121
    movdqa  %xmm7, %xmm9    # tmp124, tmp126
# The cursor is advanced early; the store at the loop end uses -16(%rax).
    addq    $16, %rax   #, ivtmp.49
    punpckldq   %xmm4, %xmm1    # vect_vec_iv_.16, tmp117
    pcmpgtd %xmm1, %xmm0    # tmp117, tmp121
    pmuludq %xmm1, %xmm9    # tmp117, tmp126
    movdqa  %xmm0, %xmm8    # tmp121, tmp125
    movdqa  %xmm1, %xmm0    # tmp117, tmp127
    movdqa  %xmm4, %xmm1    # vect_vec_iv_.16, tmp130
    pmuludq %xmm5, %xmm8    # tmp200, tmp125
    pmuludq %xmm5, %xmm0    # tmp200, tmp127
    punpckhdq   %xmm4, %xmm1    # vect_vec_iv_.16, tmp130
    paddq   %xmm9, %xmm8    # tmp126, tmp125
    movdqa  %xmm7, %xmm9    # tmp124, tmp139
    psllq   $32, %xmm8  #, tmp125
    pmuludq %xmm1, %xmm9    # tmp130, tmp139
    paddq   %xmm8, %xmm0    # tmp125, tmp115
    movdqa  %xmm6, %xmm8    # tmp120, tmp134
    pcmpgtd %xmm1, %xmm8    # tmp130, tmp134
    pmuludq %xmm5, %xmm1    # tmp200, tmp140
    pmuludq %xmm5, %xmm8    # tmp200, tmp138
    paddq   %xmm9, %xmm8    # tmp139, tmp138
    psllq   $32, %xmm8  #, tmp138
    paddq   %xmm8, %xmm1    # tmp138, tmp128
# Gather the high 32 bits of the four 64-bit products into xmm0.
    shufps  $221, %xmm1, %xmm0  #, tmp128, vect_patt_65.17
# q = product >> (32 + 5): the quotient i / 500.
    psrad   $5, %xmm0   #, vect_patt_66.18
# Build 500*q from shifts and adds: 32q - q = 31q, *4 = 124q, + q = 125q,
# *4 = 500q.
    movdqa  %xmm0, %xmm1    # vect_patt_66.18, tmp146
    pslld   $5, %xmm1   #, tmp146
    psubd   %xmm0, %xmm1    # vect_patt_66.18, tmp147
    pslld   $2, %xmm1   #, tmp148
    paddd   %xmm1, %xmm0    # tmp148, vect_patt_67.19
# xmm1 = copy of the current indices; then advance the index vector.
    movdqa  %xmm4, %xmm1    # vect_vec_iv_.16, vect_patt_68.20
    paddd   %xmm2, %xmm4    # tmp199, vect_vec_iv_.16
    pslld   $2, %xmm0   #, tmp150
# remainder = i - 500*q
    psubd   %xmm0, %xmm1    # tmp150, vect_patt_68.20
    movaps  %xmm1, -16(%rax)    # vect_patt_68.20, MEM[base: _49, offset: 0B]
    cmpq    %rdx, %rax  # _47, ivtmp.49
    jne .L2 #,
# --- Setup for loop 2: xmm4 is reused for the divide-by-200 magic (.LC3),
# xmm5 = 0, xmm6 = sign mask of the magic, rax/rdx = start/end of b[].
    movdqa  .LC3(%rip), %xmm4   #, tmp201
# fix1.c:16:        b[i] = i % 200;
    pxor    %xmm5, %xmm5    # tmp158
    leaq    80016(%rsp), %rax   #, tmp214
    movdqa  %xmm5, %xmm6    # tmp158, tmp162
    leaq    80000(%rax), %rdx   #, _4
    pcmpgtd %xmm4, %xmm6    # tmp201, tmp162
    .p2align 4,,10
    .p2align 3
# --- Loop 2: b[i] = i % 200, same scheme as loop 1 ----------------------
# Invariants: xmm3 = current indices, xmm2 = step, xmm4 = magic,
#             xmm5 = 0, xmm6 = sign mask of magic.
.L3:
# fix1.c:16:        b[i] = i % 200;
    movdqa  %xmm3, %xmm1    # vect_vec_iv_.9, tmp155
    movdqa  %xmm5, %xmm0    # tmp158, tmp159
    movdqa  %xmm6, %xmm8    # tmp162, tmp164
    addq    $16, %rax   #, ivtmp.43
    punpckldq   %xmm3, %xmm1    # vect_vec_iv_.9, tmp155
    pcmpgtd %xmm1, %xmm0    # tmp155, tmp159
    pmuludq %xmm1, %xmm8    # tmp155, tmp164
    movdqa  %xmm0, %xmm7    # tmp159, tmp163
    movdqa  %xmm1, %xmm0    # tmp155, tmp165
    movdqa  %xmm3, %xmm1    # vect_vec_iv_.9, tmp168
    pmuludq %xmm4, %xmm7    # tmp201, tmp163
    pmuludq %xmm4, %xmm0    # tmp201, tmp165
    punpckhdq   %xmm3, %xmm1    # vect_vec_iv_.9, tmp168
    paddq   %xmm8, %xmm7    # tmp164, tmp163
    movdqa  %xmm6, %xmm8    # tmp162, tmp177
    psllq   $32, %xmm7  #, tmp163
    pmuludq %xmm1, %xmm8    # tmp168, tmp177
    paddq   %xmm7, %xmm0    # tmp163, tmp153
    movdqa  %xmm5, %xmm7    # tmp158, tmp172
    pcmpgtd %xmm1, %xmm7    # tmp168, tmp172
    pmuludq %xmm4, %xmm1    # tmp201, tmp178
    pmuludq %xmm4, %xmm7    # tmp201, tmp176
    paddq   %xmm8, %xmm7    # tmp177, tmp176
    psllq   $32, %xmm7  #, tmp176
    paddq   %xmm7, %xmm1    # tmp176, tmp166
# xmm7 = copy of the current indices; then advance the index vector.
    movdqa  %xmm3, %xmm7    # vect_vec_iv_.9, vect_patt_50.13
    paddd   %xmm2, %xmm3    # tmp199, vect_vec_iv_.9
# High halves of the four products, then q = product >> (32 + 6) = i / 200.
    shufps  $221, %xmm1, %xmm0  #, tmp166, vect_patt_47.10
    psrad   $6, %xmm0   #, vect_patt_48.11
# Build 200*q: 2q + q = 3q, *8 = 24q, + q = 25q, *8 = 200q.
    movdqa  %xmm0, %xmm1    # vect_patt_48.11, tmp184
    pslld   $1, %xmm1   #, tmp184
    paddd   %xmm0, %xmm1    # vect_patt_48.11, tmp185
    pslld   $3, %xmm1   #, tmp186
    paddd   %xmm1, %xmm0    # tmp186, vect_patt_49.12
    pslld   $3, %xmm0   #, tmp188
# remainder = i - 200*q
    psubd   %xmm0, %xmm7    # tmp188, vect_patt_50.13
    movaps  %xmm7, -16(%rax)    # vect_patt_50.13, MEM[base: _10, offset: 0B]
    cmpq    %rdx, %rax  # _4, ivtmp.43
    jne .L3 #,
# --- Loop 3: c[i] = a[i] + b[i]; rax = byte offset, 16 bytes per pass ---
    xorl    %eax, %eax  # ivtmp.34
    .p2align 4,,10
    .p2align 3
.L4:
# fix1.c:23:        c[i] = a[i] + b[i];
# Load four ints from b[] (frame offset 80016) and four from a[] (offset 16).
    movdqa  80016(%rsp,%rax), %xmm0 # MEM[symbol: b, index: ivtmp.34_1, offset: 0B], MEM[symbol: a, index: ivtmp.34_1, offset: 0B]
    movdqa  16(%rsp,%rax), %xmm2    # MEM[symbol: a, index: ivtmp.34_1, offset: 0B], MEM[symbol: a, index: ivtmp.34_1, offset: 0B]
    paddd   %xmm2, %xmm0    # MEM[symbol: a, index: ivtmp.34_1, offset: 0B], MEM[symbol: a, index: ivtmp.34_1, offset: 0B]
# Copy of the a[] vector written to the scratch slot at the frame base;
# nothing in this listing reads it back.
    movaps  %xmm2, (%rsp)   # MEM[symbol: a, index: ivtmp.34_1, offset: 0B], %sfp
# fix1.c:23:        c[i] = a[i] + b[i];
    movaps  %xmm0, 160016(%rsp,%rax)    # vect__5.6, MEM[symbol: c, index: ivtmp.34_1, offset: 0B]
    addq    $16, %rax   #, ivtmp.34
    cmpq    $80000, %rax    #, ivtmp.34
    jne .L4 #,
# --- Loop 4: print c[]; rbx = cursor into c[], rbp = one past its end ---
    leaq    160016(%rsp), %rbx  #, tmp229
    leaq    240016(%rsp), %rbp  #, _39
    .p2align 4,,10
    .p2align 3
.L5:
# fix1.c:27:        printf("%d\n", c[i]);
# esi = c[i], edi = address of the "%d\n" string (.LC4); eax is cleared
# because printf is variadic and no vector registers carry arguments.
    movl    (%rbx), %esi    # MEM[base: _40, offset: 0B],
    movl    $.LC4, %edi #,
    xorl    %eax, %eax  #
    addq    $4, %rbx    #, ivtmp.29
    call    printf  #
# fix1.c:26:    for (int i = 0; i < N; i++) {
    cmpq    %rbx, %rbp  # ivtmp.29, _39
    jne .L5 #,
# fix1.c:31: }
# Epilogue: release the frame, set the return value 0, restore saved regs.
    addq    $240024, %rsp   #,
    .cfi_def_cfa_offset 24
    xorl    %eax, %eax  #
    popq    %rbx    #
    .cfi_def_cfa_offset 16
    popq    %rbp    #
    .cfi_def_cfa_offset 8
    ret
    .cfi_endproc
.LFE11:
    .size   main, .-main
    # 16-byte-aligned vector constants (four 32-bit lanes each), loaded
    # with movdqa by main.
    .section    .rodata.cst16,"aM",@progbits,16
    .align 16
# Starting lane indices {0, 1, 2, 3} for the vectorized loop counters.
.LC0:
    .long   0
    .long   1
    .long   2
    .long   3
    .align 16
# Per-iteration step added to the lane indices: four elements per pass.
.LC1:
    .long   4
    .long   4
    .long   4
    .long   4
    .align 16
# Magic multiplier for division by 500: 274877907 = ceil(2^37 / 500).
# main takes the high 32 bits of the product and shifts right by 5.
.LC2:
    .long   274877907
    .long   274877907
    .long   274877907
    .long   274877907
    .align 16
# Magic multiplier for division by 200: 1374389535 = ceil(2^38 / 200).
# main takes the high 32 bits of the product and shifts right by 6.
.LC3:
    .long   1374389535
    .long   1374389535
    .long   1374389535
    .long   1374389535
    .ident  "GCC: (GNU) 8.3.1 20190223 (Red Hat 8.3.1-2)"
    .section    .note.GNU-stack,"",@progbits
    .file   "fix1.c"
#GNU C17(GCC)版本8.3.1 20190223(Red Hat 8.3.1-2)(x86_64-redhat-linux)
#由GNU C版本8.3.1 20190223(Red Hat 8.3.1-2)、GMP版本6.1.2、MPFR版本3.1.6-p2、MPC版本1.1.0、isl版本无编译
# GGC启发式:--param ggc-min-expand=100 --param ggc-min-heapsize=131072
# 传递的选项:fix1.c -mtune=generic -march=x86-64
# -auxbase-strip gvec.s -O3 -fverbose-asm
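# 已启用的选项:-faggressive-loop-optimizations -falign-labels
# -fasynchronous-unwind-tables -fauto-inc-dec -fbranch-count-reg
# -fcaller-saves -fchkp-check-incomplete-type -fchkp-check-read
# -fchkp-check-write -fchkp-instrument-calls -fchkp-narrow-bounds
# -fchkp-optimize -fchkp-store-bounds -fchkp-use-static-bounds
# -fchkp-use-static-const-bounds -fchkp-use-wrappers -fcode-hoisting
# -fcombine-stack-adjustments -fcommon -fcompare-elim -fcprop-registers
# -fcrossjumping -fcse-follow-jumps -fdefer-pop
# -fdelete-null-pointer-checks -fdevirtualize -fdevirtualize-speculatively
# -fdwarf2-cfi-asm -fearly-inlining -feliminate-unused-debug-types
# -fexpensive-optimizations -fforward-propagate -ffp-int-builtin-inexact
# -ffunction-cse -fgcse -fgcse-after-reload -fgcse-lm -fgnu-runtime
# -fgnu-unique -fguess-branch-probability -fhoist-adjacent-loads -fident
# -fif-conversion -fif-conversion2 -findirect-inlining -finline
# -finline-atomics -finline-functions -finline-functions-called-once
# -finline-small-functions -fipa-bit-cp -fipa-cp -fipa-cp-clone -fipa-icf
# -fipa-icf-functions -fipa-icf-variables -fipa-profile -fipa-pure-const
# -fipa-ra -fipa-reference -fipa-sra -fipa-vrp -fira-hoist-pressure
# -fira-share-save-slots -fira-share-spill-slots
# -fisolate-erroneous-paths-dereference -fivopts -fkeep-static-consts
# -fleading-underscore -flifetime-dse -floop-interchange
# -floop-unroll-and-jam -flra-remat -flto-odr-type-merging -fmath-errno
# -fmerge-constants -fmerge-debug-strings -fmove-loop-invariants
# -fomit-frame-pointer -foptimize-sibling-calls -foptimize-strlen
# -fpartial-inlining -fpeel-loops -fpeephole -fpeephole2 -fplt
# -fpredictive-commoning -fprefetch-loop-arrays -free -freg-struct-return
# -freorder-blocks -freorder-blocks-and-partition -freorder-functions
# -frerun-cse-after-loop -fsched-critical-path-heuristic
# -fsched-dep-count-heuristic -fsched-group-heuristic -fsched-interblock
# -fsched-last-insn-heuristic -fsched-rank-heuristic -fsched-spec
# -fsched-spec-insn-heuristic -fsched-stalled-insns-dep -fschedule-fusion
# -fschedule-insns2 -fsemantic-interposition -fshow-column -fshrink-wrap
# -fshrink-wrap-separate -fsigned-zeros -fsplit-ivs-in-unroller
# -fsplit-loops -fsplit-paths -fsplit-wide-types -fssa-backprop
# -fssa-phiopt -fstdarg-opt -fstore-merging -fstrict-aliasing
# -fstrict-volatile-bitfields -fsync-libcalls -fthread-jumps
# -ftoplevel-reorder -ftrapping-math -ftree-bit-ccp -ftree-builtin-call-dce
# -ftree-ccp -ftree-ch -ftree-coalesce-vars -ftree-copy-prop -ftree-cselim
# -ftree-dce -ftree-dominator-opts -ftree-dse -ftree-forwprop -ftree-fre
# -ftree-loop-distribute-patterns -ftree-loop-distribution
# -ftree-loop-if-convert -ftree-loop-im -ftree-loop-ivcanon
# -ftree-loop-optimize -ftree-loop-vectorize -ftree-parallelize-loops=
# -ftree-partial-pre -ftree-phiprop -ftree-pre -ftree-pta -ftree-reassoc
# -ftree-scev-cprop -ftree-sink -ftree-slp-vectorize -ftree-slsr -ftree-sra
# -ftree-switch-conversion -ftree-tail-merge -ftree-ter -ftree-vrp
# -funit-at-a-time -funswitch-loops -funwind-tables -fverbose-asm
# -fzero-initialized-in-bss -m128bit-long-double -m64 -m80387
# -malign-stringops -mavx256-split-unaligned-load
# -mavx256-split-unaligned-store -mfancy-math-387 -mfp-ret-in-387 -mfxsr
# -mglibc -mieee-fp -mlong-double-80 -mmmx -mno-sse4 -mpush-args -mred-zone
# -msse -msse2 -mstv -mtls-direct-seg-refs -mvzeroupper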
    .text
    .section    .rodata.str1.1,"aMS",@progbits,1
.LC4:
    .string "%d\n"
    .section    .text.startup,"ax",@progbits
    .p2align 4,,15
    .globl  main
    .type   main, @function
main:
.LFB11:
    .cfi_startproc
    pushq   %rbp    #
    .cfi_def_cfa_offset 16
    .cfi_offset 6, -16
# fix1.c:12:        a[i] = i % 500;
    pxor    %xmm6, %xmm6    # tmp120
# fix1.c:7: {
    pushq   %rbx    #
    .cfi_def_cfa_offset 24
    .cfi_offset 3, -24
# fix1.c:12:        a[i] = i % 500;
    movdqa  %xmm6, %xmm7    # tmp120, tmp124
# fix1.c:7: {
    subq    $240024, %rsp   #,
    .cfi_def_cfa_offset 240048
    movdqa  .LC2(%rip), %xmm5   #, tmp200
# fix1.c:7: {
    movdqa  .LC0(%rip), %xmm3   #, vect_vec_iv_.9
    movdqa  .LC1(%rip), %xmm2   #, tmp199
    leaq    16(%rsp), %rax  #, ivtmp.49
    leaq    80016(%rsp), %rdx   #, _47
# fix1.c:12:        a[i] = i % 500;
    pcmpgtd %xmm5, %xmm7    # tmp200, tmp124
# fix1.c:7: {
    movdqa  %xmm3, %xmm4    # vect_vec_iv_.9, vect_vec_iv_.16
    .p2align 4,,10
    .p2align 3
.L2:
# fix1.c:12:        a[i] = i % 500;
    movdqa  %xmm4, %xmm1    # vect_vec_iv_.16, tmp117
    movdqa  %xmm6, %xmm0    # tmp120, tmp121
    movdqa  %xmm7, %xmm9    # tmp124, tmp126
    addq    $16, %rax   #, ivtmp.49
    punpckldq   %xmm4, %xmm1    # vect_vec_iv_.16, tmp117
    pcmpgtd %xmm1, %xmm0    # tmp117, tmp121
    pmuludq %xmm1, %xmm9    # tmp117, tmp126
    movdqa  %xmm0, %xmm8    # tmp121, tmp125
    movdqa  %xmm1, %xmm0    # tmp117, tmp127
    movdqa  %xmm4, %xmm1    # vect_vec_iv_.16, tmp130
    pmuludq %xmm5, %xmm8    # tmp200, tmp125
    pmuludq %xmm5, %xmm0    # tmp200, tmp127