Search code examples
c clang llvm vectorization simd

Generate SIMD code from C code using clang


I am trying to get SIMD code from a simple C program:

#include <stdio.h>

// Number of elements in each array.
// NOTE: in C (unlike C++) a `const int` object is not an integer constant
// expression, so the arrays sized by N inside main are formally
// variable-length arrays.
const int N=20000;

// Fills a[] and b[] with small periodic values, adds them element-wise into
// c[], and prints every element of c[] on its own line. Always returns 0.
// All three arrays are automatic (stack) storage: 3 * N ints, i.e. about
// 240 KB when int is 4 bytes.
int main()
{
    // Inputs: a[i] cycles through 0..499, b[i] cycles through 0..199.
    int a[N], b[N]; 
    for(int i=0; i<N; i++){
        a[i]= i %500;
    }
    
    for(int i=0; i<N; i++){
        b[i]= i %200;
    }
    
    
    // Output: element-wise sum of the two inputs.
    int c[N]; 

    // This is the loop the question wants vectorized: the iterations are
    // independent and a, b, c are distinct arrays.
    for(int i=0;i<N;i++) 
    { 
         c[i]=a[i]+b[i]; 
    } 

    // Printing consumes every element of c[], so the loops above have an
    // observable result.
    for(int i=0;i<N;i++) 
    { 
        printf("%d\n",c[i]);
    }  

    return 0;
}

First I disable the loop vectorizer through clang using the command line flag and generate assembly code:

clang -S  -fno-vectorize  sum_vec.c -o sum_scalar.s

Now I set the vectorization SIMD width using the command line flag -force-vector-width and generate assembly code:

clang -S  -mllvm -force-vector-width=8  sum_vec.c -o sum_simd.s

However, the generated code is scalar in both cases. How can I generate SIMD code?


Solution

  • can you post your code? – muiloo

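    The commands in the question do not pass any optimization level, so the compiler runs at its default of -O0, where the loop vectorizer does not run at all; the vectorization flags therefore have no effect. Add -O2 or -O3. For example, with gcc 8.3.1 and cc -O3 -S -o gvec.s -fverbose-asm fix1.c [I changed your const int into an enum], the three arithmetic loops are vectorized with SSE2: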

        .file   "fix1.c"
    # GNU C17 (GCC) version 8.3.1 20190223 (Red Hat 8.3.1-2) (x86_64-redhat-linux)
    #   compiled by GNU C version 8.3.1 20190223 (Red Hat 8.3.1-2), GMP version 6.1.2, MPFR version 3.1.6-p2, MPC version 1.1.0, isl version none
    # GGC heuristics: --param ggc-min-expand=100 --param ggc-min-heapsize=131072
    # options passed:  fix1.c -mtune=generic -march=x86-64
    # -auxbase-strip gvec.s -O3 -fverbose-asm
    # options enabled:  -faggressive-loop-optimizations -falign-labels
    # -fasynchronous-unwind-tables -fauto-inc-dec -fbranch-count-reg
    # -fcaller-saves -fchkp-check-incomplete-type -fchkp-check-read
    # -fchkp-check-write -fchkp-instrument-calls -fchkp-narrow-bounds
    # -fchkp-optimize -fchkp-store-bounds -fchkp-use-static-bounds
    # -fchkp-use-static-const-bounds -fchkp-use-wrappers -fcode-hoisting
    # -fcombine-stack-adjustments -fcommon -fcompare-elim -fcprop-registers
    # -fcrossjumping -fcse-follow-jumps -fdefer-pop
    # -fdelete-null-pointer-checks -fdevirtualize -fdevirtualize-speculatively
    # -fdwarf2-cfi-asm -fearly-inlining -feliminate-unused-debug-types
    # -fexpensive-optimizations -fforward-propagate -ffp-int-builtin-inexact
    # -ffunction-cse -fgcse -fgcse-after-reload -fgcse-lm -fgnu-runtime
    # -fgnu-unique -fguess-branch-probability -fhoist-adjacent-loads -fident
    # -fif-conversion -fif-conversion2 -findirect-inlining -finline
    # -finline-atomics -finline-functions -finline-functions-called-once
    # -finline-small-functions -fipa-bit-cp -fipa-cp -fipa-cp-clone -fipa-icf
    # -fipa-icf-functions -fipa-icf-variables -fipa-profile -fipa-pure-const
    # -fipa-ra -fipa-reference -fipa-sra -fipa-vrp -fira-hoist-pressure
    # -fira-share-save-slots -fira-share-spill-slots
    # -fisolate-erroneous-paths-dereference -fivopts -fkeep-static-consts
    # -fleading-underscore -flifetime-dse -floop-interchange
    # -floop-unroll-and-jam -flra-remat -flto-odr-type-merging -fmath-errno
    # -fmerge-constants -fmerge-debug-strings -fmove-loop-invariants
    # -fomit-frame-pointer -foptimize-sibling-calls -foptimize-strlen
    # -fpartial-inlining -fpeel-loops -fpeephole -fpeephole2 -fplt
    # -fpredictive-commoning -fprefetch-loop-arrays -free -freg-struct-return
    # -freorder-blocks -freorder-blocks-and-partition -freorder-functions
    # -frerun-cse-after-loop -fsched-critical-path-heuristic
    # -fsched-dep-count-heuristic -fsched-group-heuristic -fsched-interblock
    # -fsched-last-insn-heuristic -fsched-rank-heuristic -fsched-spec
    # -fsched-spec-insn-heuristic -fsched-stalled-insns-dep -fschedule-fusion
    # -fschedule-insns2 -fsemantic-interposition -fshow-column -fshrink-wrap
    # -fshrink-wrap-separate -fsigned-zeros -fsplit-ivs-in-unroller
    # -fsplit-loops -fsplit-paths -fsplit-wide-types -fssa-backprop
    # -fssa-phiopt -fstdarg-opt -fstore-merging -fstrict-aliasing
    # -fstrict-volatile-bitfields -fsync-libcalls -fthread-jumps
    # -ftoplevel-reorder -ftrapping-math -ftree-bit-ccp -ftree-builtin-call-dce
    # -ftree-ccp -ftree-ch -ftree-coalesce-vars -ftree-copy-prop -ftree-cselim
    # -ftree-dce -ftree-dominator-opts -ftree-dse -ftree-forwprop -ftree-fre
    # -ftree-loop-distribute-patterns -ftree-loop-distribution
    # -ftree-loop-if-convert -ftree-loop-im -ftree-loop-ivcanon
    # -ftree-loop-optimize -ftree-loop-vectorize -ftree-parallelize-loops=
    # -ftree-partial-pre -ftree-phiprop -ftree-pre -ftree-pta -ftree-reassoc
    # -ftree-scev-cprop -ftree-sink -ftree-slp-vectorize -ftree-slsr -ftree-sra
    # -ftree-switch-conversion -ftree-tail-merge -ftree-ter -ftree-vrp
    # -funit-at-a-time -funswitch-loops -funwind-tables -fverbose-asm
    # -fzero-initialized-in-bss -m128bit-long-double -m64 -m80387
    # -malign-stringops -mavx256-split-unaligned-load
    # -mavx256-split-unaligned-store -mfancy-math-387 -mfp-ret-in-387 -mfxsr
    # -mglibc -mieee-fp -mlong-double-80 -mmmx -mno-sse4 -mpush-args -mred-zone
    # -msse -msse2 -mstv -mtls-direct-seg-refs -mvzeroupper
    
        .text
    ## Read-only string section ("aMS" = allocatable, mergeable, strings)
    ## holding the printf format string that the print loop in main loads
    ## into %edi.
        .section    .rodata.str1.1,"aMS",@progbits,1
    .LC4:
        .string "%d\n"
        .section    .text.startup,"ax",@progbits
        .p2align 4,,15
        .globl  main
        .type   main, @function
    ## -------------------------------------------------------------------
    ## int main(void) -- GCC 8.3.1 -O3 output, AT&T syntax, x86-64.
    ## Only SSE2 instructions are used (the option dump above lists -msse2
    ## and -mno-sse4).
    ##
    ## Stack frame after the prologue (byte offsets from %rsp):
    ##        0 ..     15   16-byte spill slot (%sfp)
    ##       16 ..  80015   a[20000]
    ##    80016 .. 160015   b[20000]
    ##   160016 .. 240015   c[20000]
    ##
    ## Loops:
    ##   .L2  a[i] = i % 500      4 ints per iteration (vectorized)
    ##   .L3  b[i] = i % 200      4 ints per iteration (vectorized)
    ##   .L4  c[i] = a[i] + b[i]  4 ints per iteration (vectorized)
    ##   .L5  printf of c[i]      1 int per iteration (scalar)
    ##
    ## %rbx / %rbp are callee-saved; they are saved here because .L5 keeps
    ## its cursor and end pointer in them across the calls to printf.
    ## Returns 0 in %eax.
    ## -------------------------------------------------------------------
    main:
    .LFB11:
        .cfi_startproc
        pushq   %rbp    #
        .cfi_def_cfa_offset 16
        .cfi_offset 6, -16
    # fix1.c:12:        a[i] = i % 500;
    ## %xmm6 = all zeros; used as the left operand of the "0 > x" compares
    ## that build sign masks below.
        pxor    %xmm6, %xmm6    # tmp120
    # fix1.c:7: {
        pushq   %rbx    #
        .cfi_def_cfa_offset 24
        .cfi_offset 3, -24
    # fix1.c:12:        a[i] = i % 500;
        movdqa  %xmm6, %xmm7    # tmp120, tmp124
    # fix1.c:7: {
    ## Return address + two pushes + 240024 bytes = 240048, a multiple of
    ## 16, so %rsp is 16-byte aligned from here on. The aligned
    ## movaps/movdqa stack accesses below depend on that.
        subq    $240024, %rsp   #,
        .cfi_def_cfa_offset 240048
    ## %xmm5 = multiplier used to divide by 500 (see .LC2).
        movdqa  .LC2(%rip), %xmm5   #, tmp200
    # fix1.c:7: {
    ## %xmm3 = {0,1,2,3}: initial vector of loop indices (kept for .L3).
    ## %xmm2 = {4,4,4,4}: per-iteration index step.
        movdqa  .LC0(%rip), %xmm3   #, vect_vec_iv_.9
        movdqa  .LC1(%rip), %xmm2   #, tmp199
    ## %rax = &a[0] (store cursor), %rdx = one past the end of a[].
        leaq    16(%rsp), %rax  #, ivtmp.49
        leaq    80016(%rsp), %rdx   #, _47
    # fix1.c:12:        a[i] = i % 500;
    ## %xmm7 = (0 > multiplier) per lane, i.e. the multiplier's sign mask.
        pcmpgtd %xmm5, %xmm7    # tmp200, tmp124
    # fix1.c:7: {
    ## %xmm4 = working copy of the index vector for the .L2 loop.
        movdqa  %xmm3, %xmm4    # vect_vec_iv_.9, vect_vec_iv_.16
        .p2align 4,,10
        .p2align 3
    ## ---- .L2: a[i] = i % 500 for four consecutive i per iteration ----
    ## No divide instruction is used. For each lane:
    ##   q = high 32 bits of (i * 274877907), then arithmetic >> 5
    ##   a[i] = i - 500*q
    ## pmuludq is an unsigned 32x32->64 multiply and only handles two lanes
    ## at a time, so the indices are interleaved (punpckldq / punpckhdq)
    ## into two registers. The pcmpgtd sign masks are multiplied in and
    ## added to the upper half of each product -- presumably GCC's generic
    ## emulation of a signed widening multiply on SSE2.
    .L2:
    # fix1.c:12:        a[i] = i % 500;
        movdqa  %xmm4, %xmm1    # vect_vec_iv_.16, tmp117
        movdqa  %xmm6, %xmm0    # tmp120, tmp121
        movdqa  %xmm7, %xmm9    # tmp124, tmp126
        addq    $16, %rax   #, ivtmp.49
        punpckldq   %xmm4, %xmm1    # vect_vec_iv_.16, tmp117
        pcmpgtd %xmm1, %xmm0    # tmp117, tmp121
        pmuludq %xmm1, %xmm9    # tmp117, tmp126
        movdqa  %xmm0, %xmm8    # tmp121, tmp125
        movdqa  %xmm1, %xmm0    # tmp117, tmp127
        movdqa  %xmm4, %xmm1    # vect_vec_iv_.16, tmp130
        pmuludq %xmm5, %xmm8    # tmp200, tmp125
        pmuludq %xmm5, %xmm0    # tmp200, tmp127
        punpckhdq   %xmm4, %xmm1    # vect_vec_iv_.16, tmp130
        paddq   %xmm9, %xmm8    # tmp126, tmp125
        movdqa  %xmm7, %xmm9    # tmp124, tmp139
        psllq   $32, %xmm8  #, tmp125
        pmuludq %xmm1, %xmm9    # tmp130, tmp139
        paddq   %xmm8, %xmm0    # tmp125, tmp115
        movdqa  %xmm6, %xmm8    # tmp120, tmp134
        pcmpgtd %xmm1, %xmm8    # tmp130, tmp134
        pmuludq %xmm5, %xmm1    # tmp200, tmp140
        pmuludq %xmm5, %xmm8    # tmp200, tmp138
        paddq   %xmm9, %xmm8    # tmp139, tmp138
        psllq   $32, %xmm8  #, tmp138
        paddq   %xmm8, %xmm1    # tmp138, tmp128
    ## shufps $221 picks dwords 1 and 3 of each operand: the high 32 bits
    ## of the four 64-bit products, gathered into one register.
        shufps  $221, %xmm1, %xmm0  #, tmp128, vect_patt_65.17
    ## q = hi32 >> 5 (total shift of 37 bits, matching .LC2).
        psrad   $5, %xmm0   #, vect_patt_66.18
    ## 500*q from shifts and adds:
    ##   (q<<5)-q = 31q, <<2 = 124q, +q = 125q, <<2 = 500q.
        movdqa  %xmm0, %xmm1    # vect_patt_66.18, tmp146
        pslld   $5, %xmm1   #, tmp146
        psubd   %xmm0, %xmm1    # vect_patt_66.18, tmp147
        pslld   $2, %xmm1   #, tmp148
        paddd   %xmm1, %xmm0    # tmp148, vect_patt_67.19
    ## %xmm1 = current indices; the index vector is then advanced by 4.
        movdqa  %xmm4, %xmm1    # vect_vec_iv_.16, vect_patt_68.20
        paddd   %xmm2, %xmm4    # tmp199, vect_vec_iv_.16
        pslld   $2, %xmm0   #, tmp150
    ## remainder = i - 500*q, stored as four ints. %rax was already
    ## advanced by 16, hence the -16 displacement.
        psubd   %xmm0, %xmm1    # tmp150, vect_patt_68.20
        movaps  %xmm1, -16(%rax)    # vect_patt_68.20, MEM[base: _49, offset: 0B]
        cmpq    %rdx, %rax  # _47, ivtmp.49
        jne .L2 #,
    ## Setup for the second loop: %xmm4 = multiplier for division by 200
    ## (see .LC3), %xmm5 = zeros, %xmm6 = sign mask of the multiplier,
    ## %rax = &b[0], %rdx = one past the end of b[].
        movdqa  .LC3(%rip), %xmm4   #, tmp201
    # fix1.c:16:        b[i] = i % 200;
        pxor    %xmm5, %xmm5    # tmp158
        leaq    80016(%rsp), %rax   #, tmp214
        movdqa  %xmm5, %xmm6    # tmp158, tmp162
        leaq    80000(%rax), %rdx   #, _4
        pcmpgtd %xmm4, %xmm6    # tmp201, tmp162
        .p2align 4,,10
        .p2align 3
    ## ---- .L3: b[i] = i % 200, same scheme as .L2 ----
    ## The index vector is %xmm3 (still {0,1,2,3} on entry), the multiplier
    ## is 1374389535 and the final shift is 6 (38 bits in total).
    .L3:
    # fix1.c:16:        b[i] = i % 200;
        movdqa  %xmm3, %xmm1    # vect_vec_iv_.9, tmp155
        movdqa  %xmm5, %xmm0    # tmp158, tmp159
        movdqa  %xmm6, %xmm8    # tmp162, tmp164
        addq    $16, %rax   #, ivtmp.43
        punpckldq   %xmm3, %xmm1    # vect_vec_iv_.9, tmp155
        pcmpgtd %xmm1, %xmm0    # tmp155, tmp159
        pmuludq %xmm1, %xmm8    # tmp155, tmp164
        movdqa  %xmm0, %xmm7    # tmp159, tmp163
        movdqa  %xmm1, %xmm0    # tmp155, tmp165
        movdqa  %xmm3, %xmm1    # vect_vec_iv_.9, tmp168
        pmuludq %xmm4, %xmm7    # tmp201, tmp163
        pmuludq %xmm4, %xmm0    # tmp201, tmp165
        punpckhdq   %xmm3, %xmm1    # vect_vec_iv_.9, tmp168
        paddq   %xmm8, %xmm7    # tmp164, tmp163
        movdqa  %xmm6, %xmm8    # tmp162, tmp177
        psllq   $32, %xmm7  #, tmp163
        pmuludq %xmm1, %xmm8    # tmp168, tmp177
        paddq   %xmm7, %xmm0    # tmp163, tmp153
        movdqa  %xmm5, %xmm7    # tmp158, tmp172
        pcmpgtd %xmm1, %xmm7    # tmp168, tmp172
        pmuludq %xmm4, %xmm1    # tmp201, tmp178
        pmuludq %xmm4, %xmm7    # tmp201, tmp176
        paddq   %xmm8, %xmm7    # tmp177, tmp176
        psllq   $32, %xmm7  #, tmp176
        paddq   %xmm7, %xmm1    # tmp176, tmp166
    ## %xmm7 = current indices; the index vector is then advanced by 4.
        movdqa  %xmm3, %xmm7    # vect_vec_iv_.9, vect_patt_50.13
        paddd   %xmm2, %xmm3    # tmp199, vect_vec_iv_.9
        shufps  $221, %xmm1, %xmm0  #, tmp166, vect_patt_47.10
        psrad   $6, %xmm0   #, vect_patt_48.11
    ## 200*q from shifts and adds:
    ##   (q<<1)+q = 3q, <<3 = 24q, +q = 25q, <<3 = 200q.
        movdqa  %xmm0, %xmm1    # vect_patt_48.11, tmp184
        pslld   $1, %xmm1   #, tmp184
        paddd   %xmm0, %xmm1    # vect_patt_48.11, tmp185
        pslld   $3, %xmm1   #, tmp186
        paddd   %xmm1, %xmm0    # tmp186, vect_patt_49.12
        pslld   $3, %xmm0   #, tmp188
    ## remainder = i - 200*q, stored as four ints of b[].
        psubd   %xmm0, %xmm7    # tmp188, vect_patt_50.13
        movaps  %xmm7, -16(%rax)    # vect_patt_50.13, MEM[base: _10, offset: 0B]
        cmpq    %rdx, %rax  # _4, ivtmp.43
        jne .L3 #,
    ## %rax = byte offset into the arrays for the add loop, starting at 0.
        xorl    %eax, %eax  # ivtmp.34
        .p2align 4,,10
        .p2align 3
    ## ---- .L4: c[i] = a[i] + b[i], four ints (16 bytes) per iteration ----
    ## This is the SIMD form of the loop the question asks about: two
    ## aligned 128-bit loads, one paddd, one aligned 128-bit store.
    ## Runs until the byte offset reaches 80000 (20000 ints).
    .L4:
    # fix1.c:23:        c[i] = a[i] + b[i];
        movdqa  80016(%rsp,%rax), %xmm0 # MEM[symbol: b, index: ivtmp.34_1, offset: 0B], MEM[symbol: a, index: ivtmp.34_1, offset: 0B]
        movdqa  16(%rsp,%rax), %xmm2    # MEM[symbol: a, index: ivtmp.34_1, offset: 0B], MEM[symbol: a, index: ivtmp.34_1, offset: 0B]
        paddd   %xmm2, %xmm0    # MEM[symbol: a, index: ivtmp.34_1, offset: 0B], MEM[symbol: a, index: ivtmp.34_1, offset: 0B]
    ## Copies the a[] vector into the spill slot at (%rsp); nothing in this
    ## function reads that slot back.
        movaps  %xmm2, (%rsp)   # MEM[symbol: a, index: ivtmp.34_1, offset: 0B], %sfp
    # fix1.c:23:        c[i] = a[i] + b[i];
        movaps  %xmm0, 160016(%rsp,%rax)    # vect__5.6, MEM[symbol: c, index: ivtmp.34_1, offset: 0B]
        addq    $16, %rax   #, ivtmp.34
        cmpq    $80000, %rax    #, ivtmp.34
        jne .L4 #,
    ## %rbx = &c[0] (cursor), %rbp = one past the end of c[].
        leaq    160016(%rsp), %rbx  #, tmp229
        leaq    240016(%rsp), %rbp  #, _39
        .p2align 4,,10
        .p2align 3
    ## ---- .L5: printf("%d\n", c[i]) for each element (scalar) ----
    .L5:
    # fix1.c:27:        printf("%d\n", c[i]);
    ## %esi = c[i] (2nd argument), %edi = address of the format string
    ## (1st argument; an absolute 32-bit address, i.e. non-PIC code).
        movl    (%rbx), %esi    # MEM[base: _40, offset: 0B],
        movl    $.LC4, %edi #,
    ## %al = 0: no vector registers carry arguments to the variadic call.
        xorl    %eax, %eax  #
        addq    $4, %rbx    #, ivtmp.29
        call    printf  #
    # fix1.c:26:    for (int i = 0; i < N; i++) {
        cmpq    %rbx, %rbp  # ivtmp.29, _39
        jne .L5 #,
    # fix1.c:31: }
    ## Epilogue: release the frame, set the return value to 0, and restore
    ## the callee-saved registers in reverse push order.
        addq    $240024, %rsp   #,
        .cfi_def_cfa_offset 24
        xorl    %eax, %eax  #
        popq    %rbx    #
        .cfi_def_cfa_offset 16
        popq    %rbp    #
        .cfi_def_cfa_offset 8
        ret
        .cfi_endproc
    .LFE11:
        .size   main, .-main
        .section    .rodata.cst16,"aM",@progbits,16
    ## 16-byte-aligned vector constants, loaded by main with movdqa.
        .align 16
    ## .LC0: initial loop-index vector {0, 1, 2, 3}.
    .LC0:
        .long   0
        .long   1
        .long   2
        .long   3
        .align 16
    ## .LC1: index step {4, 4, 4, 4}, added once per vector iteration.
    .LC1:
        .long   4
        .long   4
        .long   4
        .long   4
        .align 16
    ## .LC2: 274877907 = ceil(2^37 / 500). Multiplying by it and shifting
    ## right by 37 bits (high dword, then psrad $5) yields i / 500.
    .LC2:
        .long   274877907
        .long   274877907
        .long   274877907
        .long   274877907
        .align 16
    ## .LC3: 1374389535 = ceil(2^38 / 200). Multiplying by it and shifting
    ## right by 38 bits (high dword, then psrad $6) yields i / 200.
    .LC3:
        .long   1374389535
        .long   1374389535
        .long   1374389535
        .long   1374389535
        .ident  "GCC: (GNU) 8.3.1 20190223 (Red Hat 8.3.1-2)"
        .section    .note.GNU-stack,"",@progbits