LLVM vectorization of a reduction loop

I am trying to auto-vectorize a reduction loop with LLVM, but it fails even on the same example code given in their documentation:


static void iadd(int &R, Vector &A) {
    unsigned sum = 0;
    int a;

    for (int i = 0; i < A.vector_elements_16; i++) {
        a = static_cast<int>(A.data_16[i]);
        sum += a;
    }

    R = static_cast<int>(sum);
}

The Vector class has a static array of width 32 and type uint16_t, so the iteration count is 32.
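
The full Vector class isn't shown here; a simplified sketch of the relevant parts looks roughly like this (the exact declarations may differ):

#include <cstdint>

// Simplified sketch of Vector -- only the members used in iadd() are shown,
// and the real declarations may differ.
class Vector {
public:
    static const int vector_elements_16 = 32;  // loop bound, so 32 iterations
    uint16_t data_16[32];                      // array of 32 uint16_t elements
};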


Solution

  • There isn't really enough information in the question to answer it confidently. But I assure you that LLVM and Clang will vectorize reduction loops at least at the top of tree (I haven't checked what old releases do).

    The first problem is that vectorization really depends on the architecture. I'm going to use x86-64 and the Haswell microarchitecture (supporting AVX2) in my examples since you didn't list a particular architecture. I'm happy to update my answer for some other architecture if you specify.
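
    For reference, the assembly further down was produced by an invocation along these lines (the file name is just a placeholder; the parts that matter are -O3 and -march=haswell). Clang's -Rpass=loop-vectorize and -Rpass-missed=loop-vectorize remarks are also handy for seeing which loops it did or didn't vectorize on your own target:

    clang++ -O3 -march=haswell -S reduce.cpp -o reduce.s
    clang++ -O3 -march=haswell -Rpass=loop-vectorize -Rpass-missed=loop-vectorize -c reduce.cpp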

    The next problem is that your description doesn't sound like a true reduction loop. First off, if the array is static, then I don't really know what this is about -- that's a global. But assuming you meant a member array of length 32, that should be equivalent to the following (somewhat simplified) code that is complete and compiles:

    struct V {
      static constexpr int length = 32;
      unsigned short data[32];
    };
    
    int reduce(V &v) {
      int sum = 0;
      for (int i = 0; i < v.length; ++i)
        sum += static_cast<int>(v.data[i]);
      return sum;
    }
    

    Given this code though, because the length is a constant, LLVM fully unrolls the loop, which means no loop vectorization will come into play. Sadly, we actually generate really terrible code for the unrolled loop -- 32 loads and 32 adds. We could do much, much better. I've filed http://llvm.org/PR28090 to track fixing this.
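
    If you want to experiment with keeping the loop form rather than the full unroll, Clang's loop pragmas are one knob to try. I haven't verified whether they change the generated code for this particular constant-count loop, so treat this purely as an experiment:

    struct V {
      static constexpr int length = 32;
      unsigned short data[32];
    };

    int reduce(V &v) {
      int sum = 0;
      // Ask Clang to keep the loop and consider vectorizing it; whether this
      // beats the unrolled code is something you'd have to measure.
      #pragma clang loop vectorize(enable)
      #pragma clang loop unroll(disable)
      for (int i = 0; i < v.length; ++i)
        sum += static_cast<int>(v.data[i]);
      return sum;
    }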

    But this isn't really a reduction loop any more because it has a constant trip count and gets unrolled. If you genuinely have a loop such as the following code:

    struct V {
      int length;
      unsigned short *data;
    };
    
    int reduce(V &v) {
      int sum = 0;
      for (int i = 0; i < v.length; ++i)
        sum += static_cast<int>(v.data[i]);
      return sum;
    }
    

    Then LLVM will actually vectorize this very nicely for Haswell. It loads 8 elements into a vector, extends them to 32-bit values, and sums them. It also does 4 of these vectors at a time to fully use the bandwidth of the architecture. Check out the code here:

    _Z6reduceR1V:                           # @_Z6reduceR1V
      .cfi_startproc
    # BB#0:                                 # %entry
      movslq        (%rdi), %rcx
      xorl          %eax, %eax
      testq         %rcx, %rcx
      jle           .LBB0_11
    # BB#1:                                 # %for.body.lr.ph
      movq          8(%rdi), %rdx
      xorl          %edi, %edi
      movl          $0, %eax
      cmpl          $31, %ecx
      jbe           .LBB0_10
    # BB#2:                                 # %min.iters.checked
      xorl          %edi, %edi
      movq          %rcx, %r9
      movl          $0, %eax
      andq          $-32, %r9
      je            .LBB0_10
    # BB#3:                                 # %vector.body.preheader
      leaq          -32(%r9), %rsi
      shrq          $5, %rsi
      leal          1(%rsi), %r8d
      andl          $1, %r8d
      xorl          %eax, %eax
      testq         %rsi, %rsi
      je            .LBB0_4
    # BB#5:                                 # %vector.body.preheader.new
      leaq          -1(%r8), %rdi
      subq          %rsi, %rdi
      vpxor         %ymm0, %ymm0, %ymm0
      xorl          %eax, %eax
      vpxor         %ymm1, %ymm1, %ymm1
      vpxor         %ymm2, %ymm2, %ymm2
      vpxor         %ymm3, %ymm3, %ymm3
      .p2align      4, 0x90
    .LBB0_6:                                # %vector.body
                                            # =>This Inner Loop Header: Depth=1
      vpmovzxwd     (%rdx,%rax,2), %ymm4    # ymm4 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero
      vpmovzxwd     16(%rdx,%rax,2), %ymm5  # ymm5 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero
      vpmovzxwd     32(%rdx,%rax,2), %ymm6  # ymm6 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero
      vpmovzxwd     48(%rdx,%rax,2), %ymm7  # ymm7 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero
      vpaddd        %ymm0, %ymm4, %ymm0
      vpaddd        %ymm1, %ymm5, %ymm1
      vpaddd        %ymm2, %ymm6, %ymm2
      vpaddd        %ymm3, %ymm7, %ymm3
      vpmovzxwd     64(%rdx,%rax,2), %ymm4  # ymm4 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero
      vpmovzxwd     80(%rdx,%rax,2), %ymm5  # ymm5 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero
      vpmovzxwd     96(%rdx,%rax,2), %ymm6  # ymm6 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero
      vpmovzxwd     112(%rdx,%rax,2), %ymm7 # ymm7 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero
      leaq          64(%rax), %rax
      vpaddd        %ymm0, %ymm4, %ymm0
      vpaddd        %ymm1, %ymm5, %ymm1
      vpaddd        %ymm2, %ymm6, %ymm2
      vpaddd        %ymm3, %ymm7, %ymm3
      addq          $2, %rdi
      jne           .LBB0_6
      jmp           .LBB0_7
    .LBB0_4:
      vpxor         %ymm0, %ymm0, %ymm0
      vpxor         %ymm1, %ymm1, %ymm1
      vpxor         %ymm2, %ymm2, %ymm2
      vpxor         %ymm3, %ymm3, %ymm3
    .LBB0_7:                                # %middle.block.unr-lcssa
      testq         %r8, %r8
      je            .LBB0_9
    # BB#8:                                 # %middle.block.epilog-lcssa
      vpmovzxwd     48(%rdx,%rax,2), %ymm4  # ymm4 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero
      vpaddd        %ymm3, %ymm4, %ymm3
      vpmovzxwd     32(%rdx,%rax,2), %ymm4  # ymm4 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero
      vpaddd        %ymm2, %ymm4, %ymm2
      vpmovzxwd     16(%rdx,%rax,2), %ymm4  # ymm4 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero
      vpaddd        %ymm1, %ymm4, %ymm1
      vpmovzxwd     (%rdx,%rax,2), %ymm4    # ymm4 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero
      vpaddd        %ymm0, %ymm4, %ymm0
    .LBB0_9:                                # %middle.block
      vpaddd        %ymm0, %ymm1, %ymm0
      vpaddd        %ymm0, %ymm2, %ymm0
      vpaddd        %ymm0, %ymm3, %ymm0
      vextracti128  $1, %ymm0, %xmm1
      vpaddd        %ymm1, %ymm0, %ymm0
      vpshufd       $78, %xmm0, %xmm1       # xmm1 = xmm0[2,3,0,1]
      vpaddd        %ymm1, %ymm0, %ymm0
      vphaddd       %ymm0, %ymm0, %ymm0
      vmovd         %xmm0, %eax
      movq          %r9, %rdi
      cmpq          %r9, %rcx
      je            .LBB0_11
      .p2align      4, 0x90
    .LBB0_10:                               # %for.body
                                            # =>This Inner Loop Header: Depth=1
      movzwl        (%rdx,%rdi,2), %esi
      addl          %esi, %eax
      addq          $1, %rdi
      cmpq          %rcx, %rdi
      jl            .LBB0_10
    .LBB0_11:                               # %for.cond.cleanup
      vzeroupper
      retq
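
    If you don't read x86 assembly fluently, here is a rough hand-written equivalent of what that vector body and the final reduction do, using AVX2 intrinsics. This is my own sketch for illustration, not compiler output, and all the names are made up:

    #include <immintrin.h>  // compile with -mavx2 or -march=haswell
    #include <cstdint>

    // Rough equivalent of the vectorized loop above, written with AVX2
    // intrinsics purely for illustration; the compiler's actual output also
    // unrolls this body by two and has a different epilogue structure.
    int reduce_avx2_sketch(const uint16_t *data, int length) {
      // Four independent accumulators, like the four vpxor-initialized ymm registers.
      __m256i acc0 = _mm256_setzero_si256();
      __m256i acc1 = _mm256_setzero_si256();
      __m256i acc2 = _mm256_setzero_si256();
      __m256i acc3 = _mm256_setzero_si256();
      int i = 0;
      for (; i + 32 <= length; i += 32) {
        // vpmovzxwd: load 8 u16 values and zero-extend them to 8 i32 lanes, four times.
        __m256i v0 = _mm256_cvtepu16_epi32(_mm_loadu_si128((const __m128i *)(data + i)));
        __m256i v1 = _mm256_cvtepu16_epi32(_mm_loadu_si128((const __m128i *)(data + i + 8)));
        __m256i v2 = _mm256_cvtepu16_epi32(_mm_loadu_si128((const __m128i *)(data + i + 16)));
        __m256i v3 = _mm256_cvtepu16_epi32(_mm_loadu_si128((const __m128i *)(data + i + 24)));
        // vpaddd: accumulate each vector into its own running sum.
        acc0 = _mm256_add_epi32(acc0, v0);
        acc1 = _mm256_add_epi32(acc1, v1);
        acc2 = _mm256_add_epi32(acc2, v2);
        acc3 = _mm256_add_epi32(acc3, v3);
      }
      // middle.block: combine the four accumulators, then reduce 8 lanes to 1.
      __m256i acc = _mm256_add_epi32(_mm256_add_epi32(acc0, acc1),
                                     _mm256_add_epi32(acc2, acc3));
      __m128i lo = _mm256_castsi256_si128(acc);
      __m128i hi = _mm256_extracti128_si256(acc, 1);
      __m128i r  = _mm_add_epi32(lo, hi);
      r = _mm_add_epi32(r, _mm_shuffle_epi32(r, 0x4E)); // add the two 64-bit halves
      r = _mm_add_epi32(r, _mm_shuffle_epi32(r, 0xB1)); // add the two remaining lanes
      int sum = _mm_cvtsi128_si32(r);
      // Scalar tail, like the %for.body loop at .LBB0_10.
      for (; i < length; ++i)
        sum += data[i];
      return sum;
    }

    The four accumulators are the important design point: independent vpaddd chains let the adds overlap instead of serializing on a single register.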