LLVM vectorization of a reduction loop

I am trying to auto-vectorize a reduction loop with LLVM, but it fails even on the same example code given in their documentation:


static void iadd(int &R, Vector &A) {
    unsigned sum = 0;
    int a;

    for (int i = 0; i < A.vector_elements_16; i++) {
        a = static_cast<int>(A.data_16[i]);
        sum += a;
    }

    R = static_cast<int>(sum);
}

The Vector class has a static array of width 32 and type uint16_t, so the iteration count is 32.
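
The full Vector class isn't shown here; a simplified sketch of the relevant parts looks roughly like this (the exact declarations may differ):

#include <cstdint>

// Simplified sketch of Vector -- only the members used in iadd() are shown,
// and the real declarations may differ.
class Vector {
public:
    static const int vector_elements_16 = 32;  // loop bound, so 32 iterations
    uint16_t data_16[32];                      // array of 32 uint16_t elements
};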


Solution

  • There isn't really enough information in the question to answer it confidently. But I assure you that LLVM and Clang will vectorize reduction loops at least at the top of tree (I haven't checked what old releases do).

    The first problem is that vectorization really depends on the architecture. I'm going to use x86-64 and the Haswell microarchitecture (supporting AVX2) in my examples since you didn't list a particular architecture. I'm happy to update my answer for some other architecture if you specify.
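
    For reference, the assembly further down was produced by an invocation along these lines (the file name is just a placeholder; the parts that matter are -O3 and -march=haswell). Clang's -Rpass=loop-vectorize and -Rpass-missed=loop-vectorize remarks are also handy for seeing which loops it did or didn't vectorize on your own target:

    clang++ -O3 -march=haswell -S reduce.cpp -o reduce.s
    clang++ -O3 -march=haswell -Rpass=loop-vectorize -Rpass-missed=loop-vectorize -c reduce.cpp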

    The next problem is that your description doesn't sound like a true reduction loop. First off, if the array is static, then I don't really know what this is about -- that's a global. But assuming you meant a member array of length 32, that should be equivalent to the following (somewhat simplified) code that is complete and compiles:

    struct V {
      static constexpr int length = 32;
      unsigned short data[32];
    };
    
    int reduce(V &v) {
      int sum = 0;
      for (int i = 0; i < v.length; ++i)
        sum += static_cast<int>(v.data[i]);
      return sum;
    }
    

    Given this code though, because the length is a constant, LLVM fully unrolls the loop, which means no loop vectorization will come into play. Sadly, we actually generate really terrible code for the unrolled loop -- 32 loads and 32 adds. We could do much, much better. I've filed http://llvm.org/PR28090 to track fixing this.
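
    If you want to experiment with keeping the loop form rather than the full unroll, Clang's loop pragmas are one knob to try. I haven't verified whether they change the generated code for this particular constant-count loop, so treat this purely as an experiment:

    struct V {
      static constexpr int length = 32;
      unsigned short data[32];
    };

    int reduce(V &v) {
      int sum = 0;
      // Ask Clang to keep the loop and consider vectorizing it; whether this
      // beats the unrolled code is something you'd have to measure.
      #pragma clang loop vectorize(enable)
      #pragma clang loop unroll(disable)
      for (int i = 0; i < v.length; ++i)
        sum += static_cast<int>(v.data[i]);
      return sum;
    }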

    But this isn't really a reduction loop any more because it has a constant trip count and gets unrolled. If you genuinely have a loop such as the following code:

    struct V {
      int length;
      unsigned short *data;
    };
    
    int reduce(V &v) {
      int sum = 0;
      for (int i = 0; i < v.length; ++i)
        sum += static_cast<int>(v.data[i]);
      return sum;
    }
    

    Then LLVM will actually vectorize this very nicely for Haswell. It loads 8 elements into a vector, extends them to 32-bit values, and sums them. It also does 4 of these vectors at a time to fully use the bandwidth of the architecture. Check out the code here:

    _Z6reduceR1V:                           # @_Z6reduceR1V
      .cfi_startproc
    # BB#0:                                 # %entry
      movslq        (%rdi), %rcx
      xorl          %eax, %eax
      testq         %rcx, %rcx
      jle           .LBB0_11
    # BB#1:                                 # %for.body.lr.ph
      movq          8(%rdi), %rdx
      xorl          %edi, %edi
      movl          $0, %eax
      cmpl          $31, %ecx
      jbe           .LBB0_10
    # BB#2:                                 # %min.iters.checked
      xorl          %edi, %edi
      movq          %rcx, %r9
      movl          $0, %eax
      andq          $-32, %r9
      je            .LBB0_10
    # BB#3:                                 # %vector.body.preheader
      leaq          -32(%r9), %rsi
      shrq          $5, %rsi
      leal          1(%rsi), %r8d
      andl          $1, %r8d
      xorl          %eax, %eax
      testq         %rsi, %rsi
      je            .LBB0_4
    # BB#5:                                 # %vector.body.preheader.new
      leaq          -1(%r8), %rdi
      subq          %rsi, %rdi
      vpxor         %ymm0, %ymm0, %ymm0
      xorl          %eax, %eax
      vpxor         %ymm1, %ymm1, %ymm1
      vpxor         %ymm2, %ymm2, %ymm2
      vpxor         %ymm3, %ymm3, %ymm3
      .p2align      4, 0x90
    .LBB0_6:                                # %vector.body
                                            # =>This Inner Loop Header: Depth=1
      vpmovzxwd     (%rdx,%rax,2), %ymm4    # ymm4 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero
      vpmovzxwd     16(%rdx,%rax,2), %ymm5  # ymm5 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero
      vpmovzxwd     32(%rdx,%rax,2), %ymm6  # ymm6 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero
      vpmovzxwd     48(%rdx,%rax,2), %ymm7  # ymm7 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero
      vpaddd        %ymm0, %ymm4, %ymm0
      vpaddd        %ymm1, %ymm5, %ymm1
      vpaddd        %ymm2, %ymm6, %ymm2
      vpaddd        %ymm3, %ymm7, %ymm3
      vpmovzxwd     64(%rdx,%rax,2), %ymm4  # ymm4 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero
      vpmovzxwd     80(%rdx,%rax,2), %ymm5  # ymm5 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero
      vpmovzxwd     96(%rdx,%rax,2), %ymm6  # ymm6 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero
      vpmovzxwd     112(%rdx,%rax,2), %ymm7 # ymm7 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero
      leaq          64(%rax), %rax
      vpaddd        %ymm0, %ymm4, %ymm0
      vpaddd        %ymm1, %ymm5, %ymm1
      vpaddd        %ymm2, %ymm6, %ymm2
      vpaddd        %ymm3, %ymm7, %ymm3
      addq          $2, %rdi
      jne           .LBB0_6
      jmp           .LBB0_7
    .LBB0_4:
      vpxor         %ymm0, %ymm0, %ymm0
      vpxor         %ymm1, %ymm1, %ymm1
      vpxor         %ymm2, %ymm2, %ymm2
      vpxor         %ymm3, %ymm3, %ymm3
    .LBB0_7:                                # %middle.block.unr-lcssa
      testq         %r8, %r8
      je            .LBB0_9
    # BB#8:                                 # %middle.block.epilog-lcssa
      vpmovzxwd     48(%rdx,%rax,2), %ymm4  # ymm4 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero
      vpaddd        %ymm3, %ymm4, %ymm3
      vpmovzxwd     32(%rdx,%rax,2), %ymm4  # ymm4 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero
      vpaddd        %ymm2, %ymm4, %ymm2
      vpmovzxwd     16(%rdx,%rax,2), %ymm4  # ymm4 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero
      vpaddd        %ymm1, %ymm4, %ymm1
      vpmovzxwd     (%rdx,%rax,2), %ymm4    # ymm4 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero
      vpaddd        %ymm0, %ymm4, %ymm0
    .LBB0_9:                                # %middle.block
      vpaddd        %ymm0, %ymm1, %ymm0
      vpaddd        %ymm0, %ymm2, %ymm0
      vpaddd        %ymm0, %ymm3, %ymm0
      vextracti128  $1, %ymm0, %xmm1
      vpaddd        %ymm1, %ymm0, %ymm0
      vpshufd       $78, %xmm0, %xmm1       # xmm1 = xmm0[2,3,0,1]
      vpaddd        %ymm1, %ymm0, %ymm0
      vphaddd       %ymm0, %ymm0, %ymm0
      vmovd         %xmm0, %eax
      movq          %r9, %rdi
      cmpq          %r9, %rcx
      je            .LBB0_11
      .p2align      4, 0x90
    .LBB0_10:                               # %for.body
                                            # =>This Inner Loop Header: Depth=1
      movzwl        (%rdx,%rdi,2), %esi
      addl          %esi, %eax
      addq          $1, %rdi
      cmpq          %rcx, %rdi
      jl            .LBB0_10
    .LBB0_11:                               # %for.cond.cleanup
      vzeroupper
      retq
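
    If you don't read x86 assembly fluently, here is a rough hand-written equivalent of what that vector body and the final reduction do, using AVX2 intrinsics. This is my own sketch for illustration, not compiler output, and all the names are made up:

    #include <immintrin.h>  // compile with -mavx2 or -march=haswell
    #include <cstdint>

    // Rough equivalent of the vectorized loop above, written with AVX2
    // intrinsics purely for illustration; the compiler's actual output also
    // unrolls this body by two and has a different epilogue structure.
    int reduce_avx2_sketch(const uint16_t *data, int length) {
      // Four independent accumulators, like the four vpxor-initialized ymm registers.
      __m256i acc0 = _mm256_setzero_si256();
      __m256i acc1 = _mm256_setzero_si256();
      __m256i acc2 = _mm256_setzero_si256();
      __m256i acc3 = _mm256_setzero_si256();
      int i = 0;
      for (; i + 32 <= length; i += 32) {
        // vpmovzxwd: load 8 u16 values and zero-extend them to 8 i32 lanes, four times.
        __m256i v0 = _mm256_cvtepu16_epi32(_mm_loadu_si128((const __m128i *)(data + i)));
        __m256i v1 = _mm256_cvtepu16_epi32(_mm_loadu_si128((const __m128i *)(data + i + 8)));
        __m256i v2 = _mm256_cvtepu16_epi32(_mm_loadu_si128((const __m128i *)(data + i + 16)));
        __m256i v3 = _mm256_cvtepu16_epi32(_mm_loadu_si128((const __m128i *)(data + i + 24)));
        // vpaddd: accumulate each vector into its own running sum.
        acc0 = _mm256_add_epi32(acc0, v0);
        acc1 = _mm256_add_epi32(acc1, v1);
        acc2 = _mm256_add_epi32(acc2, v2);
        acc3 = _mm256_add_epi32(acc3, v3);
      }
      // middle.block: combine the four accumulators, then reduce 8 lanes to 1.
      __m256i acc = _mm256_add_epi32(_mm256_add_epi32(acc0, acc1),
                                     _mm256_add_epi32(acc2, acc3));
      __m128i lo = _mm256_castsi256_si128(acc);
      __m128i hi = _mm256_extracti128_si256(acc, 1);
      __m128i r  = _mm_add_epi32(lo, hi);
      r = _mm_add_epi32(r, _mm_shuffle_epi32(r, 0x4E)); // add the two 64-bit halves
      r = _mm_add_epi32(r, _mm_shuffle_epi32(r, 0xB1)); // add the two remaining lanes
      int sum = _mm_cvtsi128_si32(r);
      // Scalar tail, like the %for.body loop at .LBB0_10.
      for (; i < length; ++i)
        sum += data[i];
      return sum;
    }

    The four accumulators are the important design point: independent vpaddd chains let the adds overlap instead of serializing on a single register.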