static void iadd(int &R, Vector &A) {
  unsigned sum = 0;
  int a;
  for (int i = 0; i < A.vector_elements_16; i++) {
    a = static_cast<int>(A.data_16[i]);
    sum += a;
  }
  R = static_cast<int>(sum);
}
The Vector class has a static array of 32 elements of type uint16_t, so the iteration count is 32.
There isn't really enough information in the question to answer it confidently. But I assure you that LLVM and Clang will vectorize reduction loops, at least at the top of tree (I haven't checked what older releases do).
The first problem is that vectorization really depends on the architecture. I'm going to use x86-64 and the Haswell microarchitecture (supporting AVX2) in my examples since you didn't list a particular architecture. I'm happy to update my answer for some other architecture if you specify.
The next problem is that your description doesn't sound like a true reduction loop. First off, if the array is static, then I don't really know what this is about -- that's a global. But assuming you meant a member array of length 32, that should be equivalent to the following (somewhat simplified) code that is complete and compiles:
struct V {
  static constexpr int length = 32;
  unsigned short data[32];
};

int reduce(V &v) {
  int sum = 0;
  for (int i = 0; i < v.length; ++i)
    sum += static_cast<int>(v.data[i]);
  return sum;
}
Given this code though, because the length is a constant, LLVM fully unrolls the loop, which means no loop vectorization will come into play. Sadly, we actually generate really terrible code for the unrolled loop -- 32 loads and 32 adds. We could do much, much better. I've filed http://llvm.org/PR28090 to track fixing this.
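If you want to experiment with keeping the loop form here rather than letting it be fully unrolled, one knob to try is Clang's loop pragmas. These are only optimization hints, and I haven't verified that they produce better code for this exact case, so treat the following (the name reduce_hinted is just mine) as a sketch:
int reduce_hinted(V &v) {
  int sum = 0;
  // Ask the optimizer to keep and vectorize the loop instead of fully
  // unrolling it. These pragmas are requests, not guarantees.
#pragma clang loop vectorize(enable)
#pragma clang loop unroll(disable)
  for (int i = 0; i < v.length; ++i)
    sum += static_cast<int>(v.data[i]);
  return sum;
}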
But this isn't really a reduction loop any more because it has a constant trip count and gets fully unrolled. If you genuinely have a loop such as the following code:
struct V {
  int length;
  unsigned short *data;
};

int reduce(V &v) {
  int sum = 0;
  for (int i = 0; i < v.length; ++i)
    sum += static_cast<int>(v.data[i]);
  return sum;
}
Then LLVM will actually vectorize this very nicely for Haswell. It loads 8 elements into a vector, extends them to 32-bit values, and sums them. It also processes 4 of these vectors at a time to make full use of the architecture's bandwidth. Check out the code here (a rough intrinsics sketch of the same pattern follows after the listing):
_Z6reduceR1V: # @_Z6reduceR1V
.cfi_startproc
# BB#0: # %entry
movslq (%rdi), %rcx
xorl %eax, %eax
testq %rcx, %rcx
jle .LBB0_11
# BB#1: # %for.body.lr.ph
movq 8(%rdi), %rdx
xorl %edi, %edi
movl $0, %eax
cmpl $31, %ecx
jbe .LBB0_10
# BB#2: # %min.iters.checked
xorl %edi, %edi
movq %rcx, %r9
movl $0, %eax
andq $-32, %r9
je .LBB0_10
# BB#3: # %vector.body.preheader
leaq -32(%r9), %rsi
shrq $5, %rsi
leal 1(%rsi), %r8d
andl $1, %r8d
xorl %eax, %eax
testq %rsi, %rsi
je .LBB0_4
# BB#5: # %vector.body.preheader.new
leaq -1(%r8), %rdi
subq %rsi, %rdi
vpxor %ymm0, %ymm0, %ymm0
xorl %eax, %eax
vpxor %ymm1, %ymm1, %ymm1
vpxor %ymm2, %ymm2, %ymm2
vpxor %ymm3, %ymm3, %ymm3
.p2align 4, 0x90
.LBB0_6: # %vector.body
# =>This Inner Loop Header: Depth=1
vpmovzxwd (%rdx,%rax,2), %ymm4 # ymm4 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero
vpmovzxwd 16(%rdx,%rax,2), %ymm5 # ymm5 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero
vpmovzxwd 32(%rdx,%rax,2), %ymm6 # ymm6 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero
vpmovzxwd 48(%rdx,%rax,2), %ymm7 # ymm7 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero
vpaddd %ymm0, %ymm4, %ymm0
vpaddd %ymm1, %ymm5, %ymm1
vpaddd %ymm2, %ymm6, %ymm2
vpaddd %ymm3, %ymm7, %ymm3
vpmovzxwd 64(%rdx,%rax,2), %ymm4 # ymm4 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero
vpmovzxwd 80(%rdx,%rax,2), %ymm5 # ymm5 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero
vpmovzxwd 96(%rdx,%rax,2), %ymm6 # ymm6 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero
vpmovzxwd 112(%rdx,%rax,2), %ymm7 # ymm7 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero
leaq 64(%rax), %rax
vpaddd %ymm0, %ymm4, %ymm0
vpaddd %ymm1, %ymm5, %ymm1
vpaddd %ymm2, %ymm6, %ymm2
vpaddd %ymm3, %ymm7, %ymm3
addq $2, %rdi
jne .LBB0_6
jmp .LBB0_7
.LBB0_4:
vpxor %ymm0, %ymm0, %ymm0
vpxor %ymm1, %ymm1, %ymm1
vpxor %ymm2, %ymm2, %ymm2
vpxor %ymm3, %ymm3, %ymm3
.LBB0_7: # %middle.block.unr-lcssa
testq %r8, %r8
je .LBB0_9
# BB#8: # %middle.block.epilog-lcssa
vpmovzxwd 48(%rdx,%rax,2), %ymm4 # ymm4 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero
vpaddd %ymm3, %ymm4, %ymm3
vpmovzxwd 32(%rdx,%rax,2), %ymm4 # ymm4 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero
vpaddd %ymm2, %ymm4, %ymm2
vpmovzxwd 16(%rdx,%rax,2), %ymm4 # ymm4 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero
vpaddd %ymm1, %ymm4, %ymm1
vpmovzxwd (%rdx,%rax,2), %ymm4 # ymm4 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero
vpaddd %ymm0, %ymm4, %ymm0
.LBB0_9: # %middle.block
vpaddd %ymm0, %ymm1, %ymm0
vpaddd %ymm0, %ymm2, %ymm0
vpaddd %ymm0, %ymm3, %ymm0
vextracti128 $1, %ymm0, %xmm1
vpaddd %ymm1, %ymm0, %ymm0
vpshufd $78, %xmm0, %xmm1 # xmm1 = xmm0[2,3,0,1]
vpaddd %ymm1, %ymm0, %ymm0
vphaddd %ymm0, %ymm0, %ymm0
vmovd %xmm0, %eax
movq %r9, %rdi
cmpq %r9, %rcx
je .LBB0_11
.p2align 4, 0x90
.LBB0_10: # %for.body
# =>This Inner Loop Header: Depth=1
movzwl (%rdx,%rdi,2), %esi
addl %esi, %eax
addq $1, %rdi
cmpq %rcx, %rdi
jl .LBB0_10
.LBB0_11: # %for.cond.cleanup
vzeroupper
retq
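To make the assembly easier to follow, here is a rough, hand-written AVX2 intrinsics sketch of what the vectorized main loop and the final horizontal reduction are doing. This is an illustration of the pattern, not the compiler's exact code, and the name reduce_avx2 is just mine:
#include <immintrin.h>

// Hand-written approximation of the vectorized loop above (illustrative
// only). Requires AVX2, e.g. compiled with -march=haswell.
int reduce_avx2(const unsigned short *data, int length) {
  __m256i acc0 = _mm256_setzero_si256();
  __m256i acc1 = _mm256_setzero_si256();
  __m256i acc2 = _mm256_setzero_si256();
  __m256i acc3 = _mm256_setzero_si256();
  int i = 0;
  // Main loop: 4 vectors of 8 uint16_t elements per iteration, each
  // zero-extended to 32 bits (vpmovzxwd) and accumulated (vpaddd).
  for (; i + 32 <= length; i += 32) {
    __m256i v0 = _mm256_cvtepu16_epi32(_mm_loadu_si128((const __m128i *)(data + i)));
    __m256i v1 = _mm256_cvtepu16_epi32(_mm_loadu_si128((const __m128i *)(data + i + 8)));
    __m256i v2 = _mm256_cvtepu16_epi32(_mm_loadu_si128((const __m128i *)(data + i + 16)));
    __m256i v3 = _mm256_cvtepu16_epi32(_mm_loadu_si128((const __m128i *)(data + i + 24)));
    acc0 = _mm256_add_epi32(acc0, v0);
    acc1 = _mm256_add_epi32(acc1, v1);
    acc2 = _mm256_add_epi32(acc2, v2);
    acc3 = _mm256_add_epi32(acc3, v3);
  }
  // Horizontal reduction: combine the four accumulators, then collapse
  // the eight 32-bit lanes down to a single scalar.
  __m256i acc = _mm256_add_epi32(_mm256_add_epi32(acc0, acc1),
                                 _mm256_add_epi32(acc2, acc3));
  __m128i lanes = _mm_add_epi32(_mm256_castsi256_si128(acc),
                                _mm256_extracti128_si256(acc, 1));
  lanes = _mm_add_epi32(lanes, _mm_shuffle_epi32(lanes, 0x4E)); // swap 64-bit halves
  lanes = _mm_add_epi32(lanes, _mm_shuffle_epi32(lanes, 0xB1)); // swap adjacent lanes
  int sum = _mm_cvtsi128_si32(lanes);
  // Scalar tail for any remaining elements (the compiler's version also
  // unrolls the vector body and has its own epilogue blocks).
  for (; i < length; ++i)
    sum += data[i];
  return sum;
}
The _mm256_cvtepu16_epi32 calls correspond to the vpmovzxwd loads and _mm256_add_epi32 to vpaddd; the final reduction here uses plain shuffles where the compiler mixes in vextracti128, vpshufd, and vphaddd, but the effect is the same.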