Tags: c, gcc, vectorization, simd, sse

Structure of SSE vectorization calls for summing vector of floats


This question was prompted by the recent question "Why won't simple code get auto-vectorized with SSE and AVX in modern compilers?". Not having delved into SSE intrinsics before, the question arose: how best to structure the loads, adds and stores for summing larger arrays, given that several naive approaches seemed available.

Take the same arrays of floats from the linked question above, but longer, so that the loads and adds must continually update the solution array to arrive at the final sums.

First Thought On Extending

My first thought on extending this to get a final sum for a larger array was simply to create a temporary array of 4 floats to hold the result of each 128-bit operation, and then add that to the current solution array, keeping a running total within a loop in the caller, e.g.

#include <stdio.h>
#include <xmmintrin.h>

void compute (const float *a, const float *b, float *c)
{
    __m128 va = _mm_loadu_ps(a);      /* load 4 floats from a (unaligned OK) */
    __m128 vb = _mm_loadu_ps(b);      /* load 4 floats from b */
    __m128 vc = _mm_add_ps(va, vb);   /* packed single-precision add */
    _mm_storeu_ps(c, vc);             /* store the 4 sums to c */
}

int main (void) {

  float a[] = { 1.1, 2.2, 3.3, 4.4, 1.1, 2.2, 3.3, 4.4 },
        b[] = { 1.1, 2.2, 3.3, 4.4, 1.1, 2.2, 3.3, 4.4 },
        c[4] = { 0 };

  for (int i = 0; i < 8; i += 4) {
    float tmp[4] = { 0 };
    compute (a + i, b + i, tmp);
    compute (c, tmp, c);
  }

  for (int i = 0; i < 4; i++) {
    printf ("c[%d]: %5.2f\n", i, c[i]);
  }
}

My concern was that this added the overhead of 4 calls to the compute() function, so a second naive approach was to extend compute() to handle the update of the solution array c within the function itself, which led to my:

Second Thought on Extending

#include <stdio.h>
#include <xmmintrin.h>

void compute (const float *a, const float *b, float *c)
{
    __m128 va = _mm_loadu_ps(a);      /* load 4 floats from a */
    __m128 vb = _mm_loadu_ps(b);      /* load 4 floats from b */
    __m128 vt = _mm_add_ps(va, vb);   /* a + b */

    __m128 vc = _mm_loadu_ps(c);      /* load the running total from c */
    __m128 vr = _mm_add_ps(vt, vc);   /* add (a + b) into the running total */

    _mm_storeu_ps(c, vr);             /* store the updated total back to c */
}

int main (void) {
  
  float a[] = { 1.1, 2.2, 3.3, 4.4, 1.1, 2.2, 3.3, 4.4 },
        b[] = { 1.1, 2.2, 3.3, 4.4, 1.1, 2.2, 3.3, 4.4 },
        c[4] = { 0 };

  for (int i = 0; i < 8; i += 4) {
    compute (a + i, b + i, c);
  }
  
  for (int i = 0; i < 4; i++) {
    printf ("c[%d]: %5.2f\n", i, c[i]);
  }
}

Which raised the question: do the changes even matter?

Surprisingly (or maybe not, for those more experienced with vectorization), the assembly produced for the two was virtually identical.

Assembly for the 1st Attempt

        .file   "sse-xmm-test-2.c"
        .intel_syntax noprefix
        .text
        .p2align 4
        .globl  compute
        .type   compute, @function
compute:
.LFB529:
        .cfi_startproc
        vmovups xmm0, XMMWORD PTR [rsi]
        vaddps  xmm0, xmm0, XMMWORD PTR [rdi]
        vmovups XMMWORD PTR [rdx], xmm0
        ret
        .cfi_endproc
.LFE529:
        .size   compute, .-compute
        .section        .rodata.str1.1,"aMS",@progbits,1
.LC1:
        .string "c[%d]: %5.2f\n"
        .section        .text.startup,"ax",@progbits
        .p2align 4
        .globl  main
        .type   main, @function
main:
.LFB530:
        .cfi_startproc
        push    rbp
        .cfi_def_cfa_offset 16
        .cfi_offset 6, -16
        push    rbx
        .cfi_def_cfa_offset 24
        .cfi_offset 3, -24
        xor     ebx, ebx
        sub     rsp, 24
        .cfi_def_cfa_offset 48
        vmovaps xmm0, XMMWORD PTR .LC0[rip]
        mov     rbp, rsp
        vmovaps XMMWORD PTR [rsp], xmm0
.L4:
        mov     esi, ebx
        vxorpd  xmm1, xmm1, xmm1
        mov     edi, OFFSET FLAT:.LC1
        vcvtss2sd       xmm0, xmm1, DWORD PTR [rbp+0+rbx*4]
        mov     eax, 1
        add     rbx, 1
        call    printf
        cmp     rbx, 4
        jne     .L4
        add     rsp, 24
        .cfi_def_cfa_offset 24
        xor     eax, eax
        pop     rbx
        .cfi_def_cfa_offset 16
        pop     rbp
        .cfi_def_cfa_offset 8
        ret
        .cfi_endproc
.LFE530:
        .size   main, .-main
        .section        .rodata.cst16,"aM",@progbits,16
        .align 16
.LC0:
        .long   1082969293
        .long   1091357901
        .long   1095971635
        .long   1099746509
        .ident  "GCC: (SUSE Linux) 14.2.0"
        .section        .note.GNU-stack,"",@progbits

And the assembly for the second approach, which adds an additional vaddps for the running sum in compute() but is otherwise virtually identical:

        .file   "sse-xmm-test-3.c"
        .intel_syntax noprefix
        .text
        .p2align 4
        .globl  compute
        .type   compute, @function
compute:
.LFB529:
        .cfi_startproc
        vmovups xmm0, XMMWORD PTR [rdx]
        vaddps  xmm0, xmm0, XMMWORD PTR [rsi]
        vaddps  xmm0, xmm0, XMMWORD PTR [rdi]
        vmovups XMMWORD PTR [rdx], xmm0
        ret
        .cfi_endproc
.LFE529:
        .size   compute, .-compute
        .section        .rodata.str1.1,"aMS",@progbits,1
.LC1:
        .string "c[%d]: %5.2f\n"
        .section        .text.startup,"ax",@progbits
        .p2align 4
        .globl  main
        .type   main, @function
main:
.LFB530:
        .cfi_startproc
        push    rbp
        .cfi_def_cfa_offset 16
        .cfi_offset 6, -16
        push    rbx
        .cfi_def_cfa_offset 24
        .cfi_offset 3, -24
        xor     ebx, ebx
        sub     rsp, 24
        .cfi_def_cfa_offset 48
        vmovaps xmm0, XMMWORD PTR .LC0[rip]
        mov     rbp, rsp
        vmovaps XMMWORD PTR [rsp], xmm0
.L4:
        mov     esi, ebx
        vxorpd  xmm1, xmm1, xmm1
        mov     edi, OFFSET FLAT:.LC1
        vcvtss2sd       xmm0, xmm1, DWORD PTR [rbp+0+rbx*4]
        mov     eax, 1
        add     rbx, 1
        call    printf
        cmp     rbx, 4
        jne     .L4
        add     rsp, 24
        .cfi_def_cfa_offset 24
        xor     eax, eax
        pop     rbx
        .cfi_def_cfa_offset 16
        pop     rbp
        .cfi_def_cfa_offset 8
        ret
        .cfi_endproc
.LFE530:
        .size   main, .-main
        .section        .rodata.cst16,"aM",@progbits,16
        .align 16
.LC0:
        .long   1082969293
        .long   1091357901
        .long   1095971635
        .long   1099746509
        .ident  "GCC: (SUSE Linux) 14.2.0"
        .section        .note.GNU-stack,"",@progbits

So other than the second approach's compute() loading from rdx and adding an extra vaddps with an rsi memory source, the rest appears to be optimized identically.

What I hoped to find was that the compiler could better optimize one approach or the other, but it appears to be a wash.

Beyond comparing the assembly produced, is there any general principle for summing SSE vectors that would favor one of these approaches over the other?


Solution

  • There are no vaddps instructions (or call compute) in either main; constant-propagation did all the addition at compile-time for your tiny arrays with compile-time-constant inputs.

    So we can't tell from that how well GCC or Clang would do at optimizing away tmp[] and the extra stores/reloads you're introducing. It's not unreasonable to hope that they would manage to see through this and not make bad asm like you'd get from a compiler taking all those _mm_storeu_ps intrinsics literally (I'd worry about MSVC or ICC here, although maybe they can also optimize loads and stores).

    Normally you'd just return a __m128, or use intrinsics and __m128 variables inside a loop, instead of trying to factor out a compute function whose API was just pointers.
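
    For instance, a minimal sketch along those lines (the helper name compute_v128 is made up here, not from the question): returning the vector by value lets the caller keep the running total in a __m128 register instead of bouncing it through memory.

    static inline __m128 compute_v128 (const float *a, const float *b)
    {
        /* a + b, four floats at a time; the result stays in a register */
        return _mm_add_ps(_mm_loadu_ps(a), _mm_loadu_ps(b));
    }

    A caller would then accumulate with something like vsum = _mm_add_ps(vsum, compute_v128(a + i, b + i)); inside its loop and store vsum once after the loop.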


    Wrapping the loop from your main in a more useful function, we can see how it compiles for runtime-variable inputs. For example, here is the wrapper for your first version:

    void foo(const float *restrict a, const float *restrict b, float *restrict c)
    {
      for (int i = 0; i < 8; i += 4) {
        float tmp[4] = { 0 };  // optimized away, no stack space used
        compute (a + i, b + i, tmp);
        compute (c, tmp, c);
      }
    }
    

    Godbolt, compiled with GCC14.2 -O3 -march=x86-64-v3 to allow AVX for more compact code with unaligned memory-source vaddps, not needing separate movups instructions like we get for baseline x86-64 (SSE2).

    compute:
            vmovups xmm0, XMMWORD PTR [rsi]
            vaddps  xmm0, xmm0, XMMWORD PTR [rdi]
            vmovups XMMWORD PTR [rdx], xmm0
            ret
    foo:                        ## Your first version
            vmovups xmm0, XMMWORD PTR [rsi]
            vmovups xmm1, XMMWORD PTR [rsi+16]
            vaddps  xmm0, xmm0, XMMWORD PTR [rdi]
            vaddps  xmm1, xmm1, XMMWORD PTR [rdi+16]
            vaddps  xmm0, xmm0, XMMWORD PTR [rdx]
            vaddps  xmm0, xmm0, xmm1
            vmovups XMMWORD PTR [rdx], xmm0
            ret
    
    compute_v2:
            vmovups xmm0, XMMWORD PTR [rsi]
            vaddps  xmm0, xmm0, XMMWORD PTR [rdi]  # add a with b
            vaddps  xmm0, xmm0, XMMWORD PTR [rdx]  # add with c
            vmovups XMMWORD PTR [rdx], xmm0        # and store back into c[]
            ret
    bar:             ## Your second version
           identical asm to foo
    

    So both compile the same way: the compilers were able to see through all the loads and stores and keep tmp[] and c[] in XMM vector registers.

    Both include useless adds into c[] instead of just writing it as a pure destination. Adding to +0.0 isn't a no-op in IEEE FP semantics (-0.0 + +0.0 gives +0.0). With -ffast-math, if c[] = {0} was visible at compile time, that addition could be optimized away. But as the function is written, float *c is read then written and could be holding anything. I guess I could introduce another temporary sum which we copy into the actual c[] after the loop. (In which case we might not even need restrict, since we'd only be reading pointed-to memory and writing private local vars whose address hasn't escaped.)
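
    As a sketch of that separate-temporary-sum idea (hypothetical function name, assuming the same 8-element inputs): the accumulator is a private __m128, so c[] becomes a pure destination written exactly once.

    void sum8 (const float *a, const float *b, float *c)
    {
        __m128 vsum = _mm_setzero_ps();     /* private running total */
        for (int i = 0; i < 8; i += 4) {
            __m128 vt = _mm_add_ps(_mm_loadu_ps(a + i), _mm_loadu_ps(b + i));
            vsum = _mm_add_ps(vsum, vt);    /* accumulate in a register */
        }
        _mm_storeu_ps(c, vsum);             /* c[] is only written, never read */
    }

    The first iteration still adds into the zeroed accumulator; the peeling described next removes even that.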

    To remove the redundant add in the first version, you could manually peel the first loop iteration as compute (a + 0, b + 0, c + 0); to write directly into c[]. Then later additions into c inside the loop would be what you want, like vector_sum += load(a+i) + load(b+i).
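
    In code, that peeling could look like this, reusing the question's first compute():

    compute (a + 0, b + 0, c);          /* c = a[0..3] + b[0..3], written directly */
    for (int i = 4; i < 8; i += 4) {
        float tmp[4];
        compute (a + i, b + i, tmp);    /* tmp = a[i..i+3] + b[i..i+3] */
        compute (c, tmp, c);            /* c += tmp, the running total you do want */
    }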


    Of course without -ffast-math, even clang isn't going to be able to use more vector accumulators to hide FP latency for larger arrays, so you'll still bottleneck on at best 1 addps per 3 or 4 cycles, instead of the 2/clock throughput for loads and adds on typical CPUs (requiring at least 6 or 8 vector accumulators, the latency x throughput product, which you sum down to 1 vector after the loop). Same as for a dot-product but with add instead of FMA: see Why does mulss take only 3 cycles on Haswell, different from Agner's instruction tables? (Unrolling FP loops with multiple accumulators)
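
    For reference, a hand-unrolled sketch of that multiple-accumulator idea (hypothetical name; four accumulators shown for brevity, although the numbers above call for 6 to 8; assumes n is a multiple of 16). Doing this by hand changes the order of the FP additions, which is exactly the reordering that -ffast-math would license the compiler to do on its own.

    void sum_multi_acc (const float *a, const float *b, float *c, int n)
    {
        /* four independent dependency chains so vaddps latency can overlap */
        __m128 s0 = _mm_setzero_ps(), s1 = _mm_setzero_ps();
        __m128 s2 = _mm_setzero_ps(), s3 = _mm_setzero_ps();

        for (int i = 0; i < n; i += 16) {
            s0 = _mm_add_ps(s0, _mm_add_ps(_mm_loadu_ps(a + i),      _mm_loadu_ps(b + i)));
            s1 = _mm_add_ps(s1, _mm_add_ps(_mm_loadu_ps(a + i + 4),  _mm_loadu_ps(b + i + 4)));
            s2 = _mm_add_ps(s2, _mm_add_ps(_mm_loadu_ps(a + i + 8),  _mm_loadu_ps(b + i + 8)));
            s3 = _mm_add_ps(s3, _mm_add_ps(_mm_loadu_ps(a + i + 12), _mm_loadu_ps(b + i + 12)));
        }

        /* sum the accumulators down to one vector after the loop */
        _mm_storeu_ps(c, _mm_add_ps(_mm_add_ps(s0, s1), _mm_add_ps(s2, s3)));
    }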

    Normally for summing an array you'd reduce the final 4- or 8-element vector down to a scalar. Your c[] elements hold the sum of every 4th element of a[]+b[], which is fine if you're just looking at the vertical-SIMD part of summing an array; the final horizontal sum is a separate problem.
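
    If you do need that final horizontal sum, a plain-SSE sketch (the helper name hsum_ps is made up here) could look like:

    static inline float hsum_ps (__m128 v)
    {
        __m128 shuf = _mm_shuffle_ps(v, v, _MM_SHUFFLE(2, 3, 0, 1)); /* swap element pairs    */
        __m128 sums = _mm_add_ps(v, shuf);                           /* pairwise partial sums */
        shuf = _mm_movehl_ps(shuf, sums);                            /* bring high pair down  */
        sums = _mm_add_ss(sums, shuf);                               /* low element = total   */
        return _mm_cvtss_f32(sums);
    }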