performance assembly optimization x86 clang++

Counting differences between 2 buffers seems too slow

My problem

I have 2 adjacent buffers of bytes of identical size (around 20 MB each). I just want to count the differences between them.

My question

How much time this loop should take to run on a 4.8GHz Intel I7 9700K with 3600MT RAM ?

How do we compute max theoretical speed ?

What I tried

uint64_t compareFunction(const char *const __restrict buffer, const uint64_t commonSize)
{
    uint64_t diffFound = 0;

    for(uint64_t byte = 0; byte < commonSize; ++byte)
        diffFound += static_cast<uint64_t>(buffer[byte] != buffer[byte + commonSize]);

    return diffFound;
}

It takes 11ms on my PC (9700K 4.8Ghz RAM 3600 Windows 10 Clang 14.0.6 -O3 MinGW ) and I feel it is too slow and that I am missing something.

40MB should take less than 2ms to be read on the CPU (my RAM bandwidth is between 20 and 30GB/s)

I don't know how to count cycles required to execute one iteration (especially because CPUs are superscalar nowadays). If I assume 1 cycle per operation and if I don't mess up my counting, it should be 10 ops per iteration -> 200 million ops -> at 4.8 Ghz with only one execution unit -> 40ms. Obviously I am wrong on how to compute the number of cycles per loop.

Fun fact: I tried on Linux PopOS GCC 11.2 -O3 and it ran at 4.5ms. Why such a difference?

Here are the dissassemblies vectorised and scalar produced by clang:

compareFunction(char const*, unsigned long): # @compareFunction(char const*, unsigned long)
        test    rsi, rsi
        je      .LBB0_1
        lea     r8, [rdi + rsi]
        neg     rsi
        xor     edx, edx
        xor     eax, eax
.LBB0_4:                                # =>This Inner Loop Header: Depth=1
        movzx   r9d, byte ptr [rdi + rdx]
        xor     ecx, ecx
        cmp     r9b, byte ptr [r8 + rdx]
        setne   cl
        add     rax, rcx
        add     rdx, 1
        mov     rcx, rsi
        add     rcx, rdx
        jne     .LBB0_4
        ret
.LBB0_1:
        xor     eax, eax
        ret

Clang14 O3:

.LCPI0_0:
        .quad   1                               # 0x1
        .quad   1                               # 0x1
compareFunction(char const*, unsigned long):                # @compareFunction(char const*, unsigned long)
        test    rsi, rsi
        je      .LBB0_1
        cmp     rsi, 4
        jae     .LBB0_4
        xor     r9d, r9d
        xor     eax, eax
        jmp     .LBB0_11
.LBB0_1:
        xor     eax, eax
        ret
.LBB0_4:
        mov     r9, rsi
        and     r9, -4
        lea     rax, [r9 - 4]
        mov     r8, rax
        shr     r8, 2
        add     r8, 1
        test    rax, rax
        je      .LBB0_5
        mov     rdx, r8
        and     rdx, -2
        lea     r10, [rdi + 6]
        lea     r11, [rdi + rsi]
        add     r11, 6
        pxor    xmm0, xmm0
        xor     eax, eax
        pcmpeqd xmm2, xmm2
        movdqa  xmm3, xmmword ptr [rip + .LCPI0_0] # xmm3 = [1,1]
        pxor    xmm1, xmm1
.LBB0_7:                                # =>This Inner Loop Header: Depth=1
        movzx   ecx, word ptr [r10 + rax - 6]
        movd    xmm4, ecx
        movzx   ecx, word ptr [r10 + rax - 4]
        movd    xmm5, ecx
        movzx   ecx, word ptr [r11 + rax - 6]
        movd    xmm6, ecx
        pcmpeqb xmm6, xmm4
        movzx   ecx, word ptr [r11 + rax - 4]
        movd    xmm7, ecx
        pcmpeqb xmm7, xmm5
        pxor    xmm6, xmm2
        punpcklbw       xmm6, xmm6              # xmm6 = xmm6[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
        pshuflw xmm4, xmm6, 212                 # xmm4 = xmm6[0,1,1,3,4,5,6,7]
        pshufd  xmm4, xmm4, 212                 # xmm4 = xmm4[0,1,1,3]
        pand    xmm4, xmm3
        paddq   xmm4, xmm0
        pxor    xmm7, xmm2
        punpcklbw       xmm7, xmm7              # xmm7 = xmm7[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
        pshuflw xmm0, xmm7, 212                 # xmm0 = xmm7[0,1,1,3,4,5,6,7]
        pshufd  xmm5, xmm0, 212                 # xmm5 = xmm0[0,1,1,3]
        pand    xmm5, xmm3
        paddq   xmm5, xmm1
        movzx   ecx, word ptr [r10 + rax - 2]
        movd    xmm0, ecx
        movzx   ecx, word ptr [r10 + rax]
        movd    xmm1, ecx
        movzx   ecx, word ptr [r11 + rax - 2]
        movd    xmm6, ecx
        pcmpeqb xmm6, xmm0
        movzx   ecx, word ptr [r11 + rax]
        movd    xmm7, ecx
        pcmpeqb xmm7, xmm1
        pxor    xmm6, xmm2
        punpcklbw       xmm6, xmm6              # xmm6 = xmm6[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
        pshuflw xmm0, xmm6, 212                 # xmm0 = xmm6[0,1,1,3,4,5,6,7]
        pshufd  xmm0, xmm0, 212                 # xmm0 = xmm0[0,1,1,3]
        pand    xmm0, xmm3
        paddq   xmm0, xmm4
        pxor    xmm7, xmm2
        punpcklbw       xmm7, xmm7              # xmm7 = xmm7[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
        pshuflw xmm1, xmm7, 212                 # xmm1 = xmm7[0,1,1,3,4,5,6,7]
        pshufd  xmm1, xmm1, 212                 # xmm1 = xmm1[0,1,1,3]
        pand    xmm1, xmm3
        paddq   xmm1, xmm5
        add     rax, 8
        add     rdx, -2
        jne     .LBB0_7
        test    r8b, 1
        je      .LBB0_10
.LBB0_9:
        movzx   ecx, word ptr [rdi + rax]
        movd    xmm2, ecx
        movzx   ecx, word ptr [rdi + rax + 2]
        movd    xmm3, ecx
        add     rax, rsi
        movzx   ecx, word ptr [rdi + rax]
        movd    xmm4, ecx
        pcmpeqb xmm4, xmm2
        movzx   eax, word ptr [rdi + rax + 2]
        movd    xmm2, eax
        pcmpeqb xmm2, xmm3
        pcmpeqd xmm3, xmm3
        pxor    xmm4, xmm3
        punpcklbw       xmm4, xmm4              # xmm4 = xmm4[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
        pshuflw xmm4, xmm4, 212                 # xmm4 = xmm4[0,1,1,3,4,5,6,7]
        pshufd  xmm4, xmm4, 212                 # xmm4 = xmm4[0,1,1,3]
        movdqa  xmm5, xmmword ptr [rip + .LCPI0_0] # xmm5 = [1,1]
        pand    xmm4, xmm5
        paddq   xmm0, xmm4
        pxor    xmm2, xmm3
        punpcklbw       xmm2, xmm2              # xmm2 = xmm2[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
        pshuflw xmm2, xmm2, 212                 # xmm2 = xmm2[0,1,1,3,4,5,6,7]
        pshufd  xmm2, xmm2, 212                 # xmm2 = xmm2[0,1,1,3]
        pand    xmm2, xmm5
        paddq   xmm1, xmm2
.LBB0_10:
        paddq   xmm0, xmm1
        pshufd  xmm1, xmm0, 238                 # xmm1 = xmm0[2,3,2,3]
        paddq   xmm1, xmm0
        movq    rax, xmm1
        cmp     r9, rsi
        je      .LBB0_13
.LBB0_11:
        lea     r8, [r9 + rsi]
        sub     rsi, r9
        add     r8, rdi
        add     rdi, r9
        xor     edx, edx
.LBB0_12:                               # =>This Inner Loop Header: Depth=1
        movzx   r9d, byte ptr [rdi + rdx]
        xor     ecx, ecx
        cmp     r9b, byte ptr [r8 + rdx]
        setne   cl
        add     rax, rcx
        add     rdx, 1
        cmp     rsi, rdx
        jne     .LBB0_12
.LBB0_13:
        ret
.LBB0_5:
        pxor    xmm0, xmm0
        xor     eax, eax
        pxor    xmm1, xmm1
        test    r8b, 1
        jne     .LBB0_9
        jmp     .LBB0_10

Solution

TLDR: the reason why the Clang code is so slow comes from a poor vectorization method saturating the port 5 (known to be often an issue). GCC does a better job here, but it is still far from being efficient. One can write a much faster chunk-based code using AVX-2 not saturating the port 5.

Analysis of the unvectorized Clang code

To understand what is going on it is better to start with a simple example. Indeed, as you said, modern processor are superscalar so it is not easy to understand the speed of some generated code on such architecture.

The code generated by Clang using the -O1 optimization flag is a good start. Here is the code of the hot loop produced by GodBold provided in your question:

(instructions)                                 (ports)

.LBB0_4:
        movzx   r9d, byte ptr [rdi + rdx]      p23
        xor     ecx, ecx                       p0156
        cmp     r9b, byte ptr [r8 + rdx]       p0156+p23
        setne   cl                             p06
        add     rax, rcx                       p0156
        add     rdx, 1                         p0156
        mov     rcx, rsi                       (optimized)
        add     rcx, rdx                       p0156
        jne     .LBB0_4                        p06

Modern processors like the Coffee Lake 9700K are structured in two big parts: a front-end fetching/decoding the instructions (and splitting them into micro-instructions, aka. uops), and a back-end scheduling/executing them. The back-end schedule the uops on many ports and each of them can execute some specific sets of instructions (eg. only memory load, or only arithmetic instruction). For each instruction, I put the ports that can execute them. p0156+p23 means the instruction is split in two uops: the first can be executed by the ports 0 or 1 or 5 or 6, and the second can be executed by the ports 2 or 3. Note that the front-end can somehow optimize the code so not to produce any uops for basic instructions like the mov in the loop (thanks to a mechanism called register renaming).

For each loop iteration, the processor needs to read 2 value from memory. A Coffee Lake processor like the 9700K can load two values per cycle so the loop will at least take 1 cycle/iteration (assuming the loads in r9d and r9b does not conflict due to the use of different part of the same r9 64-bit register). This processor has a uops cache and the loop has a lot of instructions so the decoding part should not be a problem. That being said, there is 9 uops to execute and the processor can only execute 6 of them per cycle so the loop cannot take less than 1.5 cycle/iteration. More precisely, the ports 0, 1, 5 and 6 are under pressure, so even assuming the processor perfectly load balance the uops, 2 cycle/iterations are needed. This is an optimistic lower-bound execution time since the processor may not perfectly schedule the instruction and there are many things that could possibly go wrong (like a sneaky hidden dependency I did not see). With a frequency of 4.8GHz, the final execution time is at least 8.3 ms. It can reach 12.5 ms with 3 cycle/iteration (note that 2.5 cycle/iteration is possible due to the scheduling of uops to ports).

The loop can be improved using unrolling. Indeed, a significant number of instructions are needed just to do the loop and not the actual computation. Unrolling can help to increase the ratio of useful instructions so to make a better usage of available ports. Still, the 2 loads prevent the loop to be faster than 1 cycle/iteration, that is 4.2 ms.

Analysis of the vectorized Clang code

The vectorized code generated by Clang is complex. One could try to apply the same analysis than in the previous code but it would be a tedious task.

One can note that even though the code is vectorized, the loads are not vectorized. This is an issue since only 2 loads can be done per cycle. That being said, loads are performed by pairs two contiguous char values so loads are not so slow compared to the previously generated code.

Clang does that since only two 64-bit values can fit in a 128-bit SSE register and a 64-bit and it needs to do that because diffFound is a 64-bit integer. The 8-bit to 64-bit conversion is the biggest issue in the code because it requires several SSE instructions to do the conversion. Moreover, only 4 integers can be computed at a time since there is 3 SSE integer units on Coffee Lake and each of them can only compute two 64-bit integers at a time. In the end, Clang only put 2 values in each SSE register (and use 4 of them so to compute 8 items per loop iteration) so one should expect a code running more than twice faster (especially due to SSE and the loop unrolling), but this is not much the case due to fewer SSE ports than ALU ports and a more instructions required for the type conversions. Put it shortly, the vectorization is clearly inefficient, but this is not so easy for Clang to generate an efficient code in this case. Still, with 28 SSE instructions and 3 SSE integer units computing 8 items per loop, one should expect the computing part of the code to take about 28/3/8 ~= 1.2 cycle/item which is far from what you can observe (and this is not due to other instruction since they can mostly be executed in parallel as they can mostly be scheduled on other ports).

In fact, the performance issue certainly comes from the saturation of the port 5. Indeed, this port is the only one that can shuffle items of SIMD registers. Thus, the instructions punpcklbw, pshuflw, pshufd and even the movd can only be executed on the port 5. This is a pretty common issue with SIMD codes. This is a big issue since there is 20 instructions per loop and the processor may not even use it perfectly. This means the code should take at least 10.4 ms which is very close to the observed execution time (11 ms).

Analysis of the vectorized GCC code

The code generated by GCC is actually pretty good compared to the one of Clang. Firstly, GCC loads items using SIMD instruction directly which is much more efficient as 16 items are computed per instruction (and by iteration): it only need 2 load uops per iteration reducing the pressure on the port 2 and 3 (1 cycle/iteration for that, so 0.0625 cycle/item). Secondly, GCC only uses 14 punpckhwd instructions while each iteration compute 16 items, reducing critical pressure on the port 5 (0.875 cycle/item for that). Thirdly, the SIMD registers are nearly fully used, at least for the comparison since the pcmpeqb comparison instruction compare 16 items at a time (as opposed to 2 with Clang). The other instructions like paddq are cheap (for example, paddq can be scheduled on the 3 SSE ports) and they should not impact much the execution time. In the end, this version should still be bounded by the port 5, but it should be much faster than the Clang version. Indeed, one should expect the execution time to reach 1 cycle/item (since the port scheduling is certainly not perfect and memory loads may introduce some stalling cycles). This means an execution time of 4.2 ms. This is close to the observed results.

Faster implementation

The GCC implementation is not perfect.

First of all, it does not use AVX2 supported by your processor since the -mavx2 flag is not provided (or any similar flag like -march=native). Indeed, GCC like other mainstream compilers only use SSE2 by default for sake of compatibility with previous architecture: SSE2 is safe to use on all x86-64 processors, but not other instruction sets like SSE3, SSSE3, SSE4.1, SSE4.2, AVX, AVX2. With such flag, GCC should be able to produce a memory bound code.

Moreover, the compiler could theoretically perform a multi-level sum reduction. The idea is to accumulate the result of the comparison in a 8-bit wide SIMD lane using chunks with a size of 1024 items (ie. 64x16 items). This is safe since the value of each lane cannot exceed 64. To avoid overflow, the accumulated values needs to be stored in wider SIMD lanes (eg. 64-bit ones). With this strategy, the overhead of the punpckhwd instructions is 64 time smaller. This is a big improvement since it removes the saturation of the port 5. This strategy should be sufficient to generate a memory-bound code, even using only SSE2. Here is an example of untested code requiring the flag -fopenmp-simd to be efficient.

uint64_t compareFunction(const char *const __restrict buffer, const uint64_t commonSize)
{
    uint64_t byteChunk = 0;
    uint64_t diffFound = 0;

    if(commonSize >= 127)
    {
        for(; byteChunk < commonSize-127; byteChunk += 128)
        {
            uint8_t tmpDiffFound = 0;
            #pragma omp simd reduction(+:tmpDiffFound)
            for(uint64_t byte = byteChunk; byte < byteChunk + 128; ++byte)
                tmpDiffFound += buffer[byte] != buffer[byte + commonSize];
            diffFound += tmpDiffFound;
        }
    }

    for(uint64_t byte = byteChunk; byte < commonSize; ++byte)
        diffFound += buffer[byte] != buffer[byte + commonSize];

    return diffFound;
}

Both GCC and Clang generates a rather efficient code (while sub-optimal for data fitting in the cache), especially Clang. Here is for example the code generated by Clang using AVX2:

.LBB0_4:
        lea     r10, [rdx + 128]
        vmovdqu ymm2, ymmword ptr [r9 + rdx - 96]
        vmovdqu ymm3, ymmword ptr [r9 + rdx - 64]
        vmovdqu ymm4, ymmword ptr [r9 + rdx - 32]
        vpcmpeqb        ymm2, ymm2, ymmword ptr [rcx + rdx - 96]
        vpcmpeqb        ymm3, ymm3, ymmword ptr [rcx + rdx - 64]
        vpcmpeqb        ymm4, ymm4, ymmword ptr [rcx + rdx - 32]
        vmovdqu ymm5, ymmword ptr [r9 + rdx]
        vpaddb  ymm2, ymm4, ymm2
        vpcmpeqb        ymm4, ymm5, ymmword ptr [rcx + rdx]
        vpaddb  ymm3, ymm4, ymm3
        vpaddb  ymm2, ymm3, ymm2
        vpaddb  ymm2, ymm2, ymm0
        vextracti128    xmm3, ymm2, 1
        vpaddb  xmm2, xmm2, xmm3
        vpshufd xmm3, xmm2, 238
        vpaddb  xmm2, xmm2, xmm3
        vpsadbw xmm2, xmm2, xmm1
        vpextrb edx, xmm2, 0
        add     rax, rdx
        mov     rdx, r10
        cmp     r10, r8
        jb      .LBB0_4

All the loads are 256-bit SIMD ones. The number of vpcmpeqb is optimal. The number of vpaddb is relatively good. There are few other instructions, but they should clearly not be a bottleneck. The loop operate on 128 items per iteration and I expect it to takes less than a dozen of cycles per iteration for data already in the cache (otherwise it should be completely memory-bound). This means <0.1 cycle/item, that is, far less than the previous implementation. In fact, the uiCA tool indicates about 0.055 cycle/item, that is 81 GiB/s! One may manually write a better code using SIMD intrinsics, but at the expense of a significantly worse portability, maintenance and readability.

Note that generating a sequential memory-bound does not always mean the RAM throughput will be saturated. In fact, on one core, there is sometimes not enough concurrency to hide the latency of memory operations though it should be fine on your processor (like it is on my i5-9600KF with 2 interleaved 3200 MHz DDR4 memory channels).