Tags: python, c, performance, x86-64, numba

Why is numba popcount code twice as fast as equivalent C code?


I have this simple python/numba code:

from numba import njit
import numba as nb
@nb.njit(nb.uint64(nb.uint64))
def popcount(x):
    b = 0
    while x > 0:
        x &= x - nb.uint64(1)  # clear the lowest set bit
        b += 1
    return b
@njit
def timed_loop(n):
    summand = 0
    for i in range(n):
        summand += popcount(i)
    return summand

It just adds the popcounts for integers 0 to n - 1.

When I time it I get:

%timeit timed_loop(1000000)
340 µs ± 1.08 µs per loop (mean ± std. dev. of 7 runs, 1,000 loops each)

It turns out that LLVM cleverly converts the popcount loop into the native CPU POPCNT instruction, so we should expect it to be fast. But the question is: how fast?
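For reference, Clang's loop-idiom recognition does the same for the equivalent C loop; compiling the snippet below with clang -O2 -march=native -S should show a single popcnt instruction (a minimal check, not part of my benchmark):

// C analogue of the Python bit-trick loop; Clang's loop-idiom
// recognition should reduce it to a single popcnt instruction.
unsigned popcount_loop(unsigned long long x) {
    unsigned b = 0;
    while (x > 0) {
        x &= x - 1;   // clear the lowest set bit
        b++;
    }
    return b;
}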

I thought I would compare it to a C version to see the speed difference.

#include <stdio.h>
#include <time.h>

// Function to calculate the population count (number of set bits) of an integer using __builtin_popcount
int popcount(int num) {
    return __builtin_popcount(num);
}

int main() {
    unsigned int n;
    printf("Enter the value of n: ");
    scanf("%u", &n);

    // Variables to store start and end times
    struct timespec start_time, end_time;

    // Get the current time as the start time
    clock_gettime(CLOCK_MONOTONIC, &start_time);

    int sum = 0;
    for (unsigned int i = 0; i < n; i++) {
        sum += popcount(i);
    }

    // Get the current time as the end time
    clock_gettime(CLOCK_MONOTONIC, &end_time);

    // Calculate the elapsed time in microseconds
    long long elapsed_time = (end_time.tv_sec - start_time.tv_sec) * 1000000LL +
                            (end_time.tv_nsec - start_time.tv_nsec) / 1000;

    printf("Sum of population counts from 0 to %d-1 is: %d\n", n, sum);
    printf("Elapsed time: %lld microseconds\n", elapsed_time);

    return 0;
}

I then compiled this with -march=native -Ofast. I tried both gcc and clang and the results were very similar.

./popcount 
Enter the value of n: 1000000
Sum of population counts from 0 to 1000000-1 is: 9884992
Elapsed time: 732 microseconds

Why is the Numba version twice as fast as the C code?


Solution

  • TL;DR: the performance gap between the GCC and Clang versions is due to the use of scalar versus SIMD instructions. The performance gap between the Numba and Clang versions comes from the integer size, which is not the same in the two versions: 64-bit versus 32-bit.


    Performance Results

    First of all, I am also able to reproduce the problem on my Intel i5-9600KF. Here are the results (and the versions):

    Numba 0.56.4:  170.089 ms
    Clang 14.0.6:  190.350 ms
    GCC 12.2.0:    328.133 ms
    

    To understand what happens, we need to analyze the assembly code produced by each compiler.


    Assembly code

    Here is the assembly code of the hot loop produced by GCC:

    .L5:
        xorl    %edx, %edx        # break popcnt's false output dependency
        popcntl %eax, %edx        # edx = popcount(i)
        incl    %eax              # i++
        addl    %edx, %ebx        # sum += edx
        cmpl    %ecx, %eax
        jne .L5                   # loop while i != n
    

    Here is the one produced by Clang:

    .LBB1_3:                                # =>This Inner Loop Header: Depth=1
        vpand   %ymm5, %ymm0, %ymm12
        vpshufb %ymm12, %ymm6, %ymm12
        vpsrlw  $4, %ymm0, %ymm13
        vpand   %ymm5, %ymm13, %ymm13
        vpshufb %ymm13, %ymm6, %ymm13
        vpaddb  %ymm12, %ymm13, %ymm12
        vpunpckhdq  %ymm1, %ymm12, %ymm13   # ymm13 = ymm12[2],ymm1[2],ymm12[3],ymm1[3],ymm12[6],ymm1[6],ymm12[7],ymm1[7]
        vpsadbw %ymm1, %ymm13, %ymm13
        vpunpckldq  %ymm1, %ymm12, %ymm12   # ymm12 = ymm12[0],ymm1[0],ymm12[1],ymm1[1],ymm12[4],ymm1[4],ymm12[5],ymm1[5]
        vpsadbw %ymm1, %ymm12, %ymm12
        vpackuswb   %ymm13, %ymm12, %ymm12
        vpaddd  %ymm2, %ymm0, %ymm13
        vpaddd  %ymm12, %ymm8, %ymm8
        vpand   %ymm5, %ymm13, %ymm12
        vpshufb %ymm12, %ymm6, %ymm12
        vpsrlw  $4, %ymm13, %ymm13
        vpand   %ymm5, %ymm13, %ymm13
        vpshufb %ymm13, %ymm6, %ymm13
        vpaddb  %ymm12, %ymm13, %ymm12
        vpunpckhdq  %ymm1, %ymm12, %ymm13   # ymm13 = ymm12[2],ymm1[2],ymm12[3],ymm1[3],ymm12[6],ymm1[6],ymm12[7],ymm1[7]
        vpsadbw %ymm1, %ymm13, %ymm13
        vpunpckldq  %ymm1, %ymm12, %ymm12   # ymm12 = ymm12[0],ymm1[0],ymm12[1],ymm1[1],ymm12[4],ymm1[4],ymm12[5],ymm1[5]
        vpsadbw %ymm1, %ymm12, %ymm12
        vpackuswb   %ymm13, %ymm12, %ymm12
        vpaddd  %ymm3, %ymm0, %ymm13
        vpaddd  %ymm12, %ymm9, %ymm9
        vpand   %ymm5, %ymm13, %ymm12
        vpshufb %ymm12, %ymm6, %ymm12
        vpsrlw  $4, %ymm13, %ymm13
        vpand   %ymm5, %ymm13, %ymm13
        vpshufb %ymm13, %ymm6, %ymm13
        vpaddb  %ymm12, %ymm13, %ymm12
        vpunpckhdq  %ymm1, %ymm12, %ymm13   # ymm13 = ymm12[2],ymm1[2],ymm12[3],ymm1[3],ymm12[6],ymm1[6],ymm12[7],ymm1[7]
        vpsadbw %ymm1, %ymm13, %ymm13
        vpunpckldq  %ymm1, %ymm12, %ymm12   # ymm12 = ymm12[0],ymm1[0],ymm12[1],ymm1[1],ymm12[4],ymm1[4],ymm12[5],ymm1[5]
        vpsadbw %ymm1, %ymm12, %ymm12
        vpackuswb   %ymm13, %ymm12, %ymm12
        vpaddd  %ymm4, %ymm0, %ymm13
        vpaddd  %ymm12, %ymm10, %ymm10
        vpand   %ymm5, %ymm13, %ymm12
        vpshufb %ymm12, %ymm6, %ymm12
        vpsrlw  $4, %ymm13, %ymm13
        vpand   %ymm5, %ymm13, %ymm13
        vpshufb %ymm13, %ymm6, %ymm13
        vpaddb  %ymm12, %ymm13, %ymm12
        vpunpckhdq  %ymm1, %ymm12, %ymm13   # ymm13 = ymm12[2],ymm1[2],ymm12[3],ymm1[3],ymm12[6],ymm1[6],ymm12[7],ymm1[7]
        vpsadbw %ymm1, %ymm13, %ymm13
        vpunpckldq  %ymm1, %ymm12, %ymm12   # ymm12 = ymm12[0],ymm1[0],ymm12[1],ymm1[1],ymm12[4],ymm1[4],ymm12[5],ymm1[5]
        vpsadbw %ymm1, %ymm12, %ymm12
        vpackuswb   %ymm13, %ymm12, %ymm12
        vpaddd  %ymm12, %ymm11, %ymm11
        vpaddd  %ymm7, %ymm0, %ymm0
        addl    $-32, %edx
        jne .LBB1_3
    

    Here is the one produced by Numba:

    .LBB0_8:
        vpand   %ymm0, %ymm9, %ymm6
        vpshufb %ymm6, %ymm10, %ymm6
        vpsrlw  $4, %ymm0, %ymm7
        vpand   %ymm7, %ymm9, %ymm7
        vpshufb %ymm7, %ymm10, %ymm7
        vpaddb  %ymm6, %ymm7, %ymm6
        vpaddq  -40(%rsp), %ymm0, %ymm7
        vpsadbw %ymm5, %ymm6, %ymm6
        vpaddq  %ymm6, %ymm1, %ymm1
        vpand   %ymm7, %ymm9, %ymm6
        vpshufb %ymm6, %ymm10, %ymm6
        vpsrlw  $4, %ymm7, %ymm7
        vpand   %ymm7, %ymm9, %ymm7
        vpshufb %ymm7, %ymm10, %ymm7
        vpaddb  %ymm6, %ymm7, %ymm6
        vpaddq  -72(%rsp), %ymm0, %ymm7
        vpsadbw %ymm5, %ymm6, %ymm6
        vpaddq  %ymm6, %ymm2, %ymm2
        vpand   %ymm7, %ymm9, %ymm6
        vpshufb %ymm6, %ymm10, %ymm6
        vpsrlw  $4, %ymm7, %ymm7
        vpand   %ymm7, %ymm9, %ymm7
        vpshufb %ymm7, %ymm10, %ymm7
        vpaddb  %ymm6, %ymm7, %ymm6
        vpaddq  %ymm0, %ymm8, %ymm7
        vpsadbw %ymm5, %ymm6, %ymm6
        vpaddq  %ymm6, %ymm3, %ymm3
        vpand   %ymm7, %ymm9, %ymm6
        vpshufb %ymm6, %ymm10, %ymm6
        vpsrlw  $4, %ymm7, %ymm7
        vpand   %ymm7, %ymm9, %ymm7
        vpshufb %ymm7, %ymm10, %ymm7
        vpaddb  %ymm6, %ymm7, %ymm6
        vpsadbw %ymm5, %ymm6, %ymm6
        vpaddq  %ymm6, %ymm4, %ymm4
        vpaddq  %ymm0, %ymm11, %ymm6
        vpand   %ymm6, %ymm9, %ymm7
        vpshufb %ymm7, %ymm10, %ymm7
        vpsrlw  $4, %ymm6, %ymm6
        vpand   %ymm6, %ymm9, %ymm6
        vpshufb %ymm6, %ymm10, %ymm6
        vpaddb  %ymm7, %ymm6, %ymm6
        vpaddq  %ymm0, %ymm12, %ymm7
        vpsadbw %ymm5, %ymm6, %ymm6
        vpaddq  %ymm6, %ymm1, %ymm1
        vpand   %ymm7, %ymm9, %ymm6
        vpshufb %ymm6, %ymm10, %ymm6
        vpsrlw  $4, %ymm7, %ymm7
        vpand   %ymm7, %ymm9, %ymm7
        vpshufb %ymm7, %ymm10, %ymm7
        vpaddb  %ymm6, %ymm7, %ymm6
        vpaddq  %ymm0, %ymm13, %ymm7
        vpsadbw %ymm5, %ymm6, %ymm6
        vpaddq  %ymm6, %ymm2, %ymm2
        vpand   %ymm7, %ymm9, %ymm6
        vpshufb %ymm6, %ymm10, %ymm6
        vpsrlw  $4, %ymm7, %ymm7
        vpand   %ymm7, %ymm9, %ymm7
        vpshufb %ymm7, %ymm10, %ymm7
        vpaddb  %ymm6, %ymm7, %ymm6
        vpaddq  %ymm0, %ymm14, %ymm7
        vpsadbw %ymm5, %ymm6, %ymm6
        vpaddq  %ymm6, %ymm3, %ymm3
        vpand   %ymm7, %ymm9, %ymm6
        vpshufb %ymm6, %ymm10, %ymm6
        vpsrlw  $4, %ymm7, %ymm7
        vpand   %ymm7, %ymm9, %ymm7
        vpshufb %ymm7, %ymm10, %ymm7
        vpaddb  %ymm6, %ymm7, %ymm6
        vpsadbw %ymm5, %ymm6, %ymm6
        vpaddq  %ymm6, %ymm4, %ymm4
        vpaddq  %ymm0, %ymm15, %ymm0
        addq    $-2, %rbx
        jne .LBB0_8
    

    Analysis

    First of all, we can see that the GCC code uses the popcntl instruction, which is very fast, at least among scalar instructions (the xorl preceding it simply breaks popcnt's false output-register dependency present on some Intel CPUs).

    Clang generates assembly using the AVX2 SIMD instruction set on my machine. This is why the program produced by Clang is so much faster than the one produced by GCC: it operates on many items in parallel thanks to SIMD instructions.
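    To make this assembly easier to read, here is a minimal C intrinsics sketch of the nibble-lookup popcount technique that the vpand/vpsrlw/vpshufb/vpsadbw sequence implements (my reconstruction for illustration, not the compiler's exact code):

    #include <immintrin.h>

    // Popcount of each 64-bit lane of a 256-bit vector using the 4-bit
    // lookup-table technique (often attributed to Wojciech Mula).
    static __m256i popcount_epi64_avx2(__m256i v) {
        const __m256i lut = _mm256_setr_epi8(   // popcounts of nibbles 0..15
            0, 1, 1, 2, 1, 2, 2, 3, 1, 2, 2, 3, 2, 3, 3, 4,
            0, 1, 1, 2, 1, 2, 2, 3, 1, 2, 2, 3, 2, 3, 3, 4);
        const __m256i low = _mm256_set1_epi8(0x0F);
        __m256i lo  = _mm256_and_si256(v, low);                       // vpand
        __m256i hi  = _mm256_and_si256(_mm256_srli_epi16(v, 4), low); // vpsrlw + vpand
        __m256i cnt = _mm256_add_epi8(_mm256_shuffle_epi8(lut, lo),   // vpshufb
                                      _mm256_shuffle_epi8(lut, hi));  // + vpaddb
        return _mm256_sad_epu8(cnt, _mm256_setzero_si256());          // vpsadbw
    }

    Note that vpsadbw directly yields one 64-bit count per lane, which is why the 64-bit Numba version needs no further conversion, unlike the 32-bit Clang version analyzed below.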

    Numba generates code very similar to Clang's. This is not surprising since Numba is based on llvmlite (and therefore LLVM), while Clang is also built on LLVM. However, there are small differences that explain the performance gap. Indeed, the Numba assembly operates on twice as many items per iteration as the Clang counterpart, which can be seen by counting the vpsrlw instructions (8 vs 4). I do not expect this alone to make the difference, since the Clang loop is already well unrolled and the benefit of unrolling it further is tiny. This more aggressive unrolling is actually a side effect.

    The key difference is that Numba operates on 64-bit integers while the C code operates on 32-bit integers! This is why Clang unrolls the loop differently and generates different instructions. In fact, the smaller integers cause Clang to emit a sequence of size-conversion instructions (the vpunpckhdq/vpunpckldq/vpackuswb above), which is less efficient. IMHO, this is a side effect that trips up the optimizer, since operating on smaller items can generally be exploited to produce faster SIMD code. The code produced by LLVM seems sub-optimal in this case: it saturates port 5 (i.e. the shuffle/permute execution unit) on my machine, while one could write code that avoids this saturation (not easy though).


    Faster C implementation

    You can fix the C implementation so that it operates on 64-bit integers:

    #include <stdio.h>
    #include <time.h>
    #include <stdint.h>
    
    // Function to calculate the population count (number of set bits) of an integer using __builtin_popcountl
    uint64_t popcount(uint64_t num) {
        return __builtin_popcountl(num);
    }
    
    int main() {
        int64_t n;
        printf("Enter the value of n: ");
        scanf("%ld", &n);
    
        // Variables to store start and end times
        struct timespec start_time, end_time;
    
        // Get the current time as the start time
        clock_gettime(CLOCK_MONOTONIC, &start_time);
    
        int64_t sum = 0;
        for (int64_t i = 0; i < n; i++) {
            sum += popcount(i);
        }
    
        // Get the current time as the end time
        clock_gettime(CLOCK_MONOTONIC, &end_time);
    
        // Calculate the elapsed time in microseconds
        long long elapsed_time = (end_time.tv_sec - start_time.tv_sec) * 1000000LL +
                                (end_time.tv_nsec - start_time.tv_nsec) / 1000;
    
        printf("Sum of population counts from 0 to %ld-1 is: %ld\n", n, sum);
        printf("Elapsed time: %lld microseconds\n", elapsed_time);
    
        return 0;
    }
    

    This produces a program as fast as Numba on my machine when using Clang (GCC still generates a slow scalar implementation).


    Notes

    SIMD versions only make sense if your real-world code is SIMD-friendly, that is, if popcount can be applied to multiple contiguous items. Otherwise, the results can be drastically different from the ones above (in fact, for the scalar case, the three compilers generate very similar code, which I expect to be equally fast).

    AVX-512 provides the SIMD instruction VPOPCNTDQ, which should clearly outperform the code LLVM generates using (only) AVX2. Since I do not have AVX-512 on my machine, and AVX2 provides no such instruction, it makes sense for LLVM to produce AVX2 assembly. The AVX-512 instruction can count the set bits of 16 x 32-bit integers in parallel while taking about the same number of cycles as its scalar counterpart. To be more precise, it requires the AVX512VPOPCNTDQ + AVX512VL instruction sets (which AFAIK are not both available on all CPUs supporting AVX-512). As of now, it is only available on a few x86-64 micro-architectures (e.g. Intel Ice Lake, Intel Sapphire Rapids and AMD Zen 4).
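    For illustration, here is a minimal sketch (untested, since my machine lacks AVX-512) of how the benchmark loop could use this instruction through intrinsics; it assumes a CPU and compiler supporting AVX512F + AVX512VPOPCNTDQ (compiled with e.g. -mavx512f -mavx512vpopcntdq):

    #include <immintrin.h>
    #include <stdint.h>

    // Sums the popcounts of 0..n-1, processing 16 x 32-bit integers per
    // iteration with VPOPCNTD (_mm512_popcnt_epi32).
    uint64_t popcount_sum_avx512(uint32_t n) {
        __m512i idx = _mm512_setr_epi32(0, 1, 2, 3, 4, 5, 6, 7,
                                        8, 9, 10, 11, 12, 13, 14, 15);
        const __m512i step = _mm512_set1_epi32(16);
        __m512i acc = _mm512_setzero_si512();    // 16 partial sums (32-bit lanes suffice here)
        uint32_t i = 0;
        for (; i + 16 <= n; i += 16) {
            acc = _mm512_add_epi32(acc, _mm512_popcnt_epi32(idx));
            idx = _mm512_add_epi32(idx, step);
        }
        uint64_t sum = (uint64_t)_mm512_reduce_add_epi32(acc);
        for (; i < n; i++)                       // scalar tail
            sum += (uint64_t)__builtin_popcount(i);
        return sum;
    }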