Tags: python, c, performance, x86-64, numba

Why is numba popcount code twice as fast as equivalent C code?


I have this simple python/numba code:

from numba import njit
import numba as nb
@nb.njit(nb.uint64(nb.uint64))
def popcount(x):
    b = 0
    while x > 0:
        x &= x - nb.uint64(1)  # clear the lowest set bit
        b += 1
    return b
@njit
def timed_loop(n):
    summand = 0
    for i in range(n):
        summand += popcount(i)
    return summand

It just adds the popcounts for integers 0 to n - 1.

When I time it I get:

%timeit timed_loop(1000000)
340 µs ± 1.08 µs per loop (mean ± std. dev. of 7 runs, 1,000 loops each)

It turns out that LLVM cleverly converts the popcount loop into the native CPU POPCNT instruction, so we should expect it to be fast. But the question is: how fast?
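For reference, Clang's loop-idiom recognition does the same for the equivalent C loop; compiling the snippet below with clang -O2 -march=native -S should show a single popcnt instruction (a minimal check, not part of my benchmark):

// C analogue of the Python bit-trick loop; Clang's loop-idiom
// recognition should reduce it to a single popcnt instruction.
unsigned popcount_loop(unsigned long long x) {
    unsigned b = 0;
    while (x > 0) {
        x &= x - 1;   // clear the lowest set bit
        b++;
    }
    return b;
}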

I thought I would compare it to a C version to see the speed difference.

#include <stdio.h>
#include <time.h>

// Function to calculate the population count (number of set bits) of an integer using __builtin_popcount
int popcount(int num) {
    return __builtin_popcount(num);
}

int main() {
    unsigned int n;
    printf("Enter the value of n: ");
    scanf("%u", &n);

    // Variables to store start and end times
    struct timespec start_time, end_time;

    // Get the current time as the start time
    clock_gettime(CLOCK_MONOTONIC, &start_time);

    int sum = 0;
    for (unsigned int i = 0; i < n; i++) {
        sum += popcount(i);
    }

    // Get the current time as the end time
    clock_gettime(CLOCK_MONOTONIC, &end_time);

    // Calculate the elapsed time in microseconds
    long long elapsed_time = (end_time.tv_sec - start_time.tv_sec) * 1000000LL +
                            (end_time.tv_nsec - start_time.tv_nsec) / 1000;

    printf("Sum of population counts from 0 to %d-1 is: %d\n", n, sum);
    printf("Elapsed time: %lld microseconds\n", elapsed_time);

    return 0;
}

I then compiled this with -march=native -Ofast. I tried both gcc and clang and the results were very similar.

./popcount 
Enter the value of n: 1000000
Sum of population counts from 0 to 1000000-1 is: 9884992
Elapsed time: 732 microseconds

Why is the Numba version twice as fast as the C code?


Solution

  • TL;DR: the performance gap between the GCC and Clang versions is due to the use of scalar versus SIMD instructions. The performance gap between the Numba and Clang versions comes from the integer size, which is not the same in the two versions: 64-bit versus 32-bit.


    Performance Results

    First of all, I am also able to reproduce the problem on my Intel i5-9600KF. Here are the results (and the versions):

    Numba 0.56.4:  170.089 ms
    Clang 14.0.6:  190.350 ms
    GCC 12.2.0:    328.133 ms
    

    To understand what happens, we need to analyze the assembly code produced by each compiler.


    Assembly code

    Here is the assembly code of the hot loop produced by GCC:

    .L5:
        xorl    %edx, %edx        # break popcnt's false output dependency
        popcntl %eax, %edx        # edx = popcount(i)
        incl    %eax              # i++
        addl    %edx, %ebx        # sum += edx
        cmpl    %ecx, %eax
        jne .L5                   # loop while i != n
    

    Here is the one produced by Clang:

    .LBB1_3:                                # =>This Inner Loop Header: Depth=1
        vpand   %ymm5, %ymm0, %ymm12
        vpshufb %ymm12, %ymm6, %ymm12
        vpsrlw  $4, %ymm0, %ymm13
        vpand   %ymm5, %ymm13, %ymm13
        vpshufb %ymm13, %ymm6, %ymm13
        vpaddb  %ymm12, %ymm13, %ymm12
        vpunpckhdq  %ymm1, %ymm12, %ymm13   # ymm13 = ymm12[2],ymm1[2],ymm12[3],ymm1[3],ymm12[6],ymm1[6],ymm12[7],ymm1[7]
        vpsadbw %ymm1, %ymm13, %ymm13
        vpunpckldq  %ymm1, %ymm12, %ymm12   # ymm12 = ymm12[0],ymm1[0],ymm12[1],ymm1[1],ymm12[4],ymm1[4],ymm12[5],ymm1[5]
        vpsadbw %ymm1, %ymm12, %ymm12
        vpackuswb   %ymm13, %ymm12, %ymm12
        vpaddd  %ymm2, %ymm0, %ymm13
        vpaddd  %ymm12, %ymm8, %ymm8
        vpand   %ymm5, %ymm13, %ymm12
        vpshufb %ymm12, %ymm6, %ymm12
        vpsrlw  $4, %ymm13, %ymm13
        vpand   %ymm5, %ymm13, %ymm13
        vpshufb %ymm13, %ymm6, %ymm13
        vpaddb  %ymm12, %ymm13, %ymm12
        vpunpckhdq  %ymm1, %ymm12, %ymm13   # ymm13 = ymm12[2],ymm1[2],ymm12[3],ymm1[3],ymm12[6],ymm1[6],ymm12[7],ymm1[7]
        vpsadbw %ymm1, %ymm13, %ymm13
        vpunpckldq  %ymm1, %ymm12, %ymm12   # ymm12 = ymm12[0],ymm1[0],ymm12[1],ymm1[1],ymm12[4],ymm1[4],ymm12[5],ymm1[5]
        vpsadbw %ymm1, %ymm12, %ymm12
        vpackuswb   %ymm13, %ymm12, %ymm12
        vpaddd  %ymm3, %ymm0, %ymm13
        vpaddd  %ymm12, %ymm9, %ymm9
        vpand   %ymm5, %ymm13, %ymm12
        vpshufb %ymm12, %ymm6, %ymm12
        vpsrlw  $4, %ymm13, %ymm13
        vpand   %ymm5, %ymm13, %ymm13
        vpshufb %ymm13, %ymm6, %ymm13
        vpaddb  %ymm12, %ymm13, %ymm12
        vpunpckhdq  %ymm1, %ymm12, %ymm13   # ymm13 = ymm12[2],ymm1[2],ymm12[3],ymm1[3],ymm12[6],ymm1[6],ymm12[7],ymm1[7]
        vpsadbw %ymm1, %ymm13, %ymm13
        vpunpckldq  %ymm1, %ymm12, %ymm12   # ymm12 = ymm12[0],ymm1[0],ymm12[1],ymm1[1],ymm12[4],ymm1[4],ymm12[5],ymm1[5]
        vpsadbw %ymm1, %ymm12, %ymm12
        vpackuswb   %ymm13, %ymm12, %ymm12
        vpaddd  %ymm4, %ymm0, %ymm13
        vpaddd  %ymm12, %ymm10, %ymm10
        vpand   %ymm5, %ymm13, %ymm12
        vpshufb %ymm12, %ymm6, %ymm12
        vpsrlw  $4, %ymm13, %ymm13
        vpand   %ymm5, %ymm13, %ymm13
        vpshufb %ymm13, %ymm6, %ymm13
        vpaddb  %ymm12, %ymm13, %ymm12
        vpunpckhdq  %ymm1, %ymm12, %ymm13   # ymm13 = ymm12[2],ymm1[2],ymm12[3],ymm1[3],ymm12[6],ymm1[6],ymm12[7],ymm1[7]
        vpsadbw %ymm1, %ymm13, %ymm13
        vpunpckldq  %ymm1, %ymm12, %ymm12   # ymm12 = ymm12[0],ymm1[0],ymm12[1],ymm1[1],ymm12[4],ymm1[4],ymm12[5],ymm1[5]
        vpsadbw %ymm1, %ymm12, %ymm12
        vpackuswb   %ymm13, %ymm12, %ymm12
        vpaddd  %ymm12, %ymm11, %ymm11
        vpaddd  %ymm7, %ymm0, %ymm0
        addl    $-32, %edx
        jne .LBB1_3
    

    Here is the one produced by Numba:

    .LBB0_8:
        vpand   %ymm0, %ymm9, %ymm6
        vpshufb %ymm6, %ymm10, %ymm6
        vpsrlw  $4, %ymm0, %ymm7
        vpand   %ymm7, %ymm9, %ymm7
        vpshufb %ymm7, %ymm10, %ymm7
        vpaddb  %ymm6, %ymm7, %ymm6
        vpaddq  -40(%rsp), %ymm0, %ymm7
        vpsadbw %ymm5, %ymm6, %ymm6
        vpaddq  %ymm6, %ymm1, %ymm1
        vpand   %ymm7, %ymm9, %ymm6
        vpshufb %ymm6, %ymm10, %ymm6
        vpsrlw  $4, %ymm7, %ymm7
        vpand   %ymm7, %ymm9, %ymm7
        vpshufb %ymm7, %ymm10, %ymm7
        vpaddb  %ymm6, %ymm7, %ymm6
        vpaddq  -72(%rsp), %ymm0, %ymm7
        vpsadbw %ymm5, %ymm6, %ymm6
        vpaddq  %ymm6, %ymm2, %ymm2
        vpand   %ymm7, %ymm9, %ymm6
        vpshufb %ymm6, %ymm10, %ymm6
        vpsrlw  $4, %ymm7, %ymm7
        vpand   %ymm7, %ymm9, %ymm7
        vpshufb %ymm7, %ymm10, %ymm7
        vpaddb  %ymm6, %ymm7, %ymm6
        vpaddq  %ymm0, %ymm8, %ymm7
        vpsadbw %ymm5, %ymm6, %ymm6
        vpaddq  %ymm6, %ymm3, %ymm3
        vpand   %ymm7, %ymm9, %ymm6
        vpshufb %ymm6, %ymm10, %ymm6
        vpsrlw  $4, %ymm7, %ymm7
        vpand   %ymm7, %ymm9, %ymm7
        vpshufb %ymm7, %ymm10, %ymm7
        vpaddb  %ymm6, %ymm7, %ymm6
        vpsadbw %ymm5, %ymm6, %ymm6
        vpaddq  %ymm6, %ymm4, %ymm4
        vpaddq  %ymm0, %ymm11, %ymm6
        vpand   %ymm6, %ymm9, %ymm7
        vpshufb %ymm7, %ymm10, %ymm7
        vpsrlw  $4, %ymm6, %ymm6
        vpand   %ymm6, %ymm9, %ymm6
        vpshufb %ymm6, %ymm10, %ymm6
        vpaddb  %ymm7, %ymm6, %ymm6
        vpaddq  %ymm0, %ymm12, %ymm7
        vpsadbw %ymm5, %ymm6, %ymm6
        vpaddq  %ymm6, %ymm1, %ymm1
        vpand   %ymm7, %ymm9, %ymm6
        vpshufb %ymm6, %ymm10, %ymm6
        vpsrlw  $4, %ymm7, %ymm7
        vpand   %ymm7, %ymm9, %ymm7
        vpshufb %ymm7, %ymm10, %ymm7
        vpaddb  %ymm6, %ymm7, %ymm6
        vpaddq  %ymm0, %ymm13, %ymm7
        vpsadbw %ymm5, %ymm6, %ymm6
        vpaddq  %ymm6, %ymm2, %ymm2
        vpand   %ymm7, %ymm9, %ymm6
        vpshufb %ymm6, %ymm10, %ymm6
        vpsrlw  $4, %ymm7, %ymm7
        vpand   %ymm7, %ymm9, %ymm7
        vpshufb %ymm7, %ymm10, %ymm7
        vpaddb  %ymm6, %ymm7, %ymm6
        vpaddq  %ymm0, %ymm14, %ymm7
        vpsadbw %ymm5, %ymm6, %ymm6
        vpaddq  %ymm6, %ymm3, %ymm3
        vpand   %ymm7, %ymm9, %ymm6
        vpshufb %ymm6, %ymm10, %ymm6
        vpsrlw  $4, %ymm7, %ymm7
        vpand   %ymm7, %ymm9, %ymm7
        vpshufb %ymm7, %ymm10, %ymm7
        vpaddb  %ymm6, %ymm7, %ymm6
        vpsadbw %ymm5, %ymm6, %ymm6
        vpaddq  %ymm6, %ymm4, %ymm4
        vpaddq  %ymm0, %ymm15, %ymm0
        addq    $-2, %rbx
        jne .LBB0_8
    

    Analysis

    First of all, we can see that the GCC code uses the popcntl instruction, which is very fast, at least among scalar instructions (the xorl preceding it simply breaks popcnt's false output-register dependency present on some Intel CPUs).

    Clang generates assembly using the AVX2 SIMD instruction set on my machine. This is why the program produced by Clang is so much faster than the one produced by GCC: it operates on many items in parallel thanks to SIMD instructions.
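    To make this assembly easier to read, here is a minimal C intrinsics sketch of the nibble-lookup popcount technique that the vpand/vpsrlw/vpshufb/vpsadbw sequence implements (my reconstruction for illustration, not the compiler's exact code):

    #include <immintrin.h>

    // Popcount of each 64-bit lane of a 256-bit vector using the 4-bit
    // lookup-table technique (often attributed to Wojciech Mula).
    static __m256i popcount_epi64_avx2(__m256i v) {
        const __m256i lut = _mm256_setr_epi8(   // popcounts of nibbles 0..15
            0, 1, 1, 2, 1, 2, 2, 3, 1, 2, 2, 3, 2, 3, 3, 4,
            0, 1, 1, 2, 1, 2, 2, 3, 1, 2, 2, 3, 2, 3, 3, 4);
        const __m256i low = _mm256_set1_epi8(0x0F);
        __m256i lo  = _mm256_and_si256(v, low);                       // vpand
        __m256i hi  = _mm256_and_si256(_mm256_srli_epi16(v, 4), low); // vpsrlw + vpand
        __m256i cnt = _mm256_add_epi8(_mm256_shuffle_epi8(lut, lo),   // vpshufb
                                      _mm256_shuffle_epi8(lut, hi));  // + vpaddb
        return _mm256_sad_epu8(cnt, _mm256_setzero_si256());          // vpsadbw
    }

    Note that vpsadbw directly yields one 64-bit count per lane, which is why the 64-bit Numba version needs no further conversion, unlike the 32-bit Clang version analyzed below.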

    Numba generates code very similar to Clang's. This is not surprising since Numba is based on llvmlite (and therefore LLVM), while Clang is also built on LLVM. However, there are small differences that explain the performance gap. Indeed, the Numba assembly operates on twice as many items per iteration as the Clang counterpart, which can be seen by counting the vpsrlw instructions (8 vs 4). I do not expect this alone to make the difference, since the Clang loop is already well unrolled and the benefit of unrolling it further is tiny. This more aggressive unrolling is actually a side effect.

    The key difference is that Numba operates on 64-bit integers while the C code operates on 32-bit integers! This is why Clang unrolls the loop differently and generates different instructions. In fact, the smaller integers cause Clang to emit a sequence of size-conversion instructions (the vpunpckhdq/vpunpckldq/vpackuswb above), which is less efficient. IMHO, this is a side effect that trips up the optimizer, since operating on smaller items can generally be exploited to produce faster SIMD code. The code produced by LLVM seems sub-optimal in this case: it saturates port 5 (i.e. the shuffle/permute execution unit) on my machine, while one could write code that avoids this saturation (not easy though).


    Faster C implementation

    You can fix the C implementation so that it operates on 64-bit integers:

    #include <stdio.h>
    #include <time.h>
    #include <stdint.h>
    
    // Function to calculate the population count (number of set bits) of an integer using __builtin_popcountl
    uint64_t popcount(uint64_t num) {
        return __builtin_popcountl(num);
    }
    
    int main() {
        int64_t n;
        printf("Enter the value of n: ");
        scanf("%ld", &n);
    
        // Variables to store start and end times
        struct timespec start_time, end_time;
    
        // Get the current time as the start time
        clock_gettime(CLOCK_MONOTONIC, &start_time);
    
        int64_t sum = 0;
        for (int64_t i = 0; i < n; i++) {
            sum += popcount(i);
        }
    
        // Get the current time as the end time
        clock_gettime(CLOCK_MONOTONIC, &end_time);
    
        // Calculate the elapsed time in microseconds
        long long elapsed_time = (end_time.tv_sec - start_time.tv_sec) * 1000000LL +
                                (end_time.tv_nsec - start_time.tv_nsec) / 1000;
    
        printf("Sum of population counts from 0 to %ld-1 is: %ld\n", n, sum);
        printf("Elapsed time: %lld microseconds\n", elapsed_time);
    
        return 0;
    }
    

    This produces a program as fast as Numba on my machine when using Clang (GCC still generates a slow scalar implementation).


    Notes

    SIMD versions only make sense if your real-world code is SIMD-friendly, that is, if popcount can be applied to multiple contiguous items. Otherwise, the results can be drastically different from the ones above (in fact, for the scalar case, the three compilers generate very similar code, which I expect to be equally fast).

    AVX-512 provides the SIMD instruction VPOPCNTDQ, which should clearly outperform the code LLVM generates using (only) AVX2. Since I do not have AVX-512 on my machine, and AVX2 provides no such instruction, it makes sense for LLVM to produce AVX2 assembly. The AVX-512 instruction can count the set bits of 16 x 32-bit integers in parallel while taking about the same number of cycles as its scalar counterpart. To be more precise, it requires the AVX512VPOPCNTDQ + AVX512VL instruction sets (which AFAIK are not both available on all CPUs supporting AVX-512). As of now, it is only available on a few x86-64 micro-architectures (e.g. Intel Ice Lake, Intel Sapphire Rapids and AMD Zen 4).
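    For illustration, here is a minimal sketch (untested, since my machine lacks AVX-512) of how the benchmark loop could use this instruction through intrinsics; it assumes a CPU and compiler supporting AVX512F + AVX512VPOPCNTDQ (compiled with e.g. -mavx512f -mavx512vpopcntdq):

    #include <immintrin.h>
    #include <stdint.h>

    // Sums the popcounts of 0..n-1, processing 16 x 32-bit integers per
    // iteration with VPOPCNTD (_mm512_popcnt_epi32).
    uint64_t popcount_sum_avx512(uint32_t n) {
        __m512i idx = _mm512_setr_epi32(0, 1, 2, 3, 4, 5, 6, 7,
                                        8, 9, 10, 11, 12, 13, 14, 15);
        const __m512i step = _mm512_set1_epi32(16);
        __m512i acc = _mm512_setzero_si512();    // 16 partial sums (32-bit lanes suffice here)
        uint32_t i = 0;
        for (; i + 16 <= n; i += 16) {
            acc = _mm512_add_epi32(acc, _mm512_popcnt_epi32(idx));
            idx = _mm512_add_epi32(idx, step);
        }
        uint64_t sum = (uint64_t)_mm512_reduce_add_epi32(acc);
        for (; i < n; i++)                       // scalar tail
            sum += (uint64_t)__builtin_popcount(i);
        return sum;
    }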