I have this simple Python/Numba code:
from numba import njit
import numba as nb

@nb.njit(nb.uint64(nb.uint64))
def popcount(x):
    b = 0
    while x > 0:
        x &= x - nb.uint64(1)
        b += 1
    return b

@njit
def timed_loop(n):
    summand = 0
    for i in range(n):
        summand += popcount(i)
    return summand
It just adds the popcounts for integers 0 to n - 1.
When I time it I get:
%timeit timed_loop(1000000)
340 µs ± 1.08 µs per loop (mean ± std. dev. of 7 runs, 1,000 loops each)
It turns out that LLVM cleverly converts the popcount function into the native CPU POPCNT instruction, so we should expect it to be fast. But the question is: how fast?
I thought I would compare it to a C version to see the speed difference.
#include <stdio.h>
#include <time.h>

// Function to calculate the population count (number of set bits) of an
// integer using __builtin_popcount
int popcount(int num) {
    return __builtin_popcount(num);
}

int main() {
    unsigned int n;
    printf("Enter the value of n: ");
    scanf("%u", &n);

    // Variables to store start and end times
    struct timespec start_time, end_time;

    // Get the current time as the start time
    clock_gettime(CLOCK_MONOTONIC, &start_time);

    int sum = 0;
    for (unsigned int i = 0; i < n; i++) {
        sum += popcount(i);
    }

    // Get the current time as the end time
    clock_gettime(CLOCK_MONOTONIC, &end_time);

    // Calculate the elapsed time in microseconds
    long long elapsed_time = (end_time.tv_sec - start_time.tv_sec) * 1000000LL +
                             (end_time.tv_nsec - start_time.tv_nsec) / 1000;

    printf("Sum of population counts from 0 to %u-1 is: %d\n", n, sum);
    printf("Elapsed time: %lld microseconds\n", elapsed_time);
    return 0;
}
I then compiled this with -march=native -Ofast. I tried both GCC and Clang and the results were very similar.
./popcount
Enter the value of n: 1000000
Sum of population counts from 0 to 1000000-1 is: 9884992
Elapsed time: 732 microseconds
Why is the Numba code twice as fast as the C code?
TL;DR: the performance gap between the GCC and the Clang versions is due to the use of scalar instructions versus SIMD instructions. The performance gap between the Numba and the Clang versions comes from the size of the integers, which is not the same in the two versions: 64-bit versus 32-bit.
First of all, I am also able to reproduce the problem on my Intel i5-9600KF. Here are the results (and the versions):
Numba 0.56.4: 170.089 ms
Clang 14.0.6: 190.350 ms
GCC 12.2.0: 328.133 ms
To understand what happens, we need to analyze the assembly code produced by all the compilers.
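As a side note, for the Numba version you do not need external tools: the dispatcher object can dump the assembly LLVM generated. A quick sketch (the function must already be compiled for the target types, hence the warm-up call; for the C versions, the compilers' -S output gives the equivalent listing):

# Dump the x86 assembly LLVM generated for the hot loop of timed_loop.
timed_loop(10)  # warm-up call so a compiled version exists
for sig, asm in timed_loop.inspect_asm().items():
    print(sig)
    print(asm)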
Here is the assembly code of the hot loop produced by GCC:
.L5:
xorl %edx, %edx
popcntl %eax, %edx
incl %eax
addl %edx, %ebx
cmpl %ecx, %eax
jne .L5
Here is the one produced by Clang:
.LBB1_3: # =>This Inner Loop Header: Depth=1
vpand %ymm5, %ymm0, %ymm12
vpshufb %ymm12, %ymm6, %ymm12
vpsrlw $4, %ymm0, %ymm13
vpand %ymm5, %ymm13, %ymm13
vpshufb %ymm13, %ymm6, %ymm13
vpaddb %ymm12, %ymm13, %ymm12
vpunpckhdq %ymm1, %ymm12, %ymm13 # ymm13 = ymm12[2],ymm1[2],ymm12[3],ymm1[3],ymm12[6],ymm1[6],ymm12[7],ymm1[7]
vpsadbw %ymm1, %ymm13, %ymm13
vpunpckldq %ymm1, %ymm12, %ymm12 # ymm12 = ymm12[0],ymm1[0],ymm12[1],ymm1[1],ymm12[4],ymm1[4],ymm12[5],ymm1[5]
vpsadbw %ymm1, %ymm12, %ymm12
vpackuswb %ymm13, %ymm12, %ymm12
vpaddd %ymm2, %ymm0, %ymm13
vpaddd %ymm12, %ymm8, %ymm8
vpand %ymm5, %ymm13, %ymm12
vpshufb %ymm12, %ymm6, %ymm12
vpsrlw $4, %ymm13, %ymm13
vpand %ymm5, %ymm13, %ymm13
vpshufb %ymm13, %ymm6, %ymm13
vpaddb %ymm12, %ymm13, %ymm12
vpunpckhdq %ymm1, %ymm12, %ymm13 # ymm13 = ymm12[2],ymm1[2],ymm12[3],ymm1[3],ymm12[6],ymm1[6],ymm12[7],ymm1[7]
vpsadbw %ymm1, %ymm13, %ymm13
vpunpckldq %ymm1, %ymm12, %ymm12 # ymm12 = ymm12[0],ymm1[0],ymm12[1],ymm1[1],ymm12[4],ymm1[4],ymm12[5],ymm1[5]
vpsadbw %ymm1, %ymm12, %ymm12
vpackuswb %ymm13, %ymm12, %ymm12
vpaddd %ymm3, %ymm0, %ymm13
vpaddd %ymm12, %ymm9, %ymm9
vpand %ymm5, %ymm13, %ymm12
vpshufb %ymm12, %ymm6, %ymm12
vpsrlw $4, %ymm13, %ymm13
vpand %ymm5, %ymm13, %ymm13
vpshufb %ymm13, %ymm6, %ymm13
vpaddb %ymm12, %ymm13, %ymm12
vpunpckhdq %ymm1, %ymm12, %ymm13 # ymm13 = ymm12[2],ymm1[2],ymm12[3],ymm1[3],ymm12[6],ymm1[6],ymm12[7],ymm1[7]
vpsadbw %ymm1, %ymm13, %ymm13
vpunpckldq %ymm1, %ymm12, %ymm12 # ymm12 = ymm12[0],ymm1[0],ymm12[1],ymm1[1],ymm12[4],ymm1[4],ymm12[5],ymm1[5]
vpsadbw %ymm1, %ymm12, %ymm12
vpackuswb %ymm13, %ymm12, %ymm12
vpaddd %ymm4, %ymm0, %ymm13
vpaddd %ymm12, %ymm10, %ymm10
vpand %ymm5, %ymm13, %ymm12
vpshufb %ymm12, %ymm6, %ymm12
vpsrlw $4, %ymm13, %ymm13
vpand %ymm5, %ymm13, %ymm13
vpshufb %ymm13, %ymm6, %ymm13
vpaddb %ymm12, %ymm13, %ymm12
vpunpckhdq %ymm1, %ymm12, %ymm13 # ymm13 = ymm12[2],ymm1[2],ymm12[3],ymm1[3],ymm12[6],ymm1[6],ymm12[7],ymm1[7]
vpsadbw %ymm1, %ymm13, %ymm13
vpunpckldq %ymm1, %ymm12, %ymm12 # ymm12 = ymm12[0],ymm1[0],ymm12[1],ymm1[1],ymm12[4],ymm1[4],ymm12[5],ymm1[5]
vpsadbw %ymm1, %ymm12, %ymm12
vpackuswb %ymm13, %ymm12, %ymm12
vpaddd %ymm12, %ymm11, %ymm11
vpaddd %ymm7, %ymm0, %ymm0
addl $-32, %edx
jne .LBB1_3
Here is the one produced by Numba:
.LBB0_8:
vpand %ymm0, %ymm9, %ymm6
vpshufb %ymm6, %ymm10, %ymm6
vpsrlw $4, %ymm0, %ymm7
vpand %ymm7, %ymm9, %ymm7
vpshufb %ymm7, %ymm10, %ymm7
vpaddb %ymm6, %ymm7, %ymm6
vpaddq -40(%rsp), %ymm0, %ymm7
vpsadbw %ymm5, %ymm6, %ymm6
vpaddq %ymm6, %ymm1, %ymm1
vpand %ymm7, %ymm9, %ymm6
vpshufb %ymm6, %ymm10, %ymm6
vpsrlw $4, %ymm7, %ymm7
vpand %ymm7, %ymm9, %ymm7
vpshufb %ymm7, %ymm10, %ymm7
vpaddb %ymm6, %ymm7, %ymm6
vpaddq -72(%rsp), %ymm0, %ymm7
vpsadbw %ymm5, %ymm6, %ymm6
vpaddq %ymm6, %ymm2, %ymm2
vpand %ymm7, %ymm9, %ymm6
vpshufb %ymm6, %ymm10, %ymm6
vpsrlw $4, %ymm7, %ymm7
vpand %ymm7, %ymm9, %ymm7
vpshufb %ymm7, %ymm10, %ymm7
vpaddb %ymm6, %ymm7, %ymm6
vpaddq %ymm0, %ymm8, %ymm7
vpsadbw %ymm5, %ymm6, %ymm6
vpaddq %ymm6, %ymm3, %ymm3
vpand %ymm7, %ymm9, %ymm6
vpshufb %ymm6, %ymm10, %ymm6
vpsrlw $4, %ymm7, %ymm7
vpand %ymm7, %ymm9, %ymm7
vpshufb %ymm7, %ymm10, %ymm7
vpaddb %ymm6, %ymm7, %ymm6
vpsadbw %ymm5, %ymm6, %ymm6
vpaddq %ymm6, %ymm4, %ymm4
vpaddq %ymm0, %ymm11, %ymm6
vpand %ymm6, %ymm9, %ymm7
vpshufb %ymm7, %ymm10, %ymm7
vpsrlw $4, %ymm6, %ymm6
vpand %ymm6, %ymm9, %ymm6
vpshufb %ymm6, %ymm10, %ymm6
vpaddb %ymm7, %ymm6, %ymm6
vpaddq %ymm0, %ymm12, %ymm7
vpsadbw %ymm5, %ymm6, %ymm6
vpaddq %ymm6, %ymm1, %ymm1
vpand %ymm7, %ymm9, %ymm6
vpshufb %ymm6, %ymm10, %ymm6
vpsrlw $4, %ymm7, %ymm7
vpand %ymm7, %ymm9, %ymm7
vpshufb %ymm7, %ymm10, %ymm7
vpaddb %ymm6, %ymm7, %ymm6
vpaddq %ymm0, %ymm13, %ymm7
vpsadbw %ymm5, %ymm6, %ymm6
vpaddq %ymm6, %ymm2, %ymm2
vpand %ymm7, %ymm9, %ymm6
vpshufb %ymm6, %ymm10, %ymm6
vpsrlw $4, %ymm7, %ymm7
vpand %ymm7, %ymm9, %ymm7
vpshufb %ymm7, %ymm10, %ymm7
vpaddb %ymm6, %ymm7, %ymm6
vpaddq %ymm0, %ymm14, %ymm7
vpsadbw %ymm5, %ymm6, %ymm6
vpaddq %ymm6, %ymm3, %ymm3
vpand %ymm7, %ymm9, %ymm6
vpshufb %ymm6, %ymm10, %ymm6
vpsrlw $4, %ymm7, %ymm7
vpand %ymm7, %ymm9, %ymm7
vpshufb %ymm7, %ymm10, %ymm7
vpaddb %ymm6, %ymm7, %ymm6
vpsadbw %ymm5, %ymm6, %ymm6
vpaddq %ymm6, %ymm4, %ymm4
vpaddq %ymm0, %ymm15, %ymm0
addq $-2, %rbx
jne .LBB0_8
First of all, we can see that the GCC code uses the popcntl instruction, which is very fast, at least for scalar operations.
Clang generates assembly code using the AVX2 SIMD instruction set on my machine. This is why the program produced by Clang is so fast compared to the GCC one: it operates on many items in parallel thanks to SIMD instructions.
Numba generates code very similar to Clang's. This is not surprising since Numba is based on llvmlite (and so on LLVM), while Clang is also based on LLVM. However, there are small differences that explain the performance impact. Indeed, the Numba assembly code operates on twice as many items per iteration as the Clang counterpart. This can be seen by counting the number of vpsrlw instructions (8 vs 4). I do not expect this to make the difference, since the Clang loop is already well unrolled and the benefit of unrolling it more is tiny. Actually, this more aggressive unrolling is a side effect. The key difference is that Numba operates on 64-bit integers while the C code operates on 32-bit integers! This is why Clang unrolls the loop differently and generates different instructions. In fact, the smaller integers cause Clang to generate a sequence of instructions to convert between integers of different sizes, which is less efficient. IMHO, this is a side effect impacting the optimizer, since operating on smaller items can generally be used to generate faster SIMD code. The code produced by LLVM seems sub-optimal in this case: it saturates port 5 (i.e. the shuffle/permute execution unit) on my machine, while one can write code that does not saturate it (though this is not easy).
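One way to test this explanation from the Numba side is to force 32-bit unsigned integers in the Python code. A minimal sketch (popcount32 and timed_loop32 are hypothetical names, and I have not benchmarked this variant):

import numba as nb

# Same kernel, but declared with 32-bit unsigned integers so the generated
# SIMD code works on uint32 lanes, like the original C version.
@nb.njit(nb.uint32(nb.uint32))
def popcount32(x):
    b = 0
    while x > 0:
        x &= x - nb.uint32(1)
        b += 1
    return b

@nb.njit
def timed_loop32(n):
    summand = 0
    for i in range(n):
        summand += popcount32(nb.uint32(i))
    return summand

If the integer-size explanation is right, this variant should land close to the 32-bit C timings.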
You can fix the C implementation so that it operates on 64-bit integers:
#include <stdio.h>
#include <time.h>
#include <stdint.h>

// Function to calculate the population count (number of set bits) of a
// 64-bit integer using __builtin_popcountl
uint64_t popcount(uint64_t num) {
    return __builtin_popcountl(num);
}

int main() {
    int64_t n;
    printf("Enter the value of n: ");
    scanf("%ld", &n);

    // Variables to store start and end times
    struct timespec start_time, end_time;

    // Get the current time as the start time
    clock_gettime(CLOCK_MONOTONIC, &start_time);

    int64_t sum = 0;
    for (int64_t i = 0; i < n; i++) {
        sum += popcount(i);
    }

    // Get the current time as the end time
    clock_gettime(CLOCK_MONOTONIC, &end_time);

    // Calculate the elapsed time in microseconds
    long long elapsed_time = (end_time.tv_sec - start_time.tv_sec) * 1000000LL +
                             (end_time.tv_nsec - start_time.tv_nsec) / 1000;

    printf("Sum of population counts from 0 to %ld-1 is: %ld\n", n, sum);
    printf("Elapsed time: %lld microseconds\n", elapsed_time);
    return 0;
}
This produces a program as fast as Numba on my machine when using Clang (GCC still generates a slow scalar implementation).
SIMD versions only make sense if your real-world code is SIMD-friendly, that is, if popcount can be applied to multiple contiguous items. Otherwise, the results with scalar implementations can be drastically different (in fact, the three compilers generate very similar scalar code, which I expect to be equally fast).
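For instance, here is a sketch of the SIMD-friendly case: the same kernel applied to the items of a contiguous NumPy array (popcount_sum is a hypothetical name, and whether LLVM actually vectorizes it still depends on the exact types and target):

import numpy as np
import numba as nb

# SIMD-friendly pattern: the popcount kernel is applied to contiguous
# 64-bit items, so LLVM has a chance to auto-vectorize the loop much
# like the benchmark loop above.
@nb.njit
def popcount_sum(arr):
    total = 0
    for i in range(arr.size):
        x = arr[i]
        b = 0
        while x > 0:
            x &= x - nb.uint64(1)
            b += 1
        total += b
    return total

vals = np.arange(1_000_000, dtype=np.uint64)
print(popcount_sum(vals))  # same result as timed_loop(1_000_000)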
AVX-512 provides the SIMD instruction VPOPCNTDQ, which should clearly outperform the code generated by LLVM using (only) AVX2. Since I do not have AVX-512 on my machine, and AVX2 does not provide such an instruction, it makes sense for LLVM to produce assembly code using AVX2. The AVX-512 instruction can count the number of set bits in 16 x 32-bit integers in parallel while taking about the same number of cycles as its scalar counterpart. To be more precise, the instruction is only available with the AVX512VPOPCNTDQ + AVX512VL instruction sets (which AFAIK are not available on all CPUs supporting AVX-512). As of now, it is only available on a few x86-64 micro-architectures (e.g. Intel Ice Lake, Intel Sapphire Rapids and AMD Zen 4).