I have a simple vector-vector addition algorithm implemented in assembly. It uses AVX to read 4 doubles from the A vector and 4 doubles from the B vector. The algorithm adds these numbers and writes the result back to the C vector. If I use vmovntpd to write back the result, the performance becomes extremely erratic from run to run. I ran this test on an Azure server with an Intel Xeon Platinum 8168 CPU. If I run the same test on my laptop (Intel Core i7-2640M CPU), this erratic behavior disappears. What is the problem on the server? One more piece of information: the server has 44 CPUs.
[Edit] Here is my code:
;-----------------------------------------------------------------------
; Dense to dense, AVX-512, result written with non-temporal (cache-
; bypassing) stores, without tolerances:
;
;       c[i] = a[i] + lambda * b[i]        for i in [0, count)
;
; Syntax: NASM (Intel operand order)
; ABI:    System V AMD64, leaf function, no stack usage
; In:     rdi   = address of a (doubles)
;         rsi   = address of b (doubles)
;         rdx   = address of c (doubles, destination)
;         rcx   = count (number of doubles)
;         xmm0  = lambda
; Out:    none
; Clobb:  rax, rcx, rdx, rsi, rdi, zmm0-zmm3, zmm7, flags
; Align:  c must be 64-byte aligned whenever count >= 16, because
;         vmovntpd faults on a misaligned 512-bit destination.
;         a and b are read with alignment-tolerant loads.
;-----------------------------------------------------------------------
global _denseToDenseAddAVX512_nocache_64_linux
_denseToDenseAddAVX512_nocache_64_linux:
        mov     eax, ecx
        and     eax, 0x0F                       ; rax = count % 16 (scalar tail length)
        vbroadcastsd zmm7, xmm0                 ; zmm7 = lambda in all 8 lanes
        shr     rcx, 4                          ; rcx = count / 16 (blocks of 2 x 8 doubles)
        jz      .tail                           ; ZF from shr: no full block to process

.main_loop:
        ; Invariant: rcx = full 16-double blocks remaining, always > 0 here.
        vmovupd zmm0, [rdi]                     ; a[0..7]
        vmulpd  zmm1, zmm7, [rsi]               ; lambda * b[0..7]
        vaddpd  zmm0, zmm0, zmm1                ; a + lambda * b
        vmovntpd [rdx], zmm0                    ; streaming store, bypasses the cache

        vmovupd zmm2, [rdi + 64]                ; a[8..15]
        vmulpd  zmm3, zmm7, [rsi + 64]          ; lambda * b[8..15]
        vaddpd  zmm2, zmm2, zmm3                ; a + lambda * b
        vmovntpd [rdx + 64], zmm2

        add     rdi, 128                        ; advance all three pointers by 16 doubles
        add     rsi, 128
        add     rdx, 128
        dec     rcx                             ; dec/jnz instead of the slow legacy `loop`
        jnz     .main_loop

        ; Non-temporal stores are weakly ordered; fence so they are globally
        ; visible before any later store (e.g. a "done" flag) can be observed.
        sfence

.tail:
        test    eax, eax
        jz      .done

.tail_loop:
        ; Invariant: eax = scalar elements remaining (1..15).
        ; VEX-encoded scalar ops: no legacy-SSE/AVX state transition penalty.
        vmovsd  xmm0, [rdi]                     ; a
        vmulsd  xmm1, xmm7, [rsi]               ; lambda * b
        vaddsd  xmm0, xmm0, xmm1                ; a + lambda * b
        vmovsd  [rdx], xmm0                     ; c (regular store, < one cache line left)
        add     rdi, 8
        add     rsi, 8
        add     rdx, 8
        dec     eax
        jnz     .tail_loop

.done:
        vzeroupper                              ; leave clean upper state for SSE code in the caller
        ret
Okay, I've found the solution! This is a NUMA architecture with 44 CPUs, so I disabled NUMA and limited the number of online CPUs to 1 with the following kernel parameters: numa=off maxcpus=1 nr_cpus=1.