Consider the following examples for calculating the sum of an i32 array:
Example 1: Simple for loop
pub fn vec_sum_for_loop_i32(src: &[i32]) -> i32 {
    let mut sum = 0;
    for c in src {
        sum += *c;
    }
    sum
}
Example 2: Explicit SIMD sum
use std::arch::x86_64::*;

// #[inline]
pub fn vec_sum_simd_direct_loop(src: &[i32]) -> i32 {
    #[cfg(debug_assertions)]
    assert!(src.as_ptr() as u64 % 64 == 0);
    #[cfg(debug_assertions)]
    assert!(src.len() % (std::mem::size_of::<__m256i>() / std::mem::size_of::<i32>()) == 0);

    let p_src = src.as_ptr();
    let batch_size = std::mem::size_of::<__m256i>() / std::mem::size_of::<i32>();
    #[cfg(debug_assertions)]
    assert!(src.len() % batch_size == 0);

    let result: i32;
    unsafe {
        let mut offset: isize = 0;
        let total: isize = src.len() as isize;
        let mut curr_sum = _mm256_setzero_si256();

        while offset < total {
            let curr = _mm256_load_epi32(p_src.offset(offset));
            curr_sum = _mm256_add_epi32(curr_sum, curr);
            offset += 8;
        }

        // this can be reduced with hadd.
        let a0 = _mm256_extract_epi32::<0>(curr_sum);
        let a1 = _mm256_extract_epi32::<1>(curr_sum);
        let a2 = _mm256_extract_epi32::<2>(curr_sum);
        let a3 = _mm256_extract_epi32::<3>(curr_sum);
        let a4 = _mm256_extract_epi32::<4>(curr_sum);
        let a5 = _mm256_extract_epi32::<5>(curr_sum);
        let a6 = _mm256_extract_epi32::<6>(curr_sum);
        let a7 = _mm256_extract_epi32::<7>(curr_sum);

        result = a0 + a1 + a2 + a3 + a4 + a5 + a6 + a7;
    }
    result
}
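For reference, the per-lane extraction at the end could be replaced by the hadd reduction the comment mentions; an untested sketch:
use std::arch::x86_64::*;

// Untested sketch of the hadd-based reduction mentioned in the comment above;
// it would replace the eight _mm256_extract_epi32 calls.
unsafe fn hsum_epi32(v: __m256i) -> i32 {
    let h1 = _mm256_hadd_epi32(v, v);            // pairwise sums within each 128-bit lane
    let h2 = _mm256_hadd_epi32(h1, h1);          // each 128-bit lane now holds its own 4-element sum
    let lo = _mm256_castsi256_si128(h2);         // sum of elements 0..=3
    let hi = _mm256_extracti128_si256::<1>(h2);  // sum of elements 4..=7
    _mm_cvtsi128_si32(_mm_add_epi32(lo, hi))
}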
When I benchmarked the code, the first example got ~23 GB/s (which is close to the theoretical maximum for my RAM speed), while the second example got 8 GB/s.
Looking at the assembly with cargo asm, the first example translates into an unrolled, SIMD-optimized loop:
.LBB11_7:
sum += *c;
        movdqu  xmm2, xmmword ptr [rcx + 4*rax]
        paddd   xmm2, xmm0
        movdqu  xmm0, xmmword ptr [rcx + 4*rax + 16]
        paddd   xmm0, xmm1
        movdqu  xmm1, xmmword ptr [rcx + 4*rax + 32]
        movdqu  xmm3, xmmword ptr [rcx + 4*rax + 48]
        movdqu  xmm4, xmmword ptr [rcx + 4*rax + 64]
        paddd   xmm4, xmm1
        paddd   xmm4, xmm2
        movdqu  xmm2, xmmword ptr [rcx + 4*rax + 80]
        paddd   xmm2, xmm3
        paddd   xmm2, xmm0
        movdqu  xmm0, xmmword ptr [rcx + 4*rax + 96]
        paddd   xmm0, xmm4
        movdqu  xmm1, xmmword ptr [rcx + 4*rax + 112]
        paddd   xmm1, xmm2
        add     rax, 32
        add     r11, -4
        jne     .LBB11_7
.LBB11_8:
        test    r10, r10
        je      .LBB11_11
        lea     r11, [rcx + 4*rax]
        add     r11, 16
        shl     r10, 5
        xor     eax, eax
The second example has no loop unrolling and doesn't even inline the call to _mm256_add_epi32:
...
        movaps  xmmword ptr [rbp + 320], xmm7
        movaps  xmmword ptr [rbp + 304], xmm6
        and     rsp, -32
        mov     r12, rdx
        mov     rdi, rcx
        lea     rcx, [rsp + 32]
let mut curr_sum = _mm256_setzero_si256();
        call    core::core_arch::x86::avx::_mm256_setzero_si256
        movaps  xmm6, xmmword ptr [rsp + 32]
        movaps  xmm7, xmmword ptr [rsp + 48]
while offset < total {
        test    r12, r12
        jle     .LBB13_3
        xor     esi, esi
        lea     rbx, [rsp + 384]
        lea     r14, [rsp + 64]
        lea     r15, [rsp + 96]
.LBB13_2:
let curr = _mm256_load_epi32(p_src.offset(offset));
        mov     rcx, rbx
        mov     rdx, rdi
        call    core::core_arch::x86::avx512f::_mm256_load_epi32
curr_sum = _mm256_add_epi32(curr_sum, curr);
        movaps  xmmword ptr [rsp + 112], xmm7
        movaps  xmmword ptr [rsp + 96], xmm6
        mov     rcx, r14
        mov     rdx, r15
        mov     r8, rbx
        call    core::core_arch::x86::avx2::_mm256_add_epi32
        movaps  xmm6, xmmword ptr [rsp + 64]
        movaps  xmm7, xmmword ptr [rsp + 80]
offset += 8;
        add     rsi, 8
while offset < total {
        add     rdi, 32
        cmp     rsi, r12
...
This is of course a pretty trivial example, and I don't plan to use hand-crafted SIMD for a simple sum. But it still puzzles me why the explicit SIMD is so slow and why using SIMD intrinsics produced such unoptimized code.
It appears you forgot to tell rustc it was allowed to use AVX2 instructions everywhere, so it couldn't inline those functions. Instead, you get a total disaster where only the wrapper functions are compiled as AVX2-using functions, or something like that.
Works fine for me with -O -C target-cpu=skylake-avx512 (https://godbolt.org/z/csY5or43T), so it can inline even the AVX512VL load you used, _mm256_load_epi32 (footnote 1), and then optimize it into a memory-source operand for vpaddd ymm0, ymm0, ymmword ptr [rdi + 4*rax] (AVX2) inside a tight loop.
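(If you build with Cargo instead of invoking rustc directly, one way to pass the equivalent codegen flag is through RUSTFLAGS; for example:)

RUSTFLAGS="-C target-cpu=skylake-avx512" cargo build --release
# or, to target whatever CPU the build machine has:
RUSTFLAGS="-C target-cpu=native" cargo build --release

(cargo build --release already enables optimizations, which is what -O does for a bare rustc invocation.)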
In GCC / clang, you get an error like "inlining failed in call to always_inline foobar" in this case, instead of working-but-slow asm. (See this for details.) This is something Rust should probably sort out before this is ready for prime time: either be like MSVC and actually inline the instruction into a function using the intrinsic, or refuse to compile like GCC/clang.
Footnote 1: See How to emulate _mm256_loadu_epi32 with gcc or clang? if you didn't mean to use AVX512.
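Until that's sorted out, a workaround that doesn't need a crate-wide -C target-cpu flag is to enable the feature on just the function that uses the intrinsics with #[target_feature] and check CPU support at runtime. A rough, untested sketch (the function names are mine, and it uses the AVX loadu/storeu intrinsics rather than the AVX-512VL load from the question):

use std::arch::x86_64::*;

// Compiled with AVX2 enabled regardless of the crate-wide target features,
// so the AVX2 intrinsic wrappers can be inlined into it.
#[target_feature(enable = "avx2")]
unsafe fn sum_avx2(src: &[i32]) -> i32 {
    let mut acc = _mm256_setzero_si256();
    let mut chunks = src.chunks_exact(8);
    for chunk in &mut chunks {
        // unaligned AVX load: no AVX-512 feature required
        let v = _mm256_loadu_si256(chunk.as_ptr() as *const __m256i);
        acc = _mm256_add_epi32(acc, v);
    }
    // spill the accumulator and sum its 8 lanes, plus any leftover tail elements
    let mut lanes = [0i32; 8];
    _mm256_storeu_si256(lanes.as_mut_ptr() as *mut __m256i, acc);
    lanes.iter().sum::<i32>() + chunks.remainder().iter().sum::<i32>()
}

pub fn vec_sum_dispatch(src: &[i32]) -> i32 {
    if is_x86_feature_detected!("avx2") {
        unsafe { sum_avx2(src) }
    } else {
        src.iter().sum()
    }
}

That keeps the rest of the crate compiled for the baseline x86-64 feature set while still letting the intrinsics inline inside sum_avx2.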
With -O -C target-cpu=skylake (just AVX2), it inlines everything else, including vpaddd ymm, but still calls out to a function that copies 32 bytes from memory to memory with AVX vmovaps. rustc requires AVX512VL to inline that intrinsic, but later in the optimization process it realizes that with no masking it's just a 256-bit load that should be done without a bloated AVX-512 instruction. It's kinda dumb that Intel even provided a no-masking version of _mm256_mask[z]_loadu_epi32 that requires AVX-512. Or dumb that gcc/clang/rustc consider it an AVX512 intrinsic.
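In the Rust source itself, one way to sidestep the AVX-512 requirement is to express the same 32-byte load with the plain si256 intrinsics; a minimal sketch (the helper name is mine):

use std::arch::x86_64::*;

// The same 256-bit load via AVX-era intrinsics, which rustc can inline with
// just -C target-cpu=skylake or -C target-feature=+avx2.
// Caller must guarantee `p` points at 8 valid i32s; _mm256_load_si256 also
// requires 32-byte alignment, while _mm256_loadu_si256 does not.
unsafe fn load_8_i32(p: *const i32) -> __m256i {
    _mm256_load_si256(p as *const __m256i)
    // or: _mm256_loadu_si256(p as *const __m256i)
}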