I'm optimizing a hot path in my codebase and i have turned to vectorization. Keep in mind, I'm still quite new to all of this SIMD stuff. Here is the problem I'm trying to solve, implemented using non-SIMD
inline int count_unique(int c1, int c2, int c3, int c4)
{
return 4 - (c2 == c1)
- ((c3 == c1) || (c3 == c2))
- ((c4 == c1) || (c4 == c2) || (c4 == c3));
}
the assembly output after compiling with -O3
:
count_unique:
xor eax, eax
cmp esi, edi
mov r8d, edx
setne al
add eax, 3
cmp edi, edx
sete dl
cmp esi, r8d
sete r9b
or edx, r9d
movzx edx, dl
sub eax, edx
cmp edi, ecx
sete dl
cmp r8d, ecx
sete dil
or edx, edi
cmp esi, ecx
sete cl
or edx, ecx
movzx edx, dl
sub eax, edx
ret
How would something like this be done when storing c1,c2,c3,c4 as a 16byte integer vector?
For your simplified problem (test all 4 lanes for equality), I would do it slightly differently, here’s how. This way it only takes 3 instructions for the complete test.
// True when the input vector has the same value in all 32-bit lanes
inline bool isSameValue( __m128i v )
{
// Rotate vector by 4 bytes
__m128i v2 = _mm_shuffle_epi32( v, _MM_SHUFFLE( 0, 3, 2, 1 ) );
// The XOR outputs zero for equal bits, 1 for different bits
__m128i xx = _mm_xor_si128( v, v2 );
// Use PTEST instruction from SSE 4.1 set to test the complete vector for all zeros
return (bool)_mm_testz_si128( xx, xx );
}