Tags: c++, optimization, simd, avx, avx2

Efficiently load/compare/pack 64 double comparison results into a uint64_t bitmask


I want to load, compare, and pack the results of 64 double comparisons into a uint64_t bitmask as efficiently as possible at runtime.

My current approach compares two groups of 4 pairs via AVX2 using _mm256_cmp_pd. The results of both (= 8) comparisons are converted into bitmasks using _mm256_movemask_pd and assigned via a | b << 4 as a byte into a union (1x uint64_t / 8x uint8_t) to save a few shifts/ors.

This example might help to visualize it:

#include <cstdint>
#include <immintrin.h>

union ui64 {
    uint64_t i64;
    struct { uint8_t i0,i1,i2,i3,i4,i5,i6,i7; } i8;
};
static inline uint64_t cmp64 (double* in0, double* in1) {
    ui64 result;
    // +0
    result.i8.i0 =
        _mm256_movemask_pd(
            _mm256_cmp_pd(
            _mm256_load_pd(in0 + 0), 
            _mm256_load_pd(in1 + 0), 
            _CMP_LT_OQ)) |
        _mm256_movemask_pd(
            _mm256_cmp_pd(
                _mm256_load_pd(in0 + 4), 
                _mm256_load_pd(in1 + 4),
                _CMP_LT_OQ)) << 4;

    // +8
    // load, compare, pack n+8 each 'iteration' using result.i8.i1,...i7
    // ...

    return result.i64;
}
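
For reference, the elided part repeats the same pattern for each subsequent byte. Written as a loop, a minimal sketch (assuming in0/in1 point to 64 doubles aligned to 32 bytes, and that the compiler unrolls it) looks like this:

static inline uint64_t cmp64_loop (const double* in0, const double* in1) {
    uint64_t result = 0;
    for (int i = 0; i < 8; ++i) {
        // low nibble: comparisons i*8 .. i*8+3
        uint64_t lo = (uint64_t)_mm256_movemask_pd(
            _mm256_cmp_pd(_mm256_load_pd(in0 + i * 8),
                          _mm256_load_pd(in1 + i * 8), _CMP_LT_OQ));
        // high nibble: comparisons i*8+4 .. i*8+7
        uint64_t hi = (uint64_t)_mm256_movemask_pd(
            _mm256_cmp_pd(_mm256_load_pd(in0 + i * 8 + 4),
                          _mm256_load_pd(in1 + i * 8 + 4), _CMP_LT_OQ));
        // place the byte at its position in the 64-bit mask
        result |= (lo | (hi << 4)) << (i * 8);
    }
    return result;
}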

Variants of compress & set appear straightforward, but use slower instructions: 1x _mm256_set_m128 and 2x _mm256_cvtpd_ps versus a scalar << and |, like this:

_mm256_movemask_ps(_mm256_set_m128(
    _mm256_cvtpd_ps(v0),
    _mm256_cvtpd_ps(v1)));

The CPU in use is a Zen 1 (AVX2 at most); I'm not sure whether GPU usage (Nvidia) is an option.

Please share your thoughts.


Solution

  • Here’s an example. It packs the comparison results into bytes with whatever instructions were the most efficient, shuffles them into the correct order once per 32 numbers, and uses _mm256_movemask_epi8 to produce 32 bits at once.

    #include <cstdint>
    #include <immintrin.h>

    // Compare 4 numbers, return 32 bytes with results of 4 comparisons:
    // 00000000 11111111 22222222 33333333
    inline __m256d compare4( const double* a, const double* b )
    {
        return _mm256_cmp_pd( _mm256_load_pd( a ), _mm256_load_pd( b ), _CMP_LT_OQ );
    }
    
    // Compare 8 numbers, combine 8 results into the following bytes:
    // 0000 4444 1111 5555  2222 6666 3333 7777
    inline __m256i compare8( const double* a, const double* b )
    {
        __m256 c0 = _mm256_castpd_ps( compare4( a, b ) );
        __m256 c1 = _mm256_castpd_ps( compare4( a + 4, b + 4 ) );
        return _mm256_castps_si256( _mm256_blend_ps( c0, c1, 0b10101010 ) );
    }
    
    // Compare 16 numbers, combine 16 results into following bytes:
    // 00 44 11 55  88 CC 99 DD  22 66 33 77  AA EE BB FF
    inline __m256i compare16( const double* a, const double* b )
    {
        __m256i c0 = compare8( a, b );
        __m256i c1 = compare8( a + 8, b + 8 );
        return _mm256_packs_epi32( c0, c1 );
    }
    
    inline uint32_t compare32( const double* a, const double* b )
    {
        // Compare 32 numbers and merge them into a single vector
        __m256i c0 = compare16( a, b );
        __m256i c1 = compare16( a + 16, b + 16 );
        __m256i src = _mm256_packs_epi16( c0, c1 );
    
        // We got the 32 bytes, but the byte order is screwed up in that vector:
        // 0   4   1   5   8   12  9   13  16  20  17  21  24  28  25  29
        // 2   6   3   7   10  14  11  15  18  22  19  23  26  30  27  31
        // The following 2 instructions are fixing the order
    
        // Shuffle 8-byte pieces across the complete vector
        // That instruction is relatively expensive on most CPUs, but we only do it once per 32 numbers
        src = _mm256_permute4x64_epi64( src, _MM_SHUFFLE( 3, 1, 2, 0 ) );
    
        // The order of bytes in the vector is still wrong:
        // 0    4   1   5   8  12   9  13    2   6   3   7  10  14  11  15
        // 16  20  17  21  24  28  25  29   18  22  19  23  26  30  27  31
        // Need one more shuffle instruction
    
        const __m128i perm16 = _mm_setr_epi8(
            0, 2, 8, 10,  1, 3, 9, 11,   4, 6, 12, 14,  5, 7, 13, 15 );
        // If you're calling this in a loop and everything is inlined,
        // the shuffling vector should be pre-loaded outside of the loop.
        const __m256i perm = _mm256_broadcastsi128_si256( perm16 );
    
        // Shuffle the bytes
        src = _mm256_shuffle_epi8( src, perm );
    
        // The order of bytes is now correct, so we can use _mm256_movemask_epi8 to produce 32 bits of the result
        return (uint32_t)_mm256_movemask_epi8( src );
    }
    
    
    uint64_t compareAndPack64( const double* a, const double* b )
    {
        uint64_t low = compare32( a, b );
        uint64_t high = compare32( a + 32, b + 32 );
        return low | ( high << 32 );
    }
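
    To sanity-check this against a scalar reference, a small test like the following could be used (the buffers, values, and main function here are illustrative only; note that the _mm256_load_pd calls above require the inputs to be 32-byte aligned):

    #include <cstdio>

    int main()
    {
        alignas( 32 ) double a[ 64 ], b[ 64 ];
        for( int i = 0; i < 64; i++ )
        {
            a[ i ] = (double)( i % 7 );
            b[ i ] = (double)( i % 5 );
        }

        // Scalar reference: bit i is set when a[ i ] < b[ i ]
        uint64_t expected = 0;
        for( int i = 0; i < 64; i++ )
            if( a[ i ] < b[ i ] )
                expected |= uint64_t( 1 ) << i;

        const uint64_t packed = compareAndPack64( a, b );
        printf( "%s\n", packed == expected ? "OK" : "mismatch" );
        return 0;
    }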