Tags: c++, optimization, simd, avx, avx2

Efficiently load/compare/pack 64 double comparison results into a uint64_t bitmask


I want to load, compare, and pack the results of 64 double comparisons into a uint64_t bitmask as efficiently as possible at runtime.

My current approach compares two groups of 4 pairs via AVX2 using _mm256_cmp_pd. The results of both (= 8) comparisons are converted into bitmasks using _mm256_movemask_pd and assigned via a | b << 4 as a byte into a union (1x uint64_t / 8x uint8_t) to save a few shifts/ors.

This example might help to visualize it:

#include <cstdint>
#include <immintrin.h>

union ui64 {
    uint64_t i64;
    struct { uint8_t i0,i1,i2,i3,i4,i5,i6,i7; } i8;
};
static inline uint64_t cmp64 (double* in0, double* in1) {
    ui64 result;
    // +0
    result.i8.i0 =
        _mm256_movemask_pd(
            _mm256_cmp_pd(
            _mm256_load_pd(in0 + 0), 
            _mm256_load_pd(in1 + 0), 
            _CMP_LT_OQ)) |
        _mm256_movemask_pd(
            _mm256_cmp_pd(
                _mm256_load_pd(in0 + 4), 
                _mm256_load_pd(in1 + 4),
                _CMP_LT_OQ)) << 4;

    // +8
    // load, compare, pack n+8 each 'iteration' using result.i8.i1,...i7
    // ...

    return result.i64;
}
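
For reference, the elided part repeats the same pattern for each subsequent byte. Written as a loop, a minimal sketch (assuming in0/in1 point to 64 doubles aligned to 32 bytes, and that the compiler unrolls it) looks like this:

static inline uint64_t cmp64_loop (const double* in0, const double* in1) {
    uint64_t result = 0;
    for (int i = 0; i < 8; ++i) {
        // low nibble: comparisons i*8 .. i*8+3
        uint64_t lo = (uint64_t)_mm256_movemask_pd(
            _mm256_cmp_pd(_mm256_load_pd(in0 + i * 8),
                          _mm256_load_pd(in1 + i * 8), _CMP_LT_OQ));
        // high nibble: comparisons i*8+4 .. i*8+7
        uint64_t hi = (uint64_t)_mm256_movemask_pd(
            _mm256_cmp_pd(_mm256_load_pd(in0 + i * 8 + 4),
                          _mm256_load_pd(in1 + i * 8 + 4), _CMP_LT_OQ));
        // place the byte at its position in the 64-bit mask
        result |= (lo | (hi << 4)) << (i * 8);
    }
    return result;
}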

Variants of compress & set appear straightforward, but use slower instructions: 1x _mm256_set_m128 and 2x _mm256_cvtpd_ps versus a scalar << and |, like this:

_mm256_movemask_ps(_mm256_set_m128(
    _mm256_cvtpd_ps(v0),
    _mm256_cvtpd_ps(v1)));

The CPU in use is a Zen 1 (AVX2 at most); I'm not sure whether GPU usage (Nvidia) is an option.

Please share your thoughts.


Solution

  • Here’s an example. It packs the comparison results into bytes with whatever instructions were the most efficient, shuffles them into the correct order once per 32 numbers, and uses _mm256_movemask_epi8 to produce 32 bits at once.

    #include <cstdint>
    #include <immintrin.h>

    // Compare 4 numbers, return 32 bytes with results of 4 comparisons:
    // 00000000 11111111 22222222 33333333
    inline __m256d compare4( const double* a, const double* b )
    {
        return _mm256_cmp_pd( _mm256_load_pd( a ), _mm256_load_pd( b ), _CMP_LT_OQ );
    }
    
    // Compare 8 numbers, combine 8 results into the following bytes:
    // 0000 4444 1111 5555  2222 6666 3333 7777
    inline __m256i compare8( const double* a, const double* b )
    {
        __m256 c0 = _mm256_castpd_ps( compare4( a, b ) );
        __m256 c1 = _mm256_castpd_ps( compare4( a + 4, b + 4 ) );
        return _mm256_castps_si256( _mm256_blend_ps( c0, c1, 0b10101010 ) );
    }
    
    // Compare 16 numbers, combine 16 results into following bytes:
    // 00 44 11 55  88 CC 99 DD  22 66 33 77  AA EE BB FF
    inline __m256i compare16( const double* a, const double* b )
    {
        __m256i c0 = compare8( a, b );
        __m256i c1 = compare8( a + 8, b + 8 );
        return _mm256_packs_epi32( c0, c1 );
    }
    
    inline uint32_t compare32( const double* a, const double* b )
    {
        // Compare 32 numbers and merge them into a single vector
        __m256i c0 = compare16( a, b );
        __m256i c1 = compare16( a + 16, b + 16 );
        __m256i src = _mm256_packs_epi16( c0, c1 );
    
        // We got the 32 bytes, but the byte order is screwed up in that vector:
        // 0   4   1   5   8   12  9   13  16  20  17  21  24  28  25  29
        // 2   6   3   7   10  14  11  15  18  22  19  23  26  30  27  31
        // The following 2 instructions are fixing the order
    
        // Shuffle 8-byte pieces across the complete vector
        // That instruction is relatively expensive on most CPUs, but we only do it once per 32 numbers
        src = _mm256_permute4x64_epi64( src, _MM_SHUFFLE( 3, 1, 2, 0 ) );
    
        // The order of bytes in the vector is still wrong:
        // 0    4   1   5   8  12   9  13    2   6   3   7  10  14  11  15
        // 16  20  17  21  24  28  25  29   18  22  19  23  26  30  27  31
        // Need one more shuffle instruction
    
        const __m128i perm16 = _mm_setr_epi8(
            0, 2, 8, 10,  1, 3, 9, 11,   4, 6, 12, 14,  5, 7, 13, 15 );
        // If you're calling this in a loop and everything is inlined,
        // the shuffling vector should be pre-loaded outside of the loop.
        const __m256i perm = _mm256_broadcastsi128_si256( perm16 );
    
        // Shuffle the bytes
        src = _mm256_shuffle_epi8( src, perm );
    
        // The order of bytes is now correct, so we can use _mm256_movemask_epi8 to produce 32 bits of the result
        return (uint32_t)_mm256_movemask_epi8( src );
    }
    
    
    uint64_t compareAndPack64( const double* a, const double* b )
    {
        uint64_t low = compare32( a, b );
        uint64_t high = compare32( a + 32, b + 32 );
        return low | ( high << 32 );
    }
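
    To sanity-check this against a scalar reference, a small test like the following could be used (the buffers, values, and main function here are illustrative only; note that the _mm256_load_pd calls above require the inputs to be 32-byte aligned):

    #include <cstdio>

    int main()
    {
        alignas( 32 ) double a[ 64 ], b[ 64 ];
        for( int i = 0; i < 64; i++ )
        {
            a[ i ] = (double)( i % 7 );
            b[ i ] = (double)( i % 5 );
        }

        // Scalar reference: bit i is set when a[ i ] < b[ i ]
        uint64_t expected = 0;
        for( int i = 0; i < 64; i++ )
            if( a[ i ] < b[ i ] )
                expected |= uint64_t( 1 ) << i;

        const uint64_t packed = compareAndPack64( a, b );
        printf( "%s\n", packed == expected ? "OK" : "mismatch" );
        return 0;
    }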