Tags: c++, sse, simd, intrinsics, sse2

How to copy X bytes or bits from an __m128i into standard memory


I have a loop that adds int16_t values from two arrays together via _mm_add_epi16(). There is a small array and a large array; the results get written back to the large array. The intrinsic may get fewer than eight int16_t values (128 bits) from the small array if it has reached its end - how do I store the result of _mm_add_epi16() back into standard memory (an int16_t*) when I don't want all 128 bits of it? Padding the array to a power-of-two size is not an option. Example:

int16_t* smallArray;
int16_t* largeArray;
__m128i inSmallArray = _mm_load_si128((__m128i*)smallArray);
__m128i* pInLargeArray = (__m128i*)largeArray;
__m128i inLargeArray = _mm_load_si128(pInLargeArray);
inLargeArray = _mm_add_epi16(inLargeArray, inSmallArray);
_mm_store_si128(pInLargeArray, inLargeArray);

My guess is that I need to substitute _mm_store_si128() with a "masked" store somehow.


Solution

  • There is a _mm_maskmoveu_si128 intrinsic, which translates to maskmovdqu (in SSE2) or vmaskmovdqu (its VEX encoding in AVX).

    #include <cassert>
    #include <emmintrin.h> // SSE2 intrinsics

    // Store masks. The highest bit in each byte indicates the byte to store.
    alignas(16) const unsigned char masks[16][16] =
    {
        { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 },
        { 0xFF, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 },
        { 0xFF, 0xFF, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 },
        { 0xFF, 0xFF, 0xFF, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 },
        { 0xFF, 0xFF, 0xFF, 0xFF, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 },
        { 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 },
        { 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 },
        { 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 },
        { 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 },
        { 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 },
        { 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 },
        { 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0x00, 0x00, 0x00, 0x00, 0x00 },
        { 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0x00, 0x00, 0x00, 0x00 },
        { 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0x00, 0x00, 0x00 },
        { 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0x00, 0x00 },
        { 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0x00 }
    };
    
    void store_n(__m128i mm, unsigned int n, void* storage)
    {
        assert(n < 16u);
        _mm_maskmoveu_si128(mm, reinterpret_cast< const __m128i& >(masks[n]), static_cast< char* >(storage));
    }
    

    The problem with this code is that the maskmovdqu (and, presumably, vmaskmovdqu) instruction carries a non-temporal hint for the target memory, which makes it expensive (the stores bypass the cache) and also requires a store fence afterwards.
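
    As an illustration of that last point, here is a minimal, hedged sketch of where the fence would go, assuming the byte-granularity store_n above; store_tail is a made-up wrapper name, and in a real loop you would issue the fence once after the last masked store rather than per call:

    #include <emmintrin.h> // _mm_sfence comes via <xmmintrin.h>

    void store_tail(__m128i mm, unsigned int n, void* storage)
    {
        store_n(mm, n, storage); // maskmovdqu-based store_n from above
        _mm_sfence();            // order the weakly-ordered non-temporal writes
    }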

    AVX adds new instructions vmaskmovps/vmaskmovpd (and AVX2 also adds vpmaskmovd/vpmaskmovq), which work similarly to vmaskmovdqu but do not have the non-temporal hint and only operate at 32- and 64-bit granularity. The example below uses the AVX2 integer variant through _mm_maskstore_epi32.

    #include <cassert>
    #include <immintrin.h> // AVX2 intrinsics

    // Store masks. The highest bit in each 32-bit element indicates the element to store.
    alignas(16) const unsigned char masks[4][16] =
    {
        { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 },
        { 0xFF, 0xFF, 0xFF, 0xFF, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 },
        { 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 },
        { 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0x00, 0x00, 0x00, 0x00 }
    };
    
    void store_n(__m128i mm, unsigned int n, void* storage)
    {
        assert(n < 4u);
        _mm_maskstore_epi32(static_cast< int* >(storage), reinterpret_cast< const __m128i& >(masks[n]), mm);
    }
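
    Since that store only has 32-bit granularity, it covers whole pairs of int16_t elements; one way to adapt it to the question's int16_t tail is to mask-store the pairs and write a trailing odd element as a scalar. A rough sketch, assuming AVX2 and the store_n just above (store_n_epi16 is a hypothetical helper, and the scalar-fallback version later in this answer is usually the better choice without AVX-512):

    #include <cassert>
    #include <cstdint>
    #include <immintrin.h>

    void store_n_epi16(__m128i mm, unsigned int n, std::int16_t* storage)
    {
        assert(n < 8u);
        store_n(mm, n / 2u, storage); // store n/2 whole 32-bit elements (pairs of int16_t)
        if (n & 1u)
        {
            // Odd count: spill the register and copy the last requested element.
            alignas(16) std::int16_t tmp[8];
            _mm_store_si128(reinterpret_cast< __m128i* >(tmp), mm);
            storage[n - 1u] = tmp[n - 1u];
        }
    }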
    

    AVX-512 adds masked stores, and you could use vmovdqu8/vmovdqu16 with an appropriate mask to store 8- or 16-bit elements.

    #include <cassert>
    #include <immintrin.h> // AVX-512BW + VL intrinsics

    void store_n(__m128i mm, unsigned int n, void* storage)
    {
        assert(n < 16u);
        _mm_mask_storeu_epi8(storage, static_cast< __mmask16 >((1u << n) - 1u), mm);
    }
    

    Note that the above requires AVX-512BW and VL extensions.
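
    Since the question works in int16_t units, the 16-bit-element form of the same idea may read more naturally; a minimal sketch, again assuming AVX-512BW and VL, with n counted in int16_t elements (store_n_epi16 is a hypothetical name):

    #include <cassert>
    #include <immintrin.h>

    // Store the low n 16-bit elements of mm (n < 8). Requires AVX-512BW + VL.
    void store_n_epi16(__m128i mm, unsigned int n, void* storage)
    {
        assert(n < 8u);
        _mm_mask_storeu_epi16(storage, static_cast< __mmask8 >((1u << n) - 1u), mm);
    }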

    If you require 8- or 16-bit granularity and don't have AVX-512, then you're better off with a function that manually stores the vector register piece by piece.

    #include <cassert>
    #include <cstdint>
    #include <cstring>
    #include <emmintrin.h> // SSE2 intrinsics

    void store_n(__m128i mm, unsigned int n, void* storage)
    {
        assert(n < 16u);
    
        unsigned char* p = static_cast< unsigned char* >(storage);
        if (n >= 8u)
        {
            _mm_storel_epi64(reinterpret_cast< __m128i* >(p), mm);
            mm = _mm_unpackhi_epi64(mm, mm); // move high 8 bytes to the low 8 bytes
            n -= 8u;
            p += 8;
        }
    
        if (n >= 4u)
        {
            std::uint32_t data = _mm_cvtsi128_si32(mm);
            std::memcpy(p, &data, sizeof(data)); // typically generates movd
            mm = _mm_srli_si128(mm, 4);
            n -= 4u;
            p += 4;
        }
    
        if (n >= 2u)
        {
            std::uint16_t data = _mm_extract_epi16(mm, 0); // or _mm_cvtsi128_si32
            std::memcpy(p, &data, sizeof(data));
            mm = _mm_srli_si128(mm, 2);
            n -= 2u;
            p += 2;
        }
    
        if (n > 0u)
        {
            std::uint32_t data = _mm_cvtsi128_si32(mm);
            *p = static_cast< std::uint8_t >(data);
        }
    }
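
    Finally, a rough sketch of how the question's add loop could drive any of the byte-granularity store_n variants for the tail; this is an assumption about the surrounding code (function name, unaligned buffers, the padded-copy tail load) rather than the asker's actual loop, and n is passed to store_n as a byte count:

    #include <cstddef>
    #include <cstdint>
    #include <cstring>
    #include <emmintrin.h>

    void add_small_into_large(const std::int16_t* smallArray, std::size_t smallCount,
                              std::int16_t* largeArray)
    {
        std::size_t i = 0;

        // Full 8-element blocks: plain unaligned load/add/store.
        for (; i + 8 <= smallCount; i += 8)
        {
            __m128i a = _mm_loadu_si128(reinterpret_cast< const __m128i* >(smallArray + i));
            __m128i b = _mm_loadu_si128(reinterpret_cast< const __m128i* >(largeArray + i));
            _mm_storeu_si128(reinterpret_cast< __m128i* >(largeArray + i), _mm_add_epi16(a, b));
        }

        std::size_t rem = smallCount - i; // 0..7 int16_t elements left
        if (rem != 0)
        {
            // Copy the tails into zero-padded buffers so the 16-byte loads stay in bounds,
            // then write only the wanted bytes back with store_n.
            alignas(16) std::int16_t sa[8] = {};
            alignas(16) std::int16_t la[8] = {};
            std::memcpy(sa, smallArray + i, rem * sizeof(std::int16_t));
            std::memcpy(la, largeArray + i, rem * sizeof(std::int16_t));
            __m128i sum = _mm_add_epi16(_mm_load_si128(reinterpret_cast< const __m128i* >(sa)),
                                        _mm_load_si128(reinterpret_cast< const __m128i* >(la)));
            store_n(sum, static_cast< unsigned int >(rem * sizeof(std::int16_t)), largeArray + i);
        }
    }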