I have a loop that adds int16s from two arrays together via _mm_add_epi16(). There is a small array and a large array; the results get written back to the large array.
The intrinsic may get fewer than 8 int16s (128 bits) from the small array if it has reached its end. How do I store the result of _mm_add_epi16() back into standard memory (int16_t*) when I don't want all of its 128 bits? Padding the array to a power-of-two size is not an option. Example:
int16_t* smallArray;
int16_t* largeArray;
__m128i inSmallArray = _mm_load_si128((__m128i*)smallArray);
__m128i* pInLargeArray = (__m128i*)largeArray;
__m128i inLargeArray = _mm_load_si128(pInLargeArray);
inLargeArray = _mm_add_epi16(inLargeArray, inSmallArray);
_mm_store_si128(pInLargeArray, inLargeArray);
My guess is that I need to replace _mm_store_si128() with a "masked" store somehow.
There is a _mm_maskmoveu_si128 intrinsic, which translates to maskmovdqu (in SSE2) or vmaskmovdqu (in AVX).
#include <cassert>     // assert
#include <cstdint>     // std::uint8_t and friends (used further below)
#include <cstring>     // std::memcpy (used further below)
#include <immintrin.h> // SSE/AVX/AVX-512 intrinsics

// Store masks. The highest bit in each byte indicates the byte to store.
alignas(16) const unsigned char masks[16][16] =
{
{ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 },
{ 0xFF, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 },
{ 0xFF, 0xFF, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 },
{ 0xFF, 0xFF, 0xFF, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 },
{ 0xFF, 0xFF, 0xFF, 0xFF, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 },
{ 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 },
{ 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 },
{ 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 },
{ 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 },
{ 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 },
{ 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 },
{ 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0x00, 0x00, 0x00, 0x00, 0x00 },
{ 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0x00, 0x00, 0x00, 0x00 },
{ 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0x00, 0x00, 0x00 },
{ 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0x00, 0x00 },
{ 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0x00 }
};
void store_n(__m128i mm, unsigned int n, void* storage)
{
assert(n < 16u);
_mm_maskmoveu_si128(mm, reinterpret_cast< const __m128i& >(masks[n]), static_cast< char* >(storage));
}
The problem with this code is that maskmovdqu (and, presumably, vmaskmovdqu) instructions have an associated hint for non-temporal access to the target memory, which makes them expensive and also requires a fence afterwards.
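For illustration, if you did use the maskmovdqu-based store_n above for the loop tail, the weakly-ordered store would normally be followed by a store fence before other code relies on the written data. A minimal sketch, where result, tailBytes and dst are placeholder names for the last partial sum vector, the number of remaining bytes and the destination pointer:
// Sketch: finishing the loop tail with the maskmovdqu-based store_n above.
// result, tailBytes and dst are placeholders introduced for this example.
store_n(result, tailBytes, dst);
_mm_sfence(); // make the weakly-ordered (non-temporal) store globally visible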
AVX adds new instructions vmaskmovps/vmaskmovpd (and AVX2 also adds vpmaskmovd/vpmaskmovq), which work similarly to vmaskmovdqu but do not have the non-temporal hint and only operate on 32- and 64-bit granularity.
// Store masks. The highest bit in each 32-bit element indicates the element to store.
alignas(16) const unsigned char masks[4][16] =
{
{ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 },
{ 0xFF, 0xFF, 0xFF, 0xFF, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 },
{ 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 },
{ 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0x00, 0x00, 0x00, 0x00 }
};
void store_n(__m128i mm, unsigned int n, void* storage)
{
assert(n < 4u);
_mm_maskstore_epi32(static_cast< int* >(storage), reinterpret_cast< const __m128i& >(masks[n]), mm);
}
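Since the question works in int16_t elements, the 32-bit granularity means a tail with an odd element count needs its last element stored separately. A possible sketch built on the _mm_maskstore_epi32-based store_n above (store_tail_epi16 and remaining are names introduced here for illustration):
// Sketch: applying the 32-bit-granularity store_n above to a tail of
// "remaining" int16_t elements (remaining < 8). Complete 32-bit pairs go
// through the masked store; an odd trailing element is stored manually.
void store_tail_epi16(__m128i mm, unsigned int remaining, int16_t* p)
{
    assert(remaining < 8u);
    store_n(mm, remaining / 2u, p);
    if (remaining & 1u)
    {
        alignas(16) int16_t tmp[8];
        _mm_store_si128(reinterpret_cast< __m128i* >(tmp), mm);
        p[remaining - 1u] = tmp[remaining - 1u];
    }
}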
AVX-512 adds masked stores, and you could use vmovdqu8/vmovdqu16 with an appropriate mask to store 8- or 16-bit elements.
void store_n(__m128i mm, unsigned int n, void* storage)
{
assert(n < 16u);
_mm_mask_storeu_epi8(storage, static_cast< __mmask16 >((1u << n) - 1u), mm);
}
Note that the above requires AVX-512BW and VL extensions.
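Because the question deals in int16_t elements, a variant that counts 16-bit lanes instead of bytes can use _mm_mask_storeu_epi16 (also AVX-512BW and VL). A minimal sketch, where store_n_epi16 is a name introduced here:
// Sketch: same idea as above, but n counts int16_t lanes rather than bytes.
void store_n_epi16(__m128i mm, unsigned int n, int16_t* storage)
{
    assert(n < 8u);
    _mm_mask_storeu_epi16(storage, static_cast< __mmask8 >((1u << n) - 1u), mm);
}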
If you require 8- or 16-bit granularity and don't have AVX-512, then you're better off with a function that manually stores the vector register piece by piece.
void store_n(__m128i mm, unsigned int n, void* storage)
{
assert(n < 16u);
unsigned char* p = static_cast< unsigned char* >(storage);
if (n >= 8u)
{
_mm_storel_epi64(reinterpret_cast< __m128i* >(p), mm);
mm = _mm_unpackhi_epi64(mm, mm); // move high 8 bytes to the low 8 bytes
n -= 8u;
p += 8;
}
if (n >= 4u)
{
std::uint32_t data = _mm_cvtsi128_si32(mm);
std::memcpy(p, &data, sizeof(data)); // typically generates movd
mm = _mm_srli_si128(mm, 4);
n -= 4u;
p += 4;
}
if (n >= 2u)
{
std::uint16_t data = _mm_extract_epi16(mm, 0); // or _mm_cvtsi128_si32
std::memcpy(p, &data, sizeof(data));
mm = _mm_srli_si128(mm, 2);
n -= 2u;
p += 2;
}
if (n > 0u)
{
std::uint32_t data = _mm_cvtsi128_si32(mm);
*p = static_cast< std::uint8_t >(data);
}
}
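Whichever store_n variant you pick, the tail handling in the original loop could then look roughly like the following sketch. add_arrays and smallSize are illustrative names (smallSize is assumed to be the number of int16_t elements in the small array), and the tail loads go through memcpy into zero-padded buffers so that nothing is read past the end of either array; the headers included at the top cover everything used here.
// Sketch of the whole loop: unaligned loads/stores for the bulk, and one of
// the byte-granularity store_n variants above for the tail.
void add_arrays(int16_t* largeArray, const int16_t* smallArray, std::size_t smallSize)
{
    std::size_t i = 0u;

    // Process full vectors of 8 int16_t elements.
    for (; i + 8u <= smallSize; i += 8u)
    {
        __m128i inSmallArray = _mm_loadu_si128(reinterpret_cast< const __m128i* >(smallArray + i));
        __m128i inLargeArray = _mm_loadu_si128(reinterpret_cast< const __m128i* >(largeArray + i));
        inLargeArray = _mm_add_epi16(inLargeArray, inSmallArray);
        _mm_storeu_si128(reinterpret_cast< __m128i* >(largeArray + i), inLargeArray);
    }

    // Tail: fewer than 8 elements remain. The loads are also partial here,
    // so copy the remaining elements into zero-padded buffers first.
    std::size_t remaining = smallSize - i;
    if (remaining > 0u)
    {
        alignas(16) int16_t bufSmall[8] = {};
        alignas(16) int16_t bufLarge[8] = {};
        std::memcpy(bufSmall, smallArray + i, remaining * sizeof(int16_t));
        std::memcpy(bufLarge, largeArray + i, remaining * sizeof(int16_t));

        __m128i sum = _mm_add_epi16(
            _mm_load_si128(reinterpret_cast< const __m128i* >(bufLarge)),
            _mm_load_si128(reinterpret_cast< const __m128i* >(bufSmall)));

        // n is in bytes for the byte-granularity store_n variants.
        store_n(sum, static_cast< unsigned int >(remaining * sizeof(int16_t)), largeArray + i);
    }
}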