How do you handle indivisible vector lengths with SIMD intrinsics, array not a multiple of vector width?

I am currently learning how to work with SIMD intrinsics. I know that an AVX 256-bit vector can contain four doubles, eight floats, or eight 32-bit integers. How do we use AVX to process arrays that aren't a multiple of these numbers.

For example, how would you add two std::vectors of 53 integers each? Would we slice as many of the vector that would fit in the SIMD vector and just manually process the remainder? Is there a better way to do this?

Solution

Would we slice as many of the vector that would fit in the SIMD vector and just manually process the remainder? Is there a better way to do this?

Pretty much this. A basic example that processes all number in batches of 8, and uses mask load/maskstore to handle the remainder.

void add(int* const r, const int* const a, const int* const b, const unsigned count) {

    // how many blocks of 8, and how many left over
    const unsigned c8 = count & ~0x7U;
    const unsigned cr = count & 0x7U;

    // process blocks of 8
    for(unsigned i = 0; i < c8; i += 8) {
        __m256i _a = _mm256_loadu_si256((__m256i*)(a + i));
        __m256i _b = _mm256_loadu_si256((__m256i*)(b + i));
        __m256i _c = _mm256_add_epi32(_a, _b);
        _mm256_storeu_si256((__m256i*)(c + i), _c);
    }

    const __m128i temp[5] = {
        _mm_setr_epi32(0, 0, 0, 0),
        _mm_setr_epi32(-1, 0, 0, 0),
        _mm_setr_epi32(-1, -1, 0, 0),
        _mm_setr_epi32(-1, -1, -1, 0),
        _mm_setr_epi32(-1, -1, -1, -1)
    };

    // I'm using mask load / mask store for the remainder here. 
    // (this is not the only approach)
    __m256i mask;
    if(cr >= 4) { 
        mask = _mm256_set_m128i(temp[cr&3], temp[4]);
    } else {
        mask = _mm256_set_m128i(temp[0], temp[cr]);
    }
    __m256i _a = _mm256_maskload_epi32((a + c8), mask);
    __m256i _b = _mm256_maskload_epi32((b + c8), mask);
    __m256i _c = _mm256_add_epi32(_a, _b);
    _mm256_maskstore_epi32((c + c8), mask, _c);
}

Of course, if you happen to use your own containers (or provide your own allocators), then you can avoid most of this mess by simply ensuring all container allocations occur in multiples of 256bits.

// yes, this class is missing a lot... 
class MyIntArray {
public:

   MyIntArray(unsigned count, const int* data) {
      // bump capacity to next multiple of 8
      unsigned cap = count & 7;
      if(cap) cap = 8 - cap;
      capacity = cap + count;
      // allocation is aligned to 256bit
      alloc = new int[capacity];
      size = count;
      memcpy(alloc, data, sizeof(int) * size);
   }

   MyIntArray(unsigned count) {
      // bump capacity to next multiple of 8
      unsigned cap = count & 7;
      if(cap) cap = 8 - cap;
      capacity = cap + count;
      // allocation is aligned to 256bit
      alloc = new int[capacity];
      size = count;
   }

   unsigned capacity;
   unsigned size;
   int* alloc;

   int* begin() { return alloc; }
   int* end() { return alloc + size; }
   const int* begin() const { return alloc; }
   const int* end() const { return alloc + size; }
};

void add(MyIntArray r, const MyIntArray a, const MyIntArray b) {

    // process blocks of 8.
    // we may be stamping beyond the end of the array, but not over the 
    // the end of the capacity allocation....
    // (probably also want to check to see if the sizes match!).
    for(unsigned i = 0; i < r.size; i += 8) {
        __m256i _a = _mm256_loadu_si256((__m256i*)(a.alloc + i));
        __m256i _b = _mm256_loadu_si256((__m256i*)(b.alloc + i));
        __m256i _c = _mm256_add_epi32(_a, _b);
        _mm256_storeu_si256((__m256i*)(c.alloc + i), _c);
    }
}