SSE Intrinsics and loop unrolling

I am attempting to optimise some loops and I have managed but I wonder if I have only done it partially correct. Say for example that I have this loop:

for(i=0;i<n;i++){
b[i] = a[i]*2;
}

unrolling this by a factor of 3, produces this:

int unroll = (n/4)*4;
for(i=0;i<unroll;i+=4)
{
b[i] = a[i]*2;
b[i+1] = a[i+1]*2;
b[i+2] = a[i+2]*2;
b[i+3] = a[i+3]*2;
}
for(;i<n;i++)
{
b[i] = a[i]*2;
}

Now is the SSE translation equivalent:

__m128 ai_v = _mm_loadu_ps(&a[i]);
__m128 two_v = _mm_set1_ps(2);
__m128 ai2_v = _mm_mul_ps(ai_v, two_v);
_mm_storeu_ps(&b[i], ai2_v);

or is it:

__m128 ai_v = _mm_loadu_ps(&a[i]);
__m128 two_v = _mm_set1_ps(2);
__m128 ai2_v = _mm_mul_ps(ai_v, two_v);
_mm_storeu_ps(&b[i], ai2_v);

__m128 ai1_v = _mm_loadu_ps(&a[i+1]);
__m128 two1_v = _mm_set1_ps(2);
__m128 ai_1_2_v = _mm_mul_ps(ai1_v, two1_v);
_mm_storeu_ps(&b[i+1], ai_1_2_v);

__m128 ai2_v = _mm_loadu_ps(&a[i+2]);
__m128 two2_v = _mm_set1_ps(2);
__m128 ai_2_2_v = _mm_mul_ps(ai2_v, two2_v);
_mm_storeu_ps(&b[i+2], ai_2_2_v);

__m128 ai3_v = _mm_loadu_ps(&a[i+3]);
__m128 two3_v = _mm_set1_ps(2);
__m128 ai_3_2_v = _mm_mul_ps(ai3_v, two3_v);
_mm_storeu_ps(&b[i+3], ai_3_2_v);

I am slightly confused about the section of code:

for(;i<n;i++)
{
b[i] = a[i]*2;
}

what does this do? Is it just to do the extra parts for example if the loop is not dividable by the factor you choose to unroll it by? Thank you.

Solution

The answer is the first block:

    __m128 ai_v = _mm_loadu_ps(&a[i]);
    __m128 two_v = _mm_set1_ps(2);
    __m128 ai2_v = _mm_mul_ps(ai_v,two_v);
    _mm_storeu_ps(&b[i],ai2_v);

It already takes four variables at a time.

Here is the full program with the equivalent section of code commented out:

#include <iostream>

int main()
{
    int i{0};
    float a[10] ={1,2,3,4,5,6,7,8,9,10};
    float b[10] ={0,0,0,0,0,0,0,0,0,0};

    int n = 10;
    int unroll = (n/4)*4;
    for (i=0; i<unroll; i+=4) {
        //b[i] = a[i]*2;
        //b[i+1] = a[i+1]*2;
        //b[i+2] = a[i+2]*2;
        //b[i+3] = a[i+3]*2;
        __m128 ai_v = _mm_loadu_ps(&a[i]);
        __m128 two_v = _mm_set1_ps(2);
        __m128 ai2_v = _mm_mul_ps(ai_v,two_v);
        _mm_storeu_ps(&b[i],ai2_v);
    }

    for (; i<n; i++) {
        b[i] = a[i]*2;
    }

    for (auto i : a) { std::cout << i << "\t"; }
    std::cout << "\n";
    for (auto i : b) { std::cout << i << "\t"; }
    std::cout << "\n";

    return 0;
}

As for efficiency; it seems that the assembly on my system generates movups instructions, whereas the hand rolled code could be made to use movaps which should be faster.

I used the following program to do some benchmarks:

#include <iostream>
//#define NO_UNROLL
//#define UNROLL
//#define SSE_UNROLL
#define SSE_UNROLL_ALIGNED

int main()
{
    const size_t array_size = 100003;
#ifdef SSE_UNROLL_ALIGNED
    __declspec(align(16)) int i{0};
    __declspec(align(16)) float a[array_size] ={1,2,3,4,5,6,7,8,9,10};
    __declspec(align(16)) float b[array_size] ={0,0,0,0,0,0,0,0,0,0};
#endif
#ifndef SSE_UNROLL_ALIGNED
    int i{0};
    float a[array_size] ={1,2,3,4,5,6,7,8,9,10};
    float b[array_size] ={0,0,0,0,0,0,0,0,0,0};
#endif

    int n = array_size;
    int unroll = (n/4)*4;


    for (size_t j{0}; j < 100000; ++j) {
#ifdef NO_UNROLL
        for (i=0; i<n; i++) {
            b[i] = a[i]*2;
        }
#endif
#ifdef UNROLL
        for (i=0; i<unroll; i+=4) {
            b[i] = a[i]*2;
            b[i+1] = a[i+1]*2;
            b[i+2] = a[i+2]*2;
            b[i+3] = a[i+3]*2;
        }
#endif
#ifdef SSE_UNROLL
        for (i=0; i<unroll; i+=4) {
            __m128 ai_v = _mm_loadu_ps(&a[i]);
            __m128 two_v = _mm_set1_ps(2);
            __m128 ai2_v = _mm_mul_ps(ai_v,two_v);
            _mm_storeu_ps(&b[i],ai2_v);
        }
#endif
#ifdef SSE_UNROLL_ALIGNED
        for (i=0; i<unroll; i+=4) {
            __m128 ai_v = _mm_load_ps(&a[i]);
            __m128 two_v = _mm_set1_ps(2);
            __m128 ai2_v = _mm_mul_ps(ai_v,two_v);
            _mm_store_ps(&b[i],ai2_v);
        }
#endif
#ifndef NO_UNROLL
        for (; i<n; i++) {
            b[i] = a[i]*2;
        }
#endif
    }

    //for (auto i : a) { std::cout << i << "\t"; }
    //std::cout << "\n";
    //for (auto i : b) { std::cout << i << "\t"; }
    //std::cout << "\n";

    return 0;
}

I got the following results (x86):

NO_UNROLL: 0.994 seconds, no SSE chosen by compiler
UNROLL: 3.511 seconds, uses movups
SSE_UNROLL: 3.315 seconds, uses movups
SSE_UNROLL_ALIGNED: 3.276 seconds, uses movaps

So it is clear that unrolling the loop has not helped in this case. Even ensuring that we use the more efficient movaps doesn't help much.

But I got an even stranger result when compiling to 64 bit (x64):

NO_UNROLL: 1.138 seconds, no SSE chosen by compiler
UNROLL: 1.409 seconds, no SSE chosen by compiler
SSE_UNROLL: 1.420 seconds, still no SSE chosen by compiler!
SSE_UNROLL_ALIGNED: 1.476 seconds, still no SSE chosen by compiler!

It seems MSVC sees through the proposal and generates better assembly regardless, albeit still slower than had we not tried any hand optimization at all.