Search code examples
c++optimizationsseloop-unrolling

SSE Intrinsics and loop unrolling


I am attempting to optimise some loops and I have managed but I wonder if I have only done it partially correct. Say for example that I have this loop:

for(i=0;i<n;i++){
b[i] = a[i]*2;
}

unrolling this by a factor of 3, produces this:

int unroll = (n/4)*4;
for(i=0;i<unroll;i+=4)
{
b[i] = a[i]*2;
b[i+1] = a[i+1]*2;
b[i+2] = a[i+2]*2;
b[i+3] = a[i+3]*2;
}
for(;i<n;i++)
{
b[i] = a[i]*2;
} 

Now is the SSE translation equivalent:

__m128 ai_v = _mm_loadu_ps(&a[i]);
__m128 two_v = _mm_set1_ps(2);
__m128 ai2_v = _mm_mul_ps(ai_v, two_v);
_mm_storeu_ps(&b[i], ai2_v);

or is it:

__m128 ai_v = _mm_loadu_ps(&a[i]);
__m128 two_v = _mm_set1_ps(2);
__m128 ai2_v = _mm_mul_ps(ai_v, two_v);
_mm_storeu_ps(&b[i], ai2_v);

__m128 ai1_v = _mm_loadu_ps(&a[i+1]);
__m128 two1_v = _mm_set1_ps(2);
__m128 ai_1_2_v = _mm_mul_ps(ai1_v, two1_v);
_mm_storeu_ps(&b[i+1], ai_1_2_v);

__m128 ai2_v = _mm_loadu_ps(&a[i+2]);
__m128 two2_v = _mm_set1_ps(2);
__m128 ai_2_2_v = _mm_mul_ps(ai2_v, two2_v);
_mm_storeu_ps(&b[i+2], ai_2_2_v);

__m128 ai3_v = _mm_loadu_ps(&a[i+3]);
__m128 two3_v = _mm_set1_ps(2);
__m128 ai_3_2_v = _mm_mul_ps(ai3_v, two3_v);
_mm_storeu_ps(&b[i+3], ai_3_2_v);

I am slightly confused about the section of code:

for(;i<n;i++)
{
b[i] = a[i]*2;
}

what does this do? Is it just to do the extra parts for example if the loop is not dividable by the factor you choose to unroll it by? Thank you.


Solution

  • The answer is the first block:

        __m128 ai_v = _mm_loadu_ps(&a[i]);
        __m128 two_v = _mm_set1_ps(2);
        __m128 ai2_v = _mm_mul_ps(ai_v,two_v);
        _mm_storeu_ps(&b[i],ai2_v);
    

    It already takes four variables at a time.

    Here is the full program with the equivalent section of code commented out:

    #include <iostream>
    
    int main()
    {
        int i{0};
        float a[10] ={1,2,3,4,5,6,7,8,9,10};
        float b[10] ={0,0,0,0,0,0,0,0,0,0};
    
        int n = 10;
        int unroll = (n/4)*4;
        for (i=0; i<unroll; i+=4) {
            //b[i] = a[i]*2;
            //b[i+1] = a[i+1]*2;
            //b[i+2] = a[i+2]*2;
            //b[i+3] = a[i+3]*2;
            __m128 ai_v = _mm_loadu_ps(&a[i]);
            __m128 two_v = _mm_set1_ps(2);
            __m128 ai2_v = _mm_mul_ps(ai_v,two_v);
            _mm_storeu_ps(&b[i],ai2_v);
        }
    
        for (; i<n; i++) {
            b[i] = a[i]*2;
        }
    
        for (auto i : a) { std::cout << i << "\t"; }
        std::cout << "\n";
        for (auto i : b) { std::cout << i << "\t"; }
        std::cout << "\n";
    
        return 0;
    }
    

    As for efficiency; it seems that the assembly on my system generates movups instructions, whereas the hand rolled code could be made to use movaps which should be faster.

    I used the following program to do some benchmarks:

    #include <iostream>
    //#define NO_UNROLL
    //#define UNROLL
    //#define SSE_UNROLL
    #define SSE_UNROLL_ALIGNED
    
    int main()
    {
        const size_t array_size = 100003;
    #ifdef SSE_UNROLL_ALIGNED
        __declspec(align(16)) int i{0};
        __declspec(align(16)) float a[array_size] ={1,2,3,4,5,6,7,8,9,10};
        __declspec(align(16)) float b[array_size] ={0,0,0,0,0,0,0,0,0,0};
    #endif
    #ifndef SSE_UNROLL_ALIGNED
        int i{0};
        float a[array_size] ={1,2,3,4,5,6,7,8,9,10};
        float b[array_size] ={0,0,0,0,0,0,0,0,0,0};
    #endif
    
        int n = array_size;
        int unroll = (n/4)*4;
    
    
        for (size_t j{0}; j < 100000; ++j) {
    #ifdef NO_UNROLL
            for (i=0; i<n; i++) {
                b[i] = a[i]*2;
            }
    #endif
    #ifdef UNROLL
            for (i=0; i<unroll; i+=4) {
                b[i] = a[i]*2;
                b[i+1] = a[i+1]*2;
                b[i+2] = a[i+2]*2;
                b[i+3] = a[i+3]*2;
            }
    #endif
    #ifdef SSE_UNROLL
            for (i=0; i<unroll; i+=4) {
                __m128 ai_v = _mm_loadu_ps(&a[i]);
                __m128 two_v = _mm_set1_ps(2);
                __m128 ai2_v = _mm_mul_ps(ai_v,two_v);
                _mm_storeu_ps(&b[i],ai2_v);
            }
    #endif
    #ifdef SSE_UNROLL_ALIGNED
            for (i=0; i<unroll; i+=4) {
                __m128 ai_v = _mm_load_ps(&a[i]);
                __m128 two_v = _mm_set1_ps(2);
                __m128 ai2_v = _mm_mul_ps(ai_v,two_v);
                _mm_store_ps(&b[i],ai2_v);
            }
    #endif
    #ifndef NO_UNROLL
            for (; i<n; i++) {
                b[i] = a[i]*2;
            }
    #endif
        }
    
        //for (auto i : a) { std::cout << i << "\t"; }
        //std::cout << "\n";
        //for (auto i : b) { std::cout << i << "\t"; }
        //std::cout << "\n";
    
        return 0;
    }
    

    I got the following results (x86):

    • NO_UNROLL: 0.994 seconds, no SSE chosen by compiler
    • UNROLL: 3.511 seconds, uses movups
    • SSE_UNROLL: 3.315 seconds, uses movups
    • SSE_UNROLL_ALIGNED: 3.276 seconds, uses movaps

    So it is clear that unrolling the loop has not helped in this case. Even ensuring that we use the more efficient movaps doesn't help much.

    But I got an even stranger result when compiling to 64 bit (x64):

    • NO_UNROLL: 1.138 seconds, no SSE chosen by compiler
    • UNROLL: 1.409 seconds, no SSE chosen by compiler
    • SSE_UNROLL: 1.420 seconds, still no SSE chosen by compiler!
    • SSE_UNROLL_ALIGNED: 1.476 seconds, still no SSE chosen by compiler!

    It seems MSVC sees through the proposal and generates better assembly regardless, albeit still slower than had we not tried any hand optimization at all.