Search code examples
c simd mmx altivec

Porting MMX/SSE instructions to AltiVec


Let me preface this by saying that I have extremely limited experience with assembly, and even less with SIMD.

However, I have the following MMX/SSE-optimised code that I would like to port to AltiVec instructions for use on PPC/Cell processors.

This is probably a big ask. Even though it's only a few lines of code, I've had no end of trouble trying to work out what's going on here.

The original function:

/*
 * Dot product of two arrays of 16-bit signed values, using MMX intrinsics
 * for the bulk of the work and a scalar loop for the remainder.
 *
 * a, b: input arrays; n elements are read from each.
 * n:    number of elements to multiply and accumulate.
 *
 * Returns the accumulated sum of a[i] * b[i] as an int.
 *
 * NOTE(review): the tail loop `while (n --)` only stops when n reaches
 * zero, so this presumably expects n >= 0 -- confirm against callers.
 */
static inline int convolve(const short *a, const short *b, int n)
{
    int out = 0;
    /* Overlay so the two 32-bit lanes of the MMX accumulator can be read
     * back as plain ints after the loop. */
    union {
        __m64 m64;
        int i32[2];
    } tmp;
    tmp.i32[0] = 0;
    tmp.i32[1] = 0;
    /* Vector part: consumes four shorts from each input per iteration. */
    while (n >= 4) {
        /* Per the Intel intrinsic documentation, _mm_madd_pi16 multiplies
         * four pairs of 16-bit values and adds adjacent products into two
         * 32-bit lanes; _mm_add_pi32 adds those onto the running totals. */
        tmp.m64 = _mm_add_pi32(tmp.m64,
                               _mm_madd_pi16(*((__m64 *)a),
                                             *((__m64 *)b)));
        a += 4;
        b += 4;
        n -= 4;
    }
    /* Horizontal add of the two accumulator lanes. */
    out = tmp.i32[0] + tmp.i32[1];
    /* Done with MMX registers; _mm_empty() is the documented way to leave
     * MMX state before other code runs. */
    _mm_empty();

    /* Scalar tail for the elements the vector loop did not consume. */
    while (n --)
        out += (*(a++)) * (*(b++));
    return out;
}

Any tips on how I might rewrite this to use AltiVec instructions?

My first attempt (a very wrong one) looks something like this, but it's not entirely (or even remotely) correct.

/*
 * Asker's first AltiVec port of convolve(); the surrounding text describes
 * it as not yet correct. It processes eight shorts per iteration with
 * vec_msum/vec_add and finishes the remainder with a scalar loop.
 *
 * a, b: input arrays; n elements are read from each.
 * n:    number of elements to multiply and accumulate.
 */
static inline int convolve_altivec(const short *a, const short *b, int n)
{
    int out = 0;
    /* NOTE(review): only two ints overlay the vector here, so (assuming a
     * 32-bit int) part of m128 is neither zeroed below nor included in the
     * final sum -- confirm. */
    union {
        vector unsigned int m128;
        int i64[2];
    } tmp;

    /* All-zero vector passed as the third operand of vec_msum below. */
    vector unsigned int zero = {0, 0, 0, 0};

    tmp.i64[0] = 0;
    tmp.i64[1] = 0;
    /* Vector part: consumes eight shorts from each input per iteration. */
    while (n >= 8) {
        /* NOTE(review): the `const short` inputs are reinterpreted as
         * unsigned short vectors -- check whether signed was intended. */
        tmp.m128 = vec_add(tmp.m128,
                               vec_msum(*((vector unsigned short *)a),
                                             *((vector unsigned short *)b), zero));

        a += 8;
        b += 8;
        n -= 8;
    }
    out = tmp.i64[0] + tmp.i64[1];
/* NOTE(review): no matching #if is visible in this snippet. */
#endif
    /* Scalar tail for the elements the vector loop did not consume. */
    while (n --)
        out += (*(a++)) * (*(b++));
    return out;
}

Solution

  • You're not far off - I fixed a few minor problems, cleaned up the code a little, added a test harness, and it seems to work OK now:

    #include <assert.h>
    #include <stdio.h>
    #include <stdlib.h>
    #include <altivec.h>
    
    /*
     * Scalar reference implementation: sum of a[k] * b[k] for k in [0, n).
     *
     * a, b: input arrays; n elements are read from each.
     * n:    number of elements; a value <= 0 yields 0.
     *
     * Returns the accumulated sum as an int.
     */
    static int convolve_ref(const short *a, const short *b, int n)
    {
        const short *pa = a;
        const short *pb = b;
        int remaining = n;
        int acc = 0;

        /* Walk both arrays in lock-step until the count is exhausted. */
        while (remaining > 0)
        {
            acc += *pa * *pb;
            ++pa;
            ++pb;
            --remaining;
        }

        return acc;
    }
    
    static inline int convolve_altivec(const short *a, const short *b, int n)
    {
        int out = 0;
        union {
            vector signed int m128;
            int i32[4];
        } tmp;
    
        const vector signed int zero = {0, 0, 0, 0};
    
        assert(((unsigned long)a & 15) == 0);
        assert(((unsigned long)b & 15) == 0);
    
        tmp.m128 = zero;
    
        while (n >= 8)
        {
            tmp.m128 = vec_msum(*((vector signed short *)a),
                                *((vector signed short *)b), tmp.m128);
    
            a += 8;
            b += 8;
            n -= 8;
        }
    
        out = tmp.i32[0] + tmp.i32[1] + tmp.i32[2] + tmp.i32[3];
    
        while (n --)
            out += (*(a++)) * (*(b++));
    
        return out;
    }
    
    /*
     * Test harness: fills two buffers with pseudo-random samples, runs the
     * scalar reference and the AltiVec implementation on them, prints both
     * sums and a PASS/FAIL verdict.
     *
     * Returns EXIT_SUCCESS when the two sums agree, EXIT_FAILURE otherwise,
     * so the result is usable from scripts.
     */
    int main(void)
    {
        /* Enumeration constant rather than `const int`, so the arrays below
         * are ordinary fixed-size arrays and not variable-length arrays. */
        enum { n = 100 };

        /* Declared as vectors so the buffers have vector alignment, which
         * convolve_altivec asserts; sized up to a whole number of vectors. */
        vector signed short _a[n / 8 + 1];
        vector signed short _b[n / 8 + 1];

        short *a = (short *)_a;
        short *b = (short *)_b;

        int sum_ref, sum_test;

        int i;

        for (i = 0; i < n; ++i)
        {
            /* Samples are kept in [-4096, 4095]: each product is at most
             * 2^24 in magnitude, so the sum of 100 products fits in a 32-bit
             * int and the reference accumulation cannot overflow. */
            a[i] = (short)(rand() % 8192 - 4096);
            b[i] = (short)(rand() % 8192 - 4096);
        }

        sum_ref = convolve_ref(a, b, n);
        sum_test = convolve_altivec(a, b, n);

        printf("sum_ref = %d\n", sum_ref);
        printf("sum_test = %d\n", sum_test);

        printf("%s\n", sum_ref == sum_test ? "PASS" : "FAIL");

        return sum_ref == sum_test ? EXIT_SUCCESS : EXIT_FAILURE;
    }