Search code examples
c++vectorfloating-pointsimdaltivec

AltiVec vec_msum equivalent for float values


Is anybody aware of a method to achieve vec_msum functionality against a vector of float values?

I'm quite new to SIMD, and although I think I'm starting to make sense of it - there are still a few puzzles.

My end goal is to rewrite the function "convolve_altivec" (as found in the accepted answer for this question) such that it accepts input parameters as float values, instead of short's.

That is, the prototype should be

float convolve_altivec(const float *a, const float *b, int n)

I'm trying to match the functionality provided by the original non-optimised function below:

float convolve(const float *a, const float *b, int n)
{
    float out = 0.f;
    while (n --)
        out += (*(a ++)) * (*(b ++));
    return out;
}

My initial efforts have seen me trying to port an existing SSE version of this same function to altivec instructions.


Solution

  • For a float version you need vec_madd.

    Here's a float version of the previous 16 bit int version and test harness I posted in response to your earlier question:

    #include <assert.h>
    #include <math.h>
    #include <stdio.h>
    #include <stdlib.h>
    #include <altivec.h>
    
    static float convolve_ref(const float *a, const float *b, int n)
    {
        float sum = 0.0f;
        int i;
    
        for (i = 0; i < n; ++i)
        {
            sum += a[i] * b[i];
        }
    
        return sum;
    }
    
    static inline float convolve_altivec(const float *a, const float *b, int n)
    {
        float sum = 0.0f;
        vector float vsum = { 0.0f, 0.0f, 0.0f, 0.0f };
        union {
            vector float v;
            float a[4];
        } usum;
    
        vector float *pa = (vector float *)a;
        vector float *pb = (vector float *)b;
    
        assert(((unsigned long)a & 15) == 0);
        assert(((unsigned long)b & 15) == 0);
    
        while (n >= 4)
        {
            vsum = vec_madd(*pa, *pb, vsum);
            pa++;
            pb++;
            n -= 4;
        }
    
        usum.v = vsum;
    
        sum = usum.a[0] + usum.a[1] + usum.a[2] + usum.a[3];
    
        a = (float *)pa;
        b = (float *)pb;
    
        while (n --)
        {
            sum += (*a++ * *b++);
        }
    
        return sum;
    }
    
    int main(void)
    {
        const int n = 1002;
    
        vector float _a[n / 4 + 1];
        vector float _b[n / 4 + 1];
    
        float *a = (float *)_a;
        float *b = (float *)_b;
    
        float sum_ref, sum_test;
    
        int i;
    
        for (i = 0; i < n; ++i)
        {
            a[i] = (float)rand();
            b[i] = (float)rand();
        }
    
        sum_ref = convolve_ref(a, b, n);
        sum_test = convolve_altivec(a, b, n);
    
        printf("sum_ref = %g\n", sum_ref);
        printf("sum_test = %g\n", sum_test);
    
        printf("%s\n", fabsf((sum_ref - sum_test) / sum_ref) < 1.0e-6 ? "PASS" : "FAIL");
    
        return 0;
    }