Is anybody aware of a method to achieve vec_msum functionality against a vector of float values?
I'm quite new to SIMD, and although I think I'm starting to make sense of it - there are still a few puzzles.
My end goal is to rewrite the function "convolve_altivec" (as found in the accepted answer for this question) such that it accepts input parameters as float values, instead of short's.
That is, the prototype should be
float convolve_altivec(const float *a, const float *b, int n)
I'm trying to match the functionality provided by the original non-optimised function below:
float convolve(const float *a, const float *b, int n)
{
float out = 0.f;
while (n --)
out += (*(a ++)) * (*(b ++));
return out;
}
My initial efforts have seen me trying to port an existing SSE version of this same function to altivec instructions.
For a float version you need vec_madd
.
Here's a float version of the previous 16 bit int version and test harness I posted in response to your earlier question:
#include <assert.h>
#include <math.h>
#include <stdio.h>
#include <stdlib.h>
#include <altivec.h>
static float convolve_ref(const float *a, const float *b, int n)
{
float sum = 0.0f;
int i;
for (i = 0; i < n; ++i)
{
sum += a[i] * b[i];
}
return sum;
}
static inline float convolve_altivec(const float *a, const float *b, int n)
{
float sum = 0.0f;
vector float vsum = { 0.0f, 0.0f, 0.0f, 0.0f };
union {
vector float v;
float a[4];
} usum;
vector float *pa = (vector float *)a;
vector float *pb = (vector float *)b;
assert(((unsigned long)a & 15) == 0);
assert(((unsigned long)b & 15) == 0);
while (n >= 4)
{
vsum = vec_madd(*pa, *pb, vsum);
pa++;
pb++;
n -= 4;
}
usum.v = vsum;
sum = usum.a[0] + usum.a[1] + usum.a[2] + usum.a[3];
a = (float *)pa;
b = (float *)pb;
while (n --)
{
sum += (*a++ * *b++);
}
return sum;
}
int main(void)
{
const int n = 1002;
vector float _a[n / 4 + 1];
vector float _b[n / 4 + 1];
float *a = (float *)_a;
float *b = (float *)_b;
float sum_ref, sum_test;
int i;
for (i = 0; i < n; ++i)
{
a[i] = (float)rand();
b[i] = (float)rand();
}
sum_ref = convolve_ref(a, b, n);
sum_test = convolve_altivec(a, b, n);
printf("sum_ref = %g\n", sum_ref);
printf("sum_test = %g\n", sum_test);
printf("%s\n", fabsf((sum_ref - sum_test) / sum_ref) < 1.0e-6 ? "PASS" : "FAIL");
return 0;
}