Tags: c++, matrix-multiplication, simd, sse, dot-product

How to vectorize a vector-matrix product with SSE?


I have this function in C++

void routine2(float alpha, float beta) {

    unsigned int i, j;

    for (i = 0; i < N; i++)
        for (j = 0; j < N; j++)
            w[i] = w[i] - beta + alpha * A[i][j] * x[j];

}

And this is the SSE (vectorized) version I wrote:

void routine2_vec(float alpha, float beta) {
    __m128 alpha_2 = _mm_set1_ps(alpha);
    __m128 beta_2 = _mm_set1_ps(beta);

    __m128 w2,num1,A2,X2;
    __m128 multiple1, multiple2,sub;

    unsigned int i, j;
    

    for ( i = 0; i < N; ++i) {
        w2 = _mm_setzero_ps();
        for ( j = 0; j < (N/4)*4; j += 4) {
            num1 = _mm_loadu_ps(&w[i]);

            A2 = _mm_loadu_ps(&A[i][j]);
            X2 = _mm_loadu_ps(&x[j]);

            multiple1 = _mm_mul_ps(A2, X2);
            multiple2 = _mm_mul_ps(alpha_2, multiple1);
            sub = _mm_sub_ps(num1, beta_2);

            w2 = _mm_add_ps(sub, multiple2);
            _mm_store_ss(&w[i], w2);
        }

    }
}

The results of both functions should be the same, so I wrote another function to run and compare both of them.

Test & compare functions

int routine2_test(float alpha, float beta) {
    unsigned int i, j;

    for (i = 0; i < N; i++) {
        for (j = 0; j < N; j++) {
            test2[i] = test2[i] - beta + alpha * A[i][j] * x[j];

        }
    }
  
    // compare
    for (j = 0; j < N; j++) {
        if (equal(w[j], test2[j]) == 1) {
            printf("\n The result of w[%d] is not equal to test2[%d]  \n", j, j);
            return 1;
        }
    }

    return 0;
}


unsigned short int equal(float a, float b) {
    float temp = a - b;
    //printf("\n %f  %f", a, b);
    if ((fabs(temp) / fabs(b)) < EPSILON)
        return 0; //success
    else
        return 1; //wrong result
}

The test function runs the original code and stores the result in the test2 array, while routine2_vec does the same with SSE and stores the result in the w array; I then compare both results in routine2_test using the equal function.

EDITED

This is the remaining part, including the main and initialize functions.


#include <stdio.h>
#include <math.h>    // fabs(), used in equal()
#include <time.h>
#include <pmmintrin.h>
#include <process.h>
#include <chrono>
#include <iostream>
#include <immintrin.h>
#include <omp.h>

#define M 256*128
#define ARITHMETIC_OPERATIONS1 3*M
#define TIMES1 1

#define N  2048
#define ARITHMETIC_OPERATIONS2 4*N*N
#define TIMES2 1


constexpr auto EPSILON =0.0001;

//function declaration
void initialize();
void routine2(float alpha, float beta);

unsigned short int equal(float a, float b); 
int routine2_test(float, float);

void routine2_vec(float alpha, float beta);

__declspec(align(64)) float  test2[N];
__declspec(align(64)) float A[N][N], x[N], w[N];


int main() {

    float alpha = 0.023f, beta = 0.045f;
    double run_time, start_time;
    unsigned int t;

    initialize();

    printf("\nRoutine2:");
    start_time = omp_get_wtime(); //start timer

    for (t = 0; t < TIMES2; t++){
        // routine2(alpha, beta);
        routine2_vec(alpha, beta);
        routine2_test(alpha, beta);
}

    run_time = omp_get_wtime() - start_time; //end timer
    printf("\n Time elapsed is %f secs \n %e FLOPs achieved\n", run_time, (double)(ARITHMETIC_OPERATIONS2) / ((double)run_time / TIMES2));

    return 0;
}


void initialize() {

    unsigned int i, j;

    //initialize routine2 arrays
    for (i = 0; i < N; i++)
        for (j = 0; j < N; j++) {
            A[i][j] = (i % 99) + (j % 14) + 0.013f;
        }

    //initialize routine2 arrays
    for (i = 0; i < N; i++) {
        x[i] = (i % 19) - 0.01f;
        w[i] = (i % 5) - 0.002f;
        test2[i]= (i % 5) - 0.002f;
    }
}


Solution

  • We can start by optimizing the scalar version.

    1. Avoid redundant loads and stores.
    2. Hoist alpha and beta out of the innermost loop:
    void routine2(float alpha, float beta) {
        for (unsigned i = 0; i < N; i++) {
            float dot = 0.f;
            for (unsigned j = 0; j < N; j++)
                dot += A[i][j] * x[j];
            w[i] += alpha * dot - N * beta;
        }
    }
    

    The way beta is used looks suspicious, but that is for you to decide.
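
    To see where the N * beta term in the rewrite comes from, note that the original inner loop subtracts beta once per j-iteration, i.e. N times per row, so the whole update for one i is

        w[i] += sum_{j=0..N-1} (alpha * A[i][j] * x[j] - beta)
              = alpha * (sum_{j=0..N-1} A[i][j] * x[j]) - N * beta

    which is exactly what the rewritten loop computes (up to floating-point rounding differences).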

    Now we vectorize. You didn't write a proper reduction loop. Here is how it should look:

    void routine2_vec(float alpha, float beta) {
        assert(N % 4 == 0);
        for (unsigned i = 0; i < N; i++) {
            __m128 dot4 = _mm_setzero_ps();
            for (unsigned j = 0; j < N; j += 4) {
                __m128 aij = _mm_loadu_ps(&A[i][j]);
                __m128 xj = _mm_loadu_ps(&x[j]);
                __m128 mres = _mm_mul_ps(aij, xj);
                dot4 = _mm_add_ps(dot4, mres);
            }
            // Reduce to two
            __m128 high = _mm_shuffle_ps(dot4, dot4, _MM_SHUFFLE(3, 3, 1, 1));
            dot4 = _mm_add_ps(dot4, high);
            // Reduce to one
            high = _mm_movehl_ps(dot4, dot4);
            dot4 = _mm_add_ss(dot4, high);
            /*
             * There is no point in doing the last part with intrinsics,
             * just let the compiler handle it
             */
            float dot = _mm_cvtss_f32(dot4);
            w[i] += alpha * dot - N * beta;
        }
    }
    

    Discussions on the best way to do the reduction step can be found here: Fastest way to do horizontal SSE vector sum (or other reduction)
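
    For reference, here is a minimal sketch of the SSE3-style horizontal sum along the lines of what that answer recommends (the helper name hsum_ps_sse3 is just illustrative; it assumes SSE3 is available, which the pmmintrin.h include in the question already implies). It could replace the shuffle/movehl/add_ss sequence above:

    static inline float hsum_ps_sse3(__m128 v) {
        __m128 shuf = _mm_movehdup_ps(v);        // (v1, v1, v3, v3)
        __m128 sums = _mm_add_ps(v, shuf);       // (v0+v1, ..., v2+v3, ...)
        shuf        = _mm_movehl_ps(shuf, sums); // move v2+v3 into element 0
        sums        = _mm_add_ss(sums, shuf);    // v0+v1+v2+v3 in element 0
        return _mm_cvtss_f32(sums);
    }

    With such a helper, the code after the j-loop in the routine above would reduce to float dot = hsum_ps_sse3(dot4);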

    Since you are doing essentially a matrix-vector product, you can instead compute four values along the outer loop. This avoids choking on the loop-carried dependency chain through the accumulator as discussed here: Why does mulss take only 3 cycles on Haswell, different from Agner's instruction tables? (Unrolling FP loops with multiple accumulators)

    static inline __m128 hadd(__m128 a, __m128 b) {
    # ifdef __SSE3__
        return _mm_hadd_ps(a, b);
    # else
        __m128 evens = _mm_shuffle_ps(a, b, _MM_SHUFFLE(2,0,2,0));
        __m128 odds  = _mm_shuffle_ps(a, b, _MM_SHUFFLE(3,1,3,1));
        return _mm_add_ps(evens, odds);
    # endif
    }
    
    void routine2_vec(float alpha, float beta) {
        __m128 alpha2 = _mm_set1_ps(alpha);
        __m128 beta_n = _mm_set1_ps(N * beta);
        assert(N % 4 == 0);
        for (unsigned i = 0; i < N; i += 4) {
            __m128 dot4_1 = _mm_setzero_ps();
            __m128 dot4_2 = dot4_1, dot4_3 = dot4_1, dot4_4 = dot4_1;
            for (unsigned j = 0; j < N; j += 4) {
                __m128 aij = _mm_loadu_ps(&A[i][j]);
                __m128 xj = _mm_loadu_ps(&x[j]);
                __m128 mres = _mm_mul_ps(aij, xj);
                dot4_1 = _mm_add_ps(dot4_1, mres);
    
                aij = _mm_loadu_ps(&A[i + 1][j]);
                mres = _mm_mul_ps(aij, xj);
                dot4_2 = _mm_add_ps(dot4_2, mres);
    
                aij = _mm_loadu_ps(&A[i + 2][j]);
                mres = _mm_mul_ps(aij, xj);
                dot4_3 = _mm_add_ps(dot4_3, mres);
    
                aij = _mm_loadu_ps(&A[i + 3][j]);
                mres = _mm_mul_ps(aij, xj);
                dot4_4 = _mm_add_ps(dot4_4, mres);
            }
            /*
             * Reduce to one vector for 4 i-values
             *
             * Same as this, but faster:
             * _MM_TRANSPOSE4_PS(dot4_1, dot4_2, dot4_3, dot4_4);
             * dot4_1 = _mm_add_ps(dot4_1, dot4_2);
             * dot4_3 = _mm_add_ps(dot4_3, dot4_4);
             * dot4_1 = _mm_add_ps(dot4_1, dot4_3);
             */
            dot4_1 = hadd(dot4_1, dot4_2);
            dot4_3 = hadd(dot4_3, dot4_4);
            dot4_1 = hadd(dot4_1, dot4_3);
    
            dot4_1 = _mm_mul_ps(dot4_1, alpha2);
            dot4_1 = _mm_sub_ps(dot4_1, beta_n);
            __m128 wi = _mm_loadu_ps(&w[i]);
            wi = _mm_add_ps(wi, dot4_1);
            _mm_storeu_ps(&w[i], wi);
        }
    }
    

    Further improvements, such as micro-optimizing the loop conditions or aligning one of the inputs, can be found here; that also includes code for dealing with the tail of the array when the array size is not a multiple of 4.
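
    As a rough illustration of the tail handling, here is a minimal sketch based on the first vectorized routine above: it keeps the vector loop for the part of each row that is a multiple of 4 and finishes the leftover 0-3 elements with a scalar loop (routine2_vec_tail is just an illustrative name; it assumes the same globals A, x, w and N as the rest of the code):

    void routine2_vec_tail(float alpha, float beta) {
        for (unsigned i = 0; i < N; i++) {
            __m128 dot4 = _mm_setzero_ps();
            unsigned j = 0;
            for (; j + 4 <= N; j += 4) {            // vector part
                __m128 aij = _mm_loadu_ps(&A[i][j]);
                __m128 xj = _mm_loadu_ps(&x[j]);
                dot4 = _mm_add_ps(dot4, _mm_mul_ps(aij, xj));
            }
            // Horizontal sum, same as before
            __m128 high = _mm_shuffle_ps(dot4, dot4, _MM_SHUFFLE(3, 3, 1, 1));
            dot4 = _mm_add_ps(dot4, high);
            high = _mm_movehl_ps(dot4, dot4);
            dot4 = _mm_add_ss(dot4, high);
            float dot = _mm_cvtss_f32(dot4);
            for (; j < N; j++)                      // scalar tail, 0 to 3 iterations
                dot += A[i][j] * x[j];
            w[i] += alpha * dot - N * beta;
        }
    }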