gcc, optimization, vectorization, intrinsics, avx2

How to chain AVX2 intrinsics efficiently to perform a chain of arithmetic operations?


I wrote a large program to simulate a molecular system. I ran it on a desktop computer whose processor is an Intel(R) Core(TM) i7-6700 CPU @ 3.40GHz. Most of the time (75%) is spent calculating a Lennard-Jones potential over 4 neighbours.

To optimize it, I am learning how to use AVX2 intrinsics, and I wrote a simplified version that performs only the LJ calculation: (d_eq/d_cur)**12 - 2.0*(d_eq/d_cur)**6

This is the code without the intrinsics:

#include <stdio.h>
#include <stdlib.h>

double lj_pot(double* d_cur) {
    const double d_eq = 1.0;
    double ratio, vpot1, eie;

    eie = 0.0;
    for (int i=0; i<4; i++) {
        ratio = d_eq/d_cur[i];
        vpot1 = ratio*ratio*ratio;
        vpot1 = vpot1*vpot1;
        eie += vpot1*(vpot1-2.0);
    }

    return eie;
}

int main()
{
    double d_cur[4];
    double eee;
    
    eee=0.0;
    
    for (int i=0; i<10000000; i++) {
        for (int j=0; j<4; j++) {
            d_cur[j] = 0.5+(double)(rand()) / (double)(RAND_MAX); // between 0.5 and 1.5
        }
        eee += lj_pot(d_cur);
    }
    printf("%f\n",eee);
    return 0;
}

And this is the code with the AVX2 intrinsics:

#include <stdio.h>
#include <stdlib.h>
#include <immintrin.h>

static inline double hsum_double_avx(__m256d v) {
// From https://stackoverflow.com/a/49943540/7462275
    __m128d vlow  = _mm256_castpd256_pd128(v);
    __m128d vhigh = _mm256_extractf128_pd(v, 1); // high 128
            vlow  = _mm_add_pd(vlow, vhigh);     // reduce down to 128

    __m128d high64 = _mm_unpackhi_pd(vlow, vlow);
    return  _mm_cvtsd_f64(_mm_add_sd(vlow, high64));  // reduce to scalar
}

double lj_pot(double* d_cur) {
    const __m256d d_eq = _mm256_set1_pd(1.0);
    const __m256d two_dbl = _mm256_set1_pd(2.0);
    __m256d vec1, vec2;

    vec1 = _mm256_loadu_pd(d_cur);
    vec1 = _mm256_div_pd(d_eq,vec1);
    vec2 = _mm256_mul_pd(vec1,vec1); // (d_eq/d_cur)**2
    vec1 = _mm256_mul_pd(vec2,vec2); // (d_eq/d_cur)**4
    vec1 = _mm256_mul_pd(vec1,vec2); // (d_eq/d_cur)**6
    vec2 = _mm256_sub_pd(vec1,two_dbl); // (d_eq/d_cur)**6 - 2.0
    vec1 = _mm256_mul_pd(vec1,vec2); // (d_eq/d_cur)**12 -2.0*(d_eq/d_cur)**6

    return hsum_double_avx(vec1);
}

int main()
{
    double d_cur[4];
    double eee;
    
    eee=0.0;

    for (int i=0; i<10000000; i++) {
        for (int j=0; j<4; j++) {
            d_cur[j] = 0.5+(double)(rand()) / (double)(RAND_MAX); // between 0.5 and 1.5
        }
        eee += lj_pot(d_cur);
    }
    printf("%f\n",eee);
    return 0;
}

This code outputs the same result as the first one, but it is slower. (gcc -O3 -mavx -o lj_pot lj_pot.c is used to compile both programs.)

When I inspect the assembly code, I suspect (though perhaps this is not the real reason) that I am not using the AVX2 intrinsics correctly to perform the chain of arithmetic operations. What must I change to make the program faster?

Thanks for any answer.


Solution

  • GCC automatically vectorizes your original code with SIMD instructions; your hand-written intrinsics merely inhibit the compiler's own optimization. Check out the resulting assembly with -O3 -march=skylake -funsafe-math-optimizations:

    lj_pot(double*):
        vbroadcastsd    ymm1, QWORD PTR .LC1[rip]
        vdivpd  ymm1, ymm1, YMMWORD PTR [rdi]
        vmulpd  ymm0, ymm1, ymm1
        vmulpd  ymm0, ymm0, ymm1
        vbroadcastsd    ymm1, QWORD PTR .LC3[rip]
        vmulpd  ymm0, ymm0, ymm0
        vaddpd  ymm1, ymm0, ymm1
        vmulpd  ymm0, ymm1, ymm0
        vextractf128    xmm1, ymm0, 0x1
        vaddpd  xmm1, xmm1, xmm0
        vunpckhpd       xmm0, xmm1, xmm1
        vaddpd  xmm0, xmm0, xmm1
        vzeroupper
        ret
    

    If you want to optimize this, start by looking at what the compiler is doing. It is pretty smart.
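    If you do decide to keep the intrinsics version, one change worth trying is to hoist the horizontal sum out of the hot loop: as written, hsum_double_avx runs once per call, but you can instead return the un-reduced __m256d, accumulate it in a vector register across iterations, and reduce to scalar only once at the end. A minimal sketch along those lines (lj_pot_vec is a hypothetical name I introduce here; the arithmetic is the same as in the question):

    ```c
    #include <stdio.h>
    #include <stdlib.h>
    #include <immintrin.h>

    /* Same horizontal sum as in the question. */
    static inline double hsum_double_avx(__m256d v) {
        __m128d vlow  = _mm256_castpd256_pd128(v);
        __m128d vhigh = _mm256_extractf128_pd(v, 1); // high 128
        vlow = _mm_add_pd(vlow, vhigh);              // reduce down to 128
        __m128d high64 = _mm_unpackhi_pd(vlow, vlow);
        return _mm_cvtsd_f64(_mm_add_sd(vlow, high64)); // reduce to scalar
    }

    /* Returns the 4 per-neighbour LJ terms as a vector; the caller accumulates. */
    static inline __m256d lj_pot_vec(const double* d_cur) {
        const __m256d d_eq = _mm256_set1_pd(1.0);
        const __m256d two  = _mm256_set1_pd(2.0);
        __m256d r  = _mm256_div_pd(d_eq, _mm256_loadu_pd(d_cur)); // d_eq/d_cur
        __m256d r2 = _mm256_mul_pd(r, r);                         // **2
        __m256d r6 = _mm256_mul_pd(_mm256_mul_pd(r2, r2), r2);    // **6
        return _mm256_mul_pd(r6, _mm256_sub_pd(r6, two));         // **12 - 2*(**6)
    }

    int main(void) {
        double d_cur[4];
        __m256d acc = _mm256_setzero_pd();           // vector accumulator
        for (int i = 0; i < 10000000; i++) {
            for (int j = 0; j < 4; j++)
                d_cur[j] = 0.5 + (double)rand() / (double)RAND_MAX;
            acc = _mm256_add_pd(acc, lj_pot_vec(d_cur)); // no per-call hsum
        }
        printf("%f\n", hsum_double_avx(acc));        // one reduction at the end
        return 0;
    }
    ```

    Whether this beats the auto-vectorized scalar code still has to be measured; in this benchmark the rand() calls dominate, so the difference may be small.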