I wrote a large program to simulate molecular system. I ran it on a desktop computer whose processor is a Intel(R) Core(TM) i7-6700 CPU @ 3.40GHz. Most of the time (75%) is used to calculate a Lennard Jones potential on 4 neighbours.
To optimize it, I am learning how to use avx2 intrinsics
and I wrote a simplified version that performs only the LJ calculation : (d_eq/d_cur)**12 - 2.0*(d_eq/d_cur)**6
This is the code without the intrinsics :
#include <stdio.h>
#include <stdlib.h>
#include <immintrin.h>
double lj_pot(double* d_cur) {
const double d_eq = 1.0;
double ratio, vpot1, eie;
eie =0.0;
for (int i=0; i<4; i++) {
ratio = d_eq/d_cur[i] ;
vpot1 = ratio*ratio*ratio ;
vpot1 = vpot1*vpot1 ;
eie += vpot1*(vpot1-2.0);
}
return eie;
}
int main()
{
double d_cur[4];
double eee;
eee=0.0;
for (int i=0; i<10000000; i++) {
for (int j=0; j<4; j++) {
d_cur[j] = 0.5+(double)(rand()) / (double)(RAND_MAX); // between 0.5 , 1.5
}
eee += lj_pot(d_cur);
}
printf("%f\n",eee);
return 0;
}
And this is the code with the AVX2 intrinsics
#include <stdio.h>
#include <stdlib.h>
#include <immintrin.h>
static inline double hsum_double_avx(__m256d v) {
// From https://stackoverflow.com/a/49943540/7462275
__m128d vlow = _mm256_castpd256_pd128(v);
__m128d vhigh = _mm256_extractf128_pd(v, 1); // high 128
vlow = _mm_add_pd(vlow, vhigh); // reduce down to 128
__m128d high64 = _mm_unpackhi_pd(vlow, vlow);
return _mm_cvtsd_f64(_mm_add_sd(vlow, high64)); // reduce to scalar
}
double lj_pot(double* d_cur) {
const __m256d d_eq = _mm256_set1_pd(1.0);
const __m256d two_dbl = _mm256_set1_pd(2.0);
__m256d vec1, vec2;
vec1 = _mm256_loadu_pd(d_cur);
vec1 = _mm256_div_pd(d_eq,vec1);
vec2 = _mm256_mul_pd(vec1,vec1); // (d_eq/d_cur)**2
vec1 = _mm256_mul_pd(vec2,vec2); // (d_eq/d_cur)**4
vec1 = _mm256_mul_pd(vec1,vec2); // (d_eq/d_cur)**6
vec2 = _mm256_sub_pd(vec1,two_dbl); // (d_eq/d_cur)**6 - 2.0
vec1 = _mm256_mul_pd(vec1,vec2); // (d_eq/d_cur)**12 -2.0*(d_eq/d_cur)**6
return hsum_double_avx(vec1);
}
int main()
{
double d_cur[4];
double eee;
eee=0.0;
for (int i=0; i<10000000; i++) {
for (int j=0; j<4; j++) {
d_cur[j] = 0.5+(double)(rand()) / (double)(RAND_MAX); // between 0.5 , 1.5
}
eee += lj_pot(d_cur);
}
printf("%f\n",eee);
return 0;
}
This code output the same result than the first one. But it is slower.
(gcc -O3 -mavx -o lj_pot lj_pot.c
is used to compile both programs.)
When I inspect the assembly code, I suppose (perhaps it is not the good reason) I do not use the AVX2 intrinsics correctly to perform chain of arithmetic operations. What must I change to have a program faster ?
Thanks for answer
GCC automatically uses SIMD for your original code. Your hand-optimization is just inhibiting the compiler's optimization. Check out the resulting assembly with -O3 -march=skylake -funsafe-math-optimizations
:
lj_pot(double*):
vbroadcastsd ymm1, QWORD PTR .LC1[rip]
vdivpd ymm1, ymm1, YMMWORD PTR [rdi]
vmulpd ymm0, ymm1, ymm1
vmulpd ymm0, ymm0, ymm1
vbroadcastsd ymm1, QWORD PTR .LC3[rip]
vmulpd ymm0, ymm0, ymm0
vaddpd ymm1, ymm0, ymm1
vmulpd ymm0, ymm1, ymm0
vextractf128 xmm1, ymm0, 0x1
vaddpd xmm1, xmm1, xmm0
vunpckhpd xmm0, xmm1, xmm1
vaddpd xmm0, xmm0, xmm1
vzeroupper
ret
If you want to optimize this, start by looking at what the compiler is doing. It is pretty smart.