I have observed an unexpected (for me!) behavior in an OpenMP code I am writing. The code structure is the following:
#pragma omp parallel for
for(int i=0;i<N;i++){
    // lots of calculations that produce 3 integers i1,i2,i3 and 3 doubles d1,d2,d3
    #pragma omp atomic
    J1[i1] += d1;
    #pragma omp atomic
    J2[i2] += d2;
    #pragma omp atomic
    J3[i3] += d3;
}
I have compiled three different versions of this code:
1) with OpenMP (-fopenmp)
2) without OpenMP
3) with OpenMP, but without the 3 atomic operations (just as a test, since the atomic operations are necessary)
When I run version 1) with the environment variable OMP_NUM_THREADS=1, I observe a significant slowdown with respect to version 2), while version 3) runs as fast as version 2).
I would like to know the reason for this behavior (why do the atomic operations slow the code down even when it runs single-threaded?!) and whether it is possible to compile/rewrite the code in such a way that version 1) runs as fast as version 2).
I attach at the end of the question a working example which shows the aforementioned behavior. I compiled 1) with:
g++ -fopenmp -o toy_code toy_code.cpp -std=c++11 -O3
2) with:
g++ -o toy_code_NO_OMP toy_code.cpp -std=c++11 -O3
and 3) with:
g++ -fopenmp -o toy_code_NO_ATOMIC toy_code_NO_ATOMIC.cpp -std=c++11 -O3
The version of the compiler is gcc version 5.3.1 20160519 (Debian 5.3.1-20). The execution time of the 3 versions is:
1) 1 min 24 sec
2) 51 sec
3) 51 sec
Thanks in advance for any advice!
// toy_code.cpp
#include <stdio.h>
#include <iostream>
#include <stdlib.h>
#include <cmath>
#include <omp.h>

#define Np 1000000
#define N 1000

int main(){
    double *Xp, *Yp, *J, *Jb;
    Xp = new double[Np];
    Yp = new double[Np];
    J  = new double[N*N];
    Jb = new double[N*N];

    // zero the two N x N accumulation grids
    for(int i=0;i<N*N;i++){
        J[i]  = 0.0;
        Jb[i] = 0.0;
    }
    // random particle positions in [-0.5, 0.5)
    for(int i=0;i<Np;i++){
        Xp[i] = rand()*1.0/RAND_MAX - 0.5;
        Yp[i] = rand()*1.0/RAND_MAX - 0.5;
    }

    for(int n=0; n<2000; n++){
        #pragma omp parallel for
        for(int p=0;p<Np;p++){
            // map each particle to its nearest grid point
            double rx = (Xp[p]+0.5)*(N-1);
            double ry = (Yp[p]+0.5)*(N-1);
            int xindex = (int)floor(rx+0.5);
            int yindex = (int)floor(ry+0.5);
            int k = xindex*N + yindex;
            #pragma omp atomic
            J[k] += 1;
            #pragma omp atomic
            Jb[k] += 1;
        }
    }

    delete[] Xp;
    delete[] Yp;
    delete[] J;
    delete[] Jb;
    return 0;
}
If you enable OpenMP, gcc has to generate different code that works for any number of threads, and that number is only known at runtime.
In this particular case, take a look at the output of gcc -S (slightly shortened by stripping labels).
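For reference, a listing like the ones below can be produced with an invocation along these lines (my guess at the exact flags; -g is what makes the .loc source-line annotations appear):

g++ -fopenmp -std=c++11 -O3 -g -S toy_code.cpp -o toy_code.s

Dropping -fopenmp gives the serial listing.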
Without OpenMP:
.loc 1 38 0 discriminator 2 # Line 38 is J[k]+=1;
movsd 8(%rsp), %xmm1
cvttsd2si %xmm0, %edx
cvttsd2si %xmm1, %eax
movsd .LC3(%rip), %xmm0
imull $1000, %eax, %eax
addl %edx, %eax
cltq
salq $3, %rax
leaq 0(%r13,%rax), %rdx
.loc 1 40 0 discriminator 2 # Line 40 is Jb[k]+=1;
addq %r12, %rax
.loc 1 29 0 discriminator 2
cmpq $8000000, %r15
.loc 1 38 0 discriminator 2
addsd (%rdx), %xmm0
movsd %xmm0, (%rdx)
.loc 1 40 0 discriminator 2
movsd .LC3(%rip), %xmm0
addsd (%rax), %xmm0
movsd %xmm0, (%rax)
The loop is unrolled, which makes this rather complicated.
With -fopenmp:
movsd (%rsp), %xmm2
cvttsd2si %xmm0, %eax
cvttsd2si %xmm2, %ecx
imull $1000, %ecx, %ecx
addl %eax, %ecx
movslq %ecx, %rcx
salq $3, %rcx
movq %rcx, %rsi
addq 16(%rbp), %rsi
movq (%rsi), %rdx
movsd 8(%rsp), %xmm1
jmp .L4
movq %rax, %rdx
movq %rdx, (%rsp)
movq %rdx, %rax
movsd (%rsp), %xmm3
addsd %xmm1, %xmm3
movq %xmm3, %rdi
lock cmpxchgq %rdi, (%rsi)
cmpq %rax, %rdx
jne .L9
.loc 1 40 0
addq 24(%rbp), %rcx
movq (%rcx), %rdx
jmp .L5
.p2align 4,,10
.p2align 3
movq %rax, %rdx
movq %rdx, (%rsp)
movq %rdx, %rax
movsd (%rsp), %xmm4
addsd %xmm1, %xmm4
movq %xmm4, %rsi
lock cmpxchgq %rsi, (%rcx)
cmpq %rax, %rdx
jne .L10
addq $8, %r12
cmpq %r12, %rbx
jne .L6
I'm not going to try to explain or understand all the details of what is happening here, but that's not necessary for the message: the compiler has to use different, atomic instructions that are likely more costly, especially lock cmpxchgq, which implements a compare-and-swap retry loop.
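To make that retry loop easier to read, here is a minimal C++ sketch of the same pattern, assuming GCC/Clang __atomic_* builtins (atomic_add_double is my own illustrative helper, not the code GCC actually emits internally):

#include <cstdint>
#include <cstring>

// Sketch of what "#pragma omp atomic" on a double effectively has to do when
// the hardware offers no atomic floating-point add: keep retrying a
// compare-and-swap on the 64-bit pattern until no other thread has modified
// the slot in the meantime.
void atomic_add_double(double* addr, double d) {
    std::uint64_t* bits = reinterpret_cast<std::uint64_t*>(addr);
    std::uint64_t expected = __atomic_load_n(bits, __ATOMIC_RELAXED);
    std::uint64_t desired;
    do {
        double old_val, new_val;
        std::memcpy(&old_val, &expected, sizeof old_val); // reinterpret bits as double
        new_val = old_val + d;
        std::memcpy(&desired, &new_val, sizeof desired);
        // Compiles to "lock cmpxchgq" on x86-64; on failure, 'expected' is
        // refreshed with the current memory contents and the add is redone,
        // which corresponds to the jne retry in the listing above.
    } while (!__atomic_compare_exchange_n(bits, &expected, desired,
                                          /*weak=*/true,
                                          __ATOMIC_RELAXED, __ATOMIC_RELAXED));
}

Even with a single thread and therefore no contention, a lock-prefixed compare-exchange is considerably more expensive than the plain addsd/movsd pair of the serial version, which matches the slowdown you see with OMP_NUM_THREADS=1.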
Besides this fundamental issue, OpenMP may mess with the optimizer in any imaginable way, e.g. interfere with unrolling. I've also seen a curious case where the Intel compiler actually generates more efficient serial code for an OpenMP loop.
P.S. Consider yourself lucky - it could be much worse. If the compiler cannot map the atomic instruction to a hardware instruction, it has to use locks, which would be even slower.
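Purely for illustration, that fallback behaves roughly like guarding every update with an explicit lock, as in this sketch (locked_add and update_lock are made-up names, not anything GCC or libgomp exposes):

#include <omp.h>

static omp_lock_t update_lock;

// Every update goes through one lock, so threads serialize even when they
// touch different elements - and each update pays the lock/unlock cost.
void locked_add(double* arr, int k, double d) {
    omp_set_lock(&update_lock);
    arr[k] += d;
    omp_unset_lock(&update_lock);
}

int main() {
    omp_init_lock(&update_lock);
    double J[16] = {0.0};
    #pragma omp parallel for
    for (int i = 0; i < 1000; i++)
        locked_add(J, i % 16, 1.0);
    omp_destroy_lock(&update_lock);
    return 0;
}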