c++ · openmp · atomic · pragma · single-threaded

Performance issues of #pragma omp atomic with OMP_NUM_THREADS=1


I have observed unexpected (to me!) behavior in some OpenMP code I am writing. The code structure is the following:

#pragma omp parallel for
for(int i=0;i<N;i++){
    // lots of calculations that produce 3 integers i1,i2,i3 and 3 doubles d1,d2,d3
    #pragma omp atomic
    J1[i1] += d1;
    #pragma omp atomic
    J2[i2] += d2;
    #pragma omp atomic
    J3[i3] += d3;
}

I have compiled three different versions of this code:

1) with OpenMP (-fopenmp)

2) without OpenMP

3) with OpenMP, but without the 3 atomic operations (just as a test, since the atomic operations are necessary for correctness)

When I run version 1) with the environment variable OMP_NUM_THREADS=1, I observe a significant slowdown with respect to version 2), while version 3) runs as fast as version 2).

I would like to know the reason for this behavior (why do atomic operations slow the code down even when running single-threaded?!) and whether it is possible to compile/rewrite the code in such a way that version 1) runs as fast as version 2).

At the end of the question I attach a working example that shows the behavior described above. I compiled 1) with:

g++ -fopenmp -o toy_code toy_code.cpp -std=c++11 -O3

2) with:

g++ -o toy_code_NO_OMP toy_code.cpp -std=c++11 -O3

and 3) with:

g++ -fopenmp -o toy_code_NO_ATOMIC toy_code_NO_ATOMIC.cpp -std=c++11 -O3

The compiler is gcc version 5.3.1 20160519 (Debian 5.3.1-20). The execution times of the 3 versions are:

1) 1 min 24 sec

2) 51 sec

3) 51 sec

Thanks in advance for any advice!

// toy_code.cpp 
#include <stdio.h>
#include <iostream>
#include <stdlib.h>
#include <cmath>
#include <omp.h>
#define Np 1000000   // number of particles
#define N 1000       // grid is N x N

int main (){
        double* Xp, *Yp, *J,*Jb;
        Xp = new double[Np];
        Yp = new double[Np];  
        J = new double [N*N];
        Jb = new double [N*N];

        for(int i=0;i<N*N;i++){
            J[i]=0.0;
            Jb[i]=0.0;
        }

        for(int i=0;i<Np;i++){
            Xp[i] = rand()*1.0/RAND_MAX - 0.5;
            Yp[i] = rand()*1.0/RAND_MAX - 0.5;
        }

        for(int n=0; n<2000; n++){
            #pragma omp parallel for
            for(int p=0;p<Np;p++){
                double rx = (Xp[p]+0.5)*(N-1);
                double ry = (Yp[p]+0.5)*(N-1);
                int xindex = (int)floor(rx+0.5);
                int yindex = (int)floor(ry+0.5);
                int k;
                k=xindex*N+yindex;   // grid cell hit by particle p

                #pragma omp atomic
                J[k]+=1;             // several particles may hit the same cell,
                #pragma omp atomic
                Jb[k]+=1;            // so the updates must be atomic
            }
        }

        delete[] Xp;
        delete[] Yp;
        delete[] J;
        delete[] Jb;

return 0;
}

Solution

  • If you enable OpenMP, gcc has to generate different code that works for any number of threads, which is only known at runtime.

    In this particular case, take a look at the output of gcc -S (slightly shortened by removing labels).

    Without OpenMP:

    .loc 1 38 0 discriminator 2  # Line 38 is J[k]+=1;
    movsd   8(%rsp), %xmm1
    cvttsd2si   %xmm0, %edx
    cvttsd2si   %xmm1, %eax
    movsd   .LC3(%rip), %xmm0
    imull   $1000, %eax, %eax
    addl    %edx, %eax
    cltq
    salq    $3, %rax
    leaq    0(%r13,%rax), %rdx
    .loc 1 40 0 discriminator 2   # Line 40 is Jb[k]+=1;
    addq    %r12, %rax
    .loc 1 29 0 discriminator 2
    cmpq    $8000000, %r15
    .loc 1 38 0 discriminator 2
    addsd   (%rdx), %xmm0
    movsd   %xmm0, (%rdx)
    .loc 1 40 0 discriminator 2
    movsd   .LC3(%rip), %xmm0
    addsd   (%rax), %xmm0
    movsd   %xmm0, (%rax)
    

    The loop is unrolled, making this rather complicated.

    With -fopenmp:

    movsd   (%rsp), %xmm2
    cvttsd2si   %xmm0, %eax
    cvttsd2si   %xmm2, %ecx
    imull   $1000, %ecx, %ecx
    addl    %eax, %ecx
    movslq  %ecx, %rcx
    salq    $3, %rcx
    movq    %rcx, %rsi
    addq    16(%rbp), %rsi
    movq    (%rsi), %rdx
    movsd   8(%rsp), %xmm1
    jmp .L4
    movq    %rax, %rdx
    movq    %rdx, (%rsp)
    movq    %rdx, %rax
    movsd   (%rsp), %xmm3
    addsd   %xmm1, %xmm3
    movq    %xmm3, %rdi
    lock cmpxchgq   %rdi, (%rsi)
    cmpq    %rax, %rdx
    jne .L9
    .loc 1 40 0
    addq    24(%rbp), %rcx
    movq    (%rcx), %rdx
    jmp .L5
    .p2align 4,,10
    .p2align 3
    movq    %rax, %rdx
    movq    %rdx, (%rsp)
    movq    %rdx, %rax
    movsd   (%rsp), %xmm4
    addsd   %xmm1, %xmm4
    movq    %xmm4, %rsi
    lock cmpxchgq   %rsi, (%rcx)
    cmpq    %rax, %rdx
    jne .L10
    addq    $8, %r12
    cmpq    %r12, %rbx
    jne .L6
    

    I'm not going to try to explain or understand all the details of what is happening here, but that's not necessary for the message: the compiler has to use different atomic instructions that are likely more costly, especially lock cmpxchgq (a rough C++ equivalent of that compare-and-swap retry loop is sketched below).

    Besides this fundamental issue, OpenMP may mess with the optimizer in any imaginable way, e.g. interfere with unrolling. I've also seen a curious case where the Intel compiler actually generates more efficient serial code for an OpenMP loop.

    P.S. Consider yourself lucky: it could be much worse. If the compiler cannot map the atomic operation to a hardware instruction, it has to fall back to locks, which would be even slower. If you want to avoid the atomics altogether, one common rework is sketched after the example below.
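    To illustrate what the lock cmpxchgq loop above is doing, here is a rough C++ equivalent (a sketch, not the code gcc actually emits): x86-64 has no atomic floating-point add instruction, so an atomic update of a double becomes a compare-and-swap retry loop on its 64-bit pattern.

    #include <atomic>

    // Sketch of what "#pragma omp atomic" on a double effectively becomes:
    // a compare-and-swap retry loop (the "lock cmpxchgq" seen above).
    void atomic_add(std::atomic<double>& target, double value) {
        double expected = target.load(std::memory_order_relaxed);
        // On failure, compare_exchange_weak refreshes 'expected' with the
        // current value, so the loop retries until the swap succeeds.
        while (!target.compare_exchange_weak(expected, expected + value)) {
        }
    }

    As for avoiding the cost altogether: a common rework is to let each thread accumulate into private grids and combine them once at the end, so no atomics are needed and the single-threaded build does essentially the same work as the non-OpenMP one plus one cheap combine pass. The sketch below assumes the extra memory for per-thread grids is acceptable; J_local and Jb_local are illustrative names, while N, Np, Xp, Yp, J and Jb are as in the question.

    // Hypothetical rework of the hot loop from toy_code.cpp: per-thread
    // private grids, combined once after the loop, instead of atomics.
    // (Needs #include <vector> at the top of the file.)
    #pragma omp parallel
    {
        std::vector<double> J_local(N * N, 0.0);
        std::vector<double> Jb_local(N * N, 0.0);

        #pragma omp for nowait
        for (int p = 0; p < Np; p++) {
            double rx = (Xp[p] + 0.5) * (N - 1);
            double ry = (Yp[p] + 0.5) * (N - 1);
            int xindex = (int)floor(rx + 0.5);
            int yindex = (int)floor(ry + 0.5);
            int k = xindex * N + yindex;
            J_local[k]  += 1;   // plain adds, no lock prefix
            Jb_local[k] += 1;
        }

        // Combine the private grids into the shared ones. With one thread
        // this is a single extra pass over N*N doubles; with many threads
        // the cost is O(threads * N * N) per outer iteration.
        #pragma omp critical
        {
            for (int i = 0; i < N * N; i++) {
                J[i]  += J_local[i];
                Jb[i] += Jb_local[i];
            }
        }
    }

    With newer compilers the same idea can be expressed more compactly with an OpenMP 4.5 array-section reduction, e.g. reduction(+:J[:N*N]), but gcc 5.3 predates full support for that.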