There is a following piece of code:
#include <iostream>
#include <fstream>
#include <limits>
#include <chrono>
#include <omp.h>
const long int N=1000000000L;
class Counter {
public:
Counter():n(0),N(0){}
void operator()()
{
if(n==INT_MAX)
{
#pragma omp atomic update
++N;
#pragma omp atomic write
n=1;
}
else
{
#pragma omp atomic update
++n;
}
}
long int GetTotalCount()
{
return (static_cast<long int>(n)+static_cast<long int>(N)*static_cast<long int>(INT_MAX));
}
friend std::ostream & operator<<(std::ostream & o, Counter & counter)
{
o<<counter.GetTotalCount()<<" ("<<counter.N<<","<<counter.n<<") = "
<<static_cast<long int>(counter.N)*static_cast<long int>(counter.INT_MAX)+static_cast<long int>
(counter.n)
<<std::endl;
return o;
}
private:
const int INT_MAX=std::numeric_limits<int>::max();
int n;
int N;
};
int main(int argc, char **argv)
{
const auto begin = std::chrono::steady_clock::now();
Counter counter;
#pragma omp parallel for simd
for(long int i=0; i<N; ++i)
{
if(i%2==0) counter();
}
const auto end = std::chrono::steady_clock::now();
std::cout<<"N="<<N<<std::endl;
std::cout<<"Total count="<<counter;
auto time = std::chrono::duration_cast<std::chrono::milliseconds>(end-begin).count();
std::cout<<"t="<<time<<" ms"<<std::endl;
return 0;
}
It works properly with all N <=1000000000 and gives the following output:
N=1000000000
Total count=500000000 (0,500000000) = 500000000
t=11297 ms
But if to make N 10 times bigger, the output is incorrect (2nd line):
N=10000000000
Total count=705032704 (0,705032704) = 705032704
t=112660 ms
The second line here must be
Total count=500000000 (2,500000000) = 705032706
I can't understand why the program works incorrectly at N=1e10, although N is long int.
You have a race condition in the check:
if(n==INT_MAX)
{
// Nothing prevents another thread to read n here and enter the same branch
#pragma omp atomic update
++N;
#pragma omp atomic write
n=1;
}
Hence, two threads may both decide to reset n
and increment N
concurrently.
Besides that, you would also have to read n
atomically for the if check itself, though that is not sufficient alone.
Overall, you are better off using an actual OpenMP reduction or a custom built reduction and also sufficiently large datatypes that support actual atomic operations.