First method (parallelize inner loop):
for(j=0; j<LATTICE_VW; ++j) {
    x = j*DX + LATTICE_W;
    #pragma omp parallel for ordered private(y, prob)
    for(i=0; i<LATTICE_VH; ++i) {
        y = i*DY + LATTICE_S;
        prob = psi[i][j].norm();
        #pragma omp ordered
        out << x << " " << y << " " << prob << endl;
    }
}
Second method (parallelize outer loop):
#pragma omp parallel for ordered private(x, y, prob)
for(j=0; j<LATTICE_VW; ++j) {
    x = j*DX + LATTICE_W;
    for(i=0; i<LATTICE_VH; ++i) {
        y = i*DY + LATTICE_S;
        prob = psi[i][j].norm();
        #pragma omp ordered
        out << x << " " << y << " " << prob << endl;
    }
}
Third method (parallelize collapsed loops):
#pragma omp parallel for collapse(2) ordered private(x, y, prob)
for(j=0; j<LATTICE_VW; ++j) {
    for(i=0; i<LATTICE_VH; ++i) {
        x = j*DX + LATTICE_W;
        y = i*DY + LATTICE_S;
        prob = psi[i][j].norm();
        #pragma omp ordered
        out << x << " " << y << " " << prob << endl;
    }
}
If I were to guess, I would say that method 3 should be the fastest.
However, method 1 is the fastest, while both the second and the third take about the same amount of time as if there were no parallelization at all. Why does this happen?
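For anyone who wants to reproduce the comparison, a self-contained harness along these lines should work. Every value in it (lattice sizes, constants, the contents of psi) is a placeholder rather than the real simulation data, and std::norm on a std::complex stands in for the member .norm() call; only method 1 is shown, the other two just swap in the loops above.

#include <complex>
#include <fstream>
#include <iostream>
#include <omp.h>
using namespace std;

// Placeholder lattice parameters -- not the real simulation values.
const int LATTICE_VW = 1000;
const int LATTICE_VH = 1000;
const double DX = 0.01, DY = 0.01;
const double LATTICE_W = -5.0, LATTICE_S = -5.0;

// Placeholder wave-function data; global so it does not live on the stack.
complex<double> psi[LATTICE_VH][LATTICE_VW];

int main() {
    // Fill psi with dummy values.
    for(int a = 0; a < LATTICE_VH; ++a)
        for(int b = 0; b < LATTICE_VW; ++b)
            psi[a][b] = complex<double>(a * 1e-3, b * 1e-3);

    ofstream out("psi.dat");
    int i, j;
    double x, y, prob;

    double t0 = omp_get_wtime();

    // Method 1: parallelize the inner loop.
    for(j = 0; j < LATTICE_VW; ++j) {
        x = j*DX + LATTICE_W;
        #pragma omp parallel for ordered private(y, prob)
        for(i = 0; i < LATTICE_VH; ++i) {
            y = i*DY + LATTICE_S;
            prob = norm(psi[i][j]);   // std::norm instead of the member .norm()
            #pragma omp ordered
            out << x << " " << y << " " << prob << endl;
        }
    }

    cout << "method 1: " << omp_get_wtime() - t0 << " s" << endl;
    return 0;
}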
Look at this:
for(int x = 0; x < 4; ++x) {
    #pragma omp parallel for ordered
    for(int y = 0; y < 4; ++y) {
        #pragma omp ordered
        cout << x << ',' << y << " (by thread " << omp_get_thread_num() << ')' << endl;
    }
}
you get:
0,0 (by thread 0)
0,1 (by thread 1)
0,2 (by thread 2)
0,3 (by thread 3)
1,0 (by thread 0)
1,1 (by thread 1)
1,2 (by thread 2)
1,3 (by thread 3)
Each thread just has to wait for its turn at the cout; all the work before that can be done in parallel.
But with:
#pragma omp parallel for ordered
for(int x = 0; x < 4; ++x) {
    for(int y = 0; y < 4; ++y) {
        #pragma omp ordered
        cout << x << ',' << y << " (by thread " << omp_get_thread_num() << ')' << endl;
    }
}
and
#pragma omp parallel for collapse(2) ordered
for(int x = 0; x < 4; ++x) {
    for(int y = 0; y < 4; ++y) {
        #pragma omp ordered
        cout << x << ',' << y << " (by thread " << omp_get_thread_num() << ')' << endl;
    }
}
the situation is:
0,0 (by thread 0)
0,1 (by thread 0)
0,2 (by thread 0)
0,3 (by thread 0)
1,0 (by thread 1)
1,1 (by thread 1)
1,2 (by thread 1)
1,3 (by thread 1)
2,0 (by thread 2)
2,1 (by thread 2)
2,2 (by thread 2)
2,3 (by thread 2)
3,0 (by thread 3)
3,1 (by thread 3)
3,2 (by thread 3)
3,3 (by thread 3)
So thread 1 has to wait for thread 0 to finish all of its work before it can cout for the first time, and nearly nothing can be done in parallel.
Try adding schedule(static,1) to the collapse version and it should perform at least as well as the first version does.
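That is, something like this (only the schedule clause is new relative to your third method):

#pragma omp parallel for collapse(2) ordered schedule(static,1) private(x, y, prob)
for(j=0; j<LATTICE_VW; ++j) {
    for(i=0; i<LATTICE_VH; ++i) {
        x = j*DX + LATTICE_W;
        y = i*DY + LATTICE_S;
        prob = psi[i][j].norm();
        #pragma omp ordered
        out << x << " " << y << " " << prob << endl;
    }
}

With a chunk size of 1 the collapsed iterations are dealt out to the threads round-robin, so consecutive ordered writes come from different threads, just as in the first version, and each thread can compute its next point while waiting for its turn at out.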