I was trying to solve this openmp tutorial exercise in which I was required to parallelize the following serial code:
/*
** PROGRAM: A simple serial producer/consumer program
**
** One function generates (i.e. produces) an array of random values.
** A second functions consumes that array and sums it.
**
** HISTORY: Written by Tim Mattson, April 2007.
*/
#include <omp.h>
#ifdef APPLE
#include <stdlib.h>
#else
#include <malloc.h>
#endif
#include <stdio.h>
#define N 10000
/* Some random number constants from numerical recipies */
#define SEED 2531
#define RAND_MULT 1366
#define RAND_ADD 150889
#define RAND_MOD 714025
int randy = SEED;
/* function to fill an array with random numbers */
void fill_rand(int length, double *a)
{
int i;
for (i=0;i<length;i++) {
randy = (RAND_MULT * randy + RAND_ADD) % RAND_MOD;
*(a+i) = ((double) randy)/((double) RAND_MOD);
}
}
/* function to sum the elements of an array */
double Sum_array(int length, double *a)
{
int i; double sum = 0.0;
for (i=0;i<length;i++) sum += *(a+i);
return sum;
}
int main()
{
double *A, sum, runtime;
int flag = 0;
A = (double *)malloc(N*sizeof(double));
runtime = omp_get_wtime();
fill_rand(N, A); // Producer: fill an array of data
sum = Sum_array(N, A); // Consumer: sum the array
runtime = omp_get_wtime() - runtime;
printf(" In %f seconds, The sum is %f \n",runtime,sum);
}
I came up with the following solution for this exercise. First I parallelizes both the producer and the consumer using #pragma omp parallel for
and then I added a barrier and a flush between the calls to the functions. Here is my code:
/*
** PROGRAM: A simple serial producer/consumer program
**
** One function generates (i.e. produces) an array of random values.
** A second functions consumes that array and sums it.
**
** HISTORY: Written by Tim Mattson, April 2007.
*/
#include <omp.h>
#ifdef APPLE
#include <stdlib.h>
#else
#include <malloc.h>
#endif
#include <stdio.h>
#define N 1000000000
/* Some random number constants from numerical recipies */
#define SEED 2531
#define RAND_MULT 1366
#define RAND_ADD 150889
#define RAND_MOD 714025
int randy = SEED;
/* function to fill an array with random numbers */
void fill_rand(int length, double *a)
{
int i;
#pragma omp parallel for schedule(static)
for (i=0;i<length;i++) {
randy = (RAND_MULT * randy + RAND_ADD) % RAND_MOD;
*(a+i) = ((double) randy)/((double) RAND_MOD);
}
}
/* function to sum the elements of an array */
double Sum_array(int length, double *a)
{
int i; double sum = 0.0;
#pragma omp parallel for reduction(+:sum) schedule(static)
for (i=0;i<length;i++) sum += *(a+i);
return sum;
}
int main()
{
double *A, sum, runtime;
int flag = 0;
int __flag;
A = (double *)malloc(N*sizeof(double));
runtime = omp_get_wtime();
fill_rand(N, A); // Producer: fill an array of data
#pragma omp barrier
#pragma omp flush
sum = Sum_array(N, A); // Consumer: sum the array
runtime = omp_get_wtime() - runtime;
printf(" In %f seconds, The sum is %f \n",runtime,sum);
}
However there is a race condition in this code as when I run it multiple times, I get slightly different values. The solution to this problem, as per the tutorial, uses the producer-consumer pattern. It first runs the fill_rand
function. Then after that function finishes its execution, the code sets up a flag variable to instruct the consumer to start executing. Of course, it also adds flushes between the two sections of producer and consumer. To me this code looks similar to my solution. As far as I understand it, both pieces of code first run the producer, the flush the array to the memory, and then run the consumer to get the result. My code, however runs twice as fast, probably due to less cache flushes. But, my code seems to have some race conditions. Here is the provided solution which does not have any race conditions:
/*
** PROGRAM: A simple serial producer/consumer program
**
** One function generates (i.e. produces) an array of random values.
** A second functions consumes that array and sums it.
**
** HISTORY: Written by Tim Mattson, April 2007.
*/
#include <omp.h>
#ifdef APPLE
#include <stdlib.h>
#else
#include <malloc.h>
#endif
#include <stdio.h>
#define N 1000000000
/* Some random number constants from numerical recipies */
#define SEED 2531
#define RAND_MULT 1366
#define RAND_ADD 150889
#define RAND_MOD 714025
int randy = SEED;
/* function to fill an array with random numbers */
void fill_rand(int length, double *a)
{
int i;
#pragma omp parallel for schedule(static)
for (i=0;i<length;i++) {
randy = (RAND_MULT * randy + RAND_ADD) % RAND_MOD;
*(a+i) = ((double) randy)/((double) RAND_MOD);
}
}
/* function to sum the elements of an array */
double Sum_array(int length, double *a)
{
int i; double sum = 0.0;
#pragma omp parallel for reduction(+:sum) schedule(static)
for (i=0;i<length;i++) sum += *(a+i);
return sum;
}
int main()
{
double *A, sum, runtime;
int flag = 0;
int __flag;
A = (double *)malloc(N*sizeof(double));
runtime = omp_get_wtime();
#pragma omp parallel sections
{
#pragma omp section
{
fill_rand(N, A); // Producer: fill an array of data
#pragma omp flush
#pragma omp atomic write
flag = 1;
#pragma omp flush(flag)
}
#pragma omp section
{
#pragma omp flush(flag)
while(1)
{
#pragma omp flush(flag)
#pragma omp atomic read
__flag = flag;
if(__flag == 1) break;
}
#pragma omp flush
sum = Sum_array(N, A); // Consumer: sum the array
}
runtime = omp_get_wtime() - runtime;
}
printf(" In %f seconds, The sum is %f \n",runtime,sum);
}
I can't figure out what I am doing wrong. Can someone please help me out.
I also tried running the following code which adds the barrier and the flush in the parallel region, but even this version has a race condition somewhere.
/*
** PROGRAM: A simple serial producer/consumer program
**
** One function generates (i.e. produces) an array of random values.
** A second functions consumes that array and sums it.
**
** HISTORY: Written by Tim Mattson, April 2007.
*/
#include <omp.h>
#ifdef APPLE
#include <stdlib.h>
#else
#include <malloc.h>
#endif
#include <stdio.h>
#define N 10000
/* Some random number constants from numerical recipies */
#define SEED 2531
#define RAND_MULT 1366
#define RAND_ADD 150889
#define RAND_MOD 714025
int randy = SEED;
/* function to fill an array with random numbers */
void fill_rand(int length, double *a)
{
int i;
#pragma omp parallel
{
#pragma omp for schedule(static)
for (i=0;i<length;i++) {
randy = (RAND_MULT * randy + RAND_ADD) % RAND_MOD;
*(a+i) = ((double) randy)/((double) RAND_MOD);
}
#pragma omp barrier
#pragma omp flush
}
}
/* function to sum the elements of an array */
double Sum_array(int length, double *a)
{
int i; double sum = 0.0;
#pragma omp parallel for reduction(+:sum) schedule(static)
for (i=0;i<length;i++) sum += *(a+i);
return sum;
}
int main()
{
double *A, sum, runtime;
int flag = 0;
int __flag;
A = (double *)malloc(N*sizeof(double));
runtime = omp_get_wtime();
fill_rand(N, A); // Producer: fill an array of data
sum = Sum_array(N, A); // Consumer: sum the array
runtime = omp_get_wtime() - runtime;
printf(" In %f seconds, The sum is %f \n",runtime,sum);
}
Update:
As suggested by some comments, the problem was with the global variable randy
which was being treated as a shared variable by default by openmp. So I changed that variable to firstprivate and now the run to run variation is no longer there. I suppose that the randy
variable was the source of the race condition. However, after making this change in both the provided solution and my own solution, I am getting different answers from both the programs. Note that there is no run to run variation, but the answers that I am getting from both the versions is different. Moreover, the answer that I get when I run my version changes depending on the number of threads that I use. Again, I am not sure why this is happening.
Here is the solution provided by the tutorial which results in the same answer regardless of the number of threads:
/*
** PROGRAM: A simple serial producer/consumer program
**
** One function generates (i.e. produces) an array of random values.
** A second functions consumes that array and sums it.
**
** HISTORY: Written by Tim Mattson, April 2007.
*/
#include <omp.h>
#ifdef APPLE
#include <stdlib.h>
#else
#include <malloc.h>
#endif
#include <stdio.h>
#define N 10000
/* Some random number constants from numerical recipies */
#define SEED 2531
#define RAND_MULT 1366
#define RAND_ADD 150889
#define RAND_MOD 714025
int randy = SEED;
/* function to fill an array with random numbers */
void fill_rand(int length, double *a)
{
int i;
#pragma omp parallel for schedule(static) firstprivate(randy)
for (i=0;i<length;i++) {
randy = (RAND_MULT * randy + RAND_ADD) % RAND_MOD;
*(a+i) = ((double) randy)/((double) RAND_MOD);
}
}
/* function to sum the elements of an array */
double Sum_array(int length, double *a)
{
int i; double sum = 0.0;
#pragma omp parallel for reduction(+:sum) schedule(static)
for (i=0;i<length;i++) sum += *(a+i);
return sum;
}
int main()
{
double *A, sum, runtime;
int flag = 0;
int __flag;
A = (double *)malloc(N*sizeof(double));
runtime = omp_get_wtime();
#pragma omp parallel sections
{
#pragma omp section
{
fill_rand(N, A); // Producer: fill an array of data
#pragma omp flush
#pragma omp atomic write
flag = 1;
#pragma omp flush(flag)
}
#pragma omp section
{
#pragma omp flush(flag)
while(1)
{
#pragma omp flush(flag)
#pragma omp atomic read
__flag = flag;
if(__flag == 1) break;
}
#pragma omp flush
sum = Sum_array(N, A); // Consumer: sum the array
}
runtime = omp_get_wtime() - runtime;
}
printf(" In %f seconds, The sum is %f \n",runtime,sum);
}
And here is my solution which results in the different answers depending on the number of threads:
/*
** PROGRAM: A simple serial producer/consumer program
**
** One function generates (i.e. produces) an array of random values.
** A second functions consumes that array and sums it.
**
** HISTORY: Written by Tim Mattson, April 2007.
*/
#include <omp.h>
#ifdef APPLE
#include <stdlib.h>
#else
#include <malloc.h>
#endif
#include <stdio.h>
#define N 10000
/* Some random number constants from numerical recipies */
#define SEED 2531
#define RAND_MULT 1366
#define RAND_ADD 150889
#define RAND_MOD 714025
int randy = SEED;
/* function to fill an array with random numbers */
void fill_rand(int length, double *a)
{
int i;
#pragma omp parallel
{
#pragma omp for schedule(static) firstprivate(randy)
for (i=0;i<length;i++) {
randy = (RAND_MULT * randy + RAND_ADD) % RAND_MOD;
*(a+i) = ((double) randy)/((double) RAND_MOD);
}
#pragma omp barrier
#pragma omp flush
}
}
/* function to sum the elements of an array */
double Sum_array(int length, double *a)
{
int i; double sum = 0.0;
#pragma omp parallel for reduction(+:sum) schedule(static)
for (i=0;i<length;i++) sum += *(a+i);
return sum;
}
int main()
{
double *A, sum, runtime;
int flag = 0;
int __flag;
A = (double *)malloc(N*sizeof(double));
runtime = omp_get_wtime();
fill_rand(N, A); // Producer: fill an array of data
sum = Sum_array(N, A); // Consumer: sum the array
runtime = omp_get_wtime() - runtime;
printf(" In %f seconds, The sum is %f \n",runtime,sum);
}
Update #2:
The problem was indeed with the variable randy
. Refer to my answer below for more information. Thanks!
As suggested by the comments, the problem was with the variable randy
. Declaring it as threadprivate solved the issue. Thanks for the help everyone! For the sake of completion, here is the working code:
/*
** PROGRAM: A simple serial producer/consumer program
**
** One function generates (i.e. produces) an array of random values.
** A second functions consumes that array and sums it.
**
** HISTORY: Written by Tim Mattson, April 2007.
*/
#include <omp.h>
#ifdef APPLE
#include <stdlib.h>
#else
#include <malloc.h>
#endif
#include <stdio.h>
#define N 10000
/* Some random number constants from numerical recipies */
#define SEED 2531
#define RAND_MULT 1366
#define RAND_ADD 150889
#define RAND_MOD 714025
int randy = SEED;
#pragma omp threadprivate(randy)
/* function to fill an array with random numbers */
void fill_rand(int length, double *a)
{
int i;
#pragma omp for schedule(static)
for (i=0;i<length;i++) {
randy = (RAND_MULT * randy + RAND_ADD) % RAND_MOD;
*(a+i) = ((double) randy)/((double) RAND_MOD);
}
}
/* function to sum the elements of an array */
double Sum_array(int length, double *a)
{
int i; double sum = 0.0;
#pragma omp parallel for reduction(+:sum) schedule(static)
for (i=0;i<length;i++) sum += *(a+i);
return sum;
}
int main()
{
double *A, sum, runtime;
int flag = 0;
int __flag;
A = (double *)malloc(N*sizeof(double));
runtime = omp_get_wtime();
fill_rand(N, A); // Producer: fill an array of data
sum = Sum_array(N, A); // Consumer: sum the array
runtime = omp_get_wtime() - runtime;
printf(" In %f seconds, The sum is %f \n",runtime,sum);
}