The serial version takes less time than the parallel one.
/*Serial Version*/
double start = omp_get_wtime();
for (i = 0; i < 1100; i++) {
for (j = i; j < i + 4; j++) {
fprintf(new_file, "%f ", S[j]);
}
fprintf(new_file, "\n");
m = compute_m(S + i, 4);
find_min_max(S + i, 4, &min, &max);
S_i = inf(m, min, b);
S_s = sup(m, max, b);
if (S[i + 2] < S_i)
Res[i] = S_i;
else if (S[i + 2] > S_s)
Res[i] = S_s;
else
Res[i] = ECG[i + 2];
fprintf(output_f, "%f\n", Res[i]);
}
double end = omp_get_wtime();
printf("\n ------------- TIMING :: Serial Version -------------- ");
printf("\nStart = %.16g\nend = %.16g\nDiff_time = %.16g\n", start, end, end - start);
/*Parallel version*/
/*
 * Sketch of the parallel attempt from the question; "serial code ..." stands
 * for the corresponding statements of the serial loop body.
 */
double start = omp_get_wtime();
#pragma omp parallel for
for (i = 0; i < 1100; i++) {
    /*
     * NOTE(review): this directive opens a second, nested parallel region
     * around a loop of only four iterations, once per outer iteration.
     * Variables assigned inside the loop bodies presumably also need to be
     * private to each thread - confirm against the elided code.
     */
    #pragma omp parallel for
    for (j = i; j < i + 4; j++) {
        serial code ...
    }
    serial code ...
}
double end = omp_get_wtime();
printf("\n ------------- TIMING :: Parallel Version -------------- ");
printf("\nStart = %.16g\nend = %.16g\nDiff_time = %.16g\n", start, end, end - start);
I have tried multiple times, and the serial execution is always faster. Why?
Why is the serial execution faster here? Am I calculating the execution time in the right way?
Assuming that compute_m does not write to S and that find_min_max does not write to S or read from min and max, this should work.
/*Parallel Version A*/
double start = omp_get_wtime();
const int nThreads = omp_get_max_threads();
#pragma omp parallel sections num_threads(2) default(none) shared(S, Res, ECG, b, min, max, m, S_i, S_s, nThreads)
{
#pragma omp section
for (i = 0; i < 1100; i++) {
for (j = i; j < i + 4; j++) {
fprintf(new_file, "%f ", S[j]);
}
fprintf(new_file, "\n");
}
#pragma omp section
{
#pragma omp parallel for num_threads(nThreads - 1) default(none) shared(S, Res, ECG, b) private(min, max, m, S_i, S_s)
for (i = 0; i < 1100; i++) {
m = compute_m(S + i, 4);
find_min_max(S + i, 4, &min, &max);
S_i = inf(m, min, b);
S_s = sup(m, max, b);
if (S[i + 2] < S_i)
Res[i] = S_i;
else if (S[i + 2] > S_s)
Res[i] = S_s;
else
Res[i] = ECG[i + 2];
}
for (i = 0; i < 1100; i++) {
fprintf(output_f, "%f\n", Res[i]);
}
}
}
double end = omp_get_wtime();
printf("\n ------------- TIMING :: Parallel Version A -------------- ");
printf("\nStart = %.16g\nend = %.16g\nDiff_time = %.16g\n", start, end, end - start);
Another, slightly less complicated solution would be this one:
/*Parallel Version B*/
/*
 * One parallel region with two phases: first all threads share the
 * computation of Res, then two sections write S (to new_file) and Res
 * (to output_f) concurrently, one thread per file.
 */
double start = omp_get_wtime();
#pragma omp parallel default(none) shared(S, Res, ECG, b, new_file, output_f) private(min, max, m, S_i, S_s)
{
    /* Loop indices are declared in the loops, so every thread and every
     * section has its own and none has to appear in a data-sharing clause. */
    #pragma omp for
    for (int k = 0; k < 1100; k++) {
        m = compute_m(S + k, 4);
        find_min_max(S + k, 4, &min, &max);
        S_i = inf(m, min, b);
        S_s = sup(m, max, b);
        if (S[k + 2] < S_i)
            Res[k] = S_i;
        else if (S[k + 2] > S_s)
            Res[k] = S_s;
        else
            Res[k] = ECG[k + 2];
    }
    /* The implicit barrier at the end of the loop above guarantees that Res
     * is complete before it is written out. */
    #pragma omp sections
    {
        #pragma omp section
        {
            for (int row = 0; row < 1100; row++) {
                for (int col = row; col < row + 4; col++) {
                    fprintf(new_file, "%f ", S[col]);
                }
                fprintf(new_file, "\n");
            }
        }
        #pragma omp section
        {
            for (int k = 0; k < 1100; k++) {
                fprintf(output_f, "%f\n", Res[k]);
            }
        }
    }
}
double end = omp_get_wtime();
printf("\n ------------- TIMING :: Parallel Version B -------------- ");
printf("\nStart = %.16g\nend = %.16g\nDiff_time = %.16g\n", start, end, end - start);
In the first version the calculation happens in parallel with writing out S, in the second version the calculations happen first, before S and Res are written to file in parallel. I wouldn't bet on which one is faster, so just try it out on your hardware.
These can still be slower than the serial version, because spawning threads always has some overhead.