Parallel execution using OpenMP takes longer than serial execution in C?

The serial version takes less time than the parallel one.

/*Serial Version*/
/*
 * Baseline: processes 1100 sliding windows of 4 samples from S, one after
 * another, and times the whole loop with omp_get_wtime().
 * The timed interval includes every fprintf() call made inside the loop.
 */
double start = omp_get_wtime();

for (i = 0; i < 1100; i++) {
    /* Dump the current window S[i] .. S[i + 3] on one line of new_file. */
    for (j = i; j < i + 4; j++) {
        fprintf(new_file, "%f  ", S[j]);
    }
    fprintf(new_file, "\n");
    /* Per-window values computed from the same 4 samples. */
    m = compute_m(S + i, 4);
    find_min_max(S + i, 4, &min, &max);

    /* Lower and upper limits derived from m, min/max and b. */
    S_i = inf(m, min, b); 
    S_s = sup(m, max, b); 

    /*
     * Res[i] is S_i when S[i + 2] is below S_i, S_s when it is above S_s,
     * and ECG[i + 2] otherwise.
     * NOTE(review): the comparison reads S but the fallback reads ECG --
     * confirm that two different arrays are intended here.
     */
    if (S[i + 2] < S_i)
        Res[i] = S_i;
    else if (S[i + 2] > S_s)
        Res[i] = S_s;
    else
        Res[i] = ECG[i + 2];
    /* The result is written out immediately, inside the timed loop. */
    fprintf(output_f, "%f\n", Res[i]);
}

    

    /* Second timestamp; end - start is the elapsed time of the loop above. */
    double end = omp_get_wtime();
    printf("\n ------------- TIMING :: Serial Version -------------- ");
    printf("\nStart = %.16g\nend = %.16g\nDiff_time = %.16g\n", start, end, end - start);

/*Parallel Version*/
    /*
     * Parallel attempt from the question: the same loop nest as the serial
     * version with an OpenMP work-sharing pragma on both loops.
     * "serial code ..." is a placeholder for the statements of the serial
     * loop body.
     */
    double start = omp_get_wtime();
#pragma omp parallel for
    for (i = 0; i < 1100; i++) {
        /*
         * NOTE(review): this opens another parallel region inside every one
         * of the 1100 outer iterations, for a loop of only 4 iterations; the
         * cost of starting the region is likely larger than the work it
         * distributes.
         */
#pragma omp parallel for
        for (j = i; j < i + 4; j++) {
            serial code ...
        }
        /*
         * NOTE(review): no private clause is given; if the temporaries used
         * by the elided body are declared outside the loop they are shared
         * between threads -- confirm against the full source.
         */
        serial code ...
    }
double end = omp_get_wtime();
    /* NOTE(review): the banner still reads "Serial Version" although this
     * block times the parallel loop. */
    printf("\n ------------- TIMING :: Serial Version -------------- ");
    printf("\nStart = %.16g\nend = %.16g\nDiff_time = %.16g\n", start, end, end - start);

I have tried multiple times, and the serial execution is always faster. Why?

Why is serial execution faster here? Am I calculating the execution time in the right way?

Solution

Assuming that compute_m and find_min_max do not write to S, and that find_min_max only writes to min and max without reading them first, this should work.

/*Parallel Version A*/
double start = omp_get_wtime();

const int nThreads = omp_get_max_threads();

#pragma omp parallel sections num_threads(2) default(none) shared(S, Res, ECG, b, min, max, m, S_i, S_s, nThreads)
{
#pragma omp section
    for (i = 0; i < 1100; i++) {
        for (j = i; j < i + 4; j++) {
            fprintf(new_file, "%f  ", S[j]);
        }
        fprintf(new_file, "\n");
    }
#pragma omp section
    {
#pragma omp parallel for num_threads(nThreads - 1) default(none) shared(S, Res, ECG, b) private(min, max, m, S_i, S_s)
        for (i = 0; i < 1100; i++) {
            m = compute_m(S + i, 4);
            find_min_max(S + i, 4, &min, &max);

            S_i = inf(m, min, b); 
            S_s = sup(m, max, b); 

            if (S[i + 2] < S_i)
                Res[i] = S_i;
            else if (S[i + 2] > S_s)
                Res[i] = S_s;
            else
                Res[i] = ECG[i + 2];
        }
        for (i = 0; i < 1100; i++) {
            fprintf(output_f, "%f\n", Res[i]);
        }
    }
}

double end = omp_get_wtime();
printf("\n ------------- TIMING :: Parallel Version A -------------- ");
printf("\nStart = %.16g\nend = %.16g\nDiff_time = %.16g\n", start, end, end - start);

Another, slightly less complicated solution would be this one:

/*Parallel Version B*/
/*
 * One parallel region with two phases:
 *   1. all threads share the computation of Res;
 *   2. two sections write S and Res to their files concurrently.
 */
double start = omp_get_wtime();

/*
 * default(none) requires every variable referenced in the region to be
 * listed. i and j are also used by the plain loops inside the sections, so
 * they must be private to avoid two threads sharing one counter; the FILE
 * pointers are only passed to fprintf() and are shared.
 */
#pragma omp parallel default(none) shared(S, Res, ECG, b, new_file, output_f) private(i, j, min, max, m, S_i, S_s)
{
#pragma omp for
    for (i = 0; i < 1100; i++) {
        m = compute_m(S + i, 4);
        find_min_max(S + i, 4, &min, &max);

        S_i = inf(m, min, b);
        S_s = sup(m, max, b);

        if (S[i + 2] < S_i)
            Res[i] = S_i;
        else if (S[i + 2] > S_s)
            Res[i] = S_s;
        else
            Res[i] = ECG[i + 2];
    }
    /* The implicit barrier at the end of the omp for guarantees that Res is
     * fully computed before any section starts writing it. */

#pragma omp sections
    {
#pragma omp section
        {
            /* One line per window: S[i] .. S[i + 3]. */
            for (i = 0; i < 1100; i++) {
                for (j = i; j < i + 4; j++) {
                    fprintf(new_file, "%f  ", S[j]);
                }
                fprintf(new_file, "\n");
            }
        }
#pragma omp section
        {
            for (i = 0; i < 1100; i++) {
                fprintf(output_f, "%f\n", Res[i]);
            }
        }
    }
}

double end = omp_get_wtime();
printf("\n ------------- TIMING :: Parallel Version B -------------- ");
printf("\nStart = %.16g\nend = %.16g\nDiff_time = %.16g\n", start, end, end - start);

In the first version the calculation happens in parallel with writing out S; in the second version the calculations happen first, and only then are S and Res written to file in parallel. I wouldn't bet on which one is faster, so just try it out on your hardware.

These can still be slower than the serial version, because spawning threads always has some overhead.