Search code examples
c, parallel-processing, openmp

Parallel execution using OpenMP takes longer than serial execution in C?


The serial version takes less time than the parallel one.

/*Serial Version*/
double start = omp_get_wtime();

for (i = 0; i < 1100; i++) {
    for (j = i; j < i + 4; j++) {
        fprintf(new_file, "%f  ", S[j]);
    }
    fprintf(new_file, "\n");
    m = compute_m(S + i, 4);
    find_min_max(S + i, 4, &min, &max);

    S_i = inf(m, min, b); 
    S_s = sup(m, max, b); 

    if (S[i + 2] < S_i)
        Res[i] = S_i;
    else if (S[i + 2] > S_s)
        Res[i] = S_s;
    else
        Res[i] = ECG[i + 2];
    fprintf(output_f, "%f\n", Res[i]);
}

    

    double end = omp_get_wtime();
    printf("\n ------------- TIMING :: Serial Version -------------- ");
    printf("\nStart = %.16g\nend = %.16g\nDiff_time = %.16g\n", start, end, end - start);

enter image description here

/*
 * Parallel attempt from the question: the serial loop nest with a
 * "parallel for" directive on both the outer and the inner loop, timed with
 * omp_get_wtime(). The loop bodies are elided ("serial code ...").
 */
#Parallel version 
    double start = omp_get_wtime();
/* Outer directive: distributes the 1100 iterations over a thread team. */
/* NOTE(review): no private() clause is shown; if the elided body assigns the
 * same scalars as the serial version (m, min, max, S_i, S_s), they would be
 * shared between threads here - confirm against the full code. */
#pragma omp parallel for
    for (i = 0; i < 1100; i++) {
/* NOTE(review): this opens another parallel region on every outer iteration
 * for a loop of only 4 iterations; the cost of setting up the region
 * presumably outweighs the work it parallelizes. */
#pragma omp parallel for
        for (j = i; j < i + 4; j++) {
            serial code ...
        }
        serial code ...
    }
double end = omp_get_wtime();
    /* NOTE(review): the banner below still reads "Serial Version" although
     * this is the timing of the parallel variant. */
    printf("\n ------------- TIMING :: Serial Version -------------- ");
    printf("\nStart = %.16g\nend = %.16g\nDiff_time = %.16g\n", start, end, end - start);

enter image description here

I have tried multiple times, and the serial execution is always faster. Why?

Why is serial execution faster here? Am I calculating the execution time in the right way?


Solution

  • Assuming that compute_m does not write to S and that find_min_max does not write to S_i or read from min and max, this should work.

    /*Parallel Version A*/
    double start = omp_get_wtime();
    
    const int nThreads = omp_get_max_threads();
    
    #pragma omp parallel sections num_threads(2) default(none) shared(S, Res, ECG, b, min, max, m, S_i, S_s, nThreads)
    {
    #pragma omp section
        for (i = 0; i < 1100; i++) {
            for (j = i; j < i + 4; j++) {
                fprintf(new_file, "%f  ", S[j]);
            }
            fprintf(new_file, "\n");
        }
    #pragma omp section
        {
    #pragma omp parallel for num_threads(nThreads - 1) default(none) shared(S, Res, ECG, b) private(min, max, m, S_i, S_s)
            for (i = 0; i < 1100; i++) {
                m = compute_m(S + i, 4);
                find_min_max(S + i, 4, &min, &max);
    
                S_i = inf(m, min, b); 
                S_s = sup(m, max, b); 
    
                if (S[i + 2] < S_i)
                    Res[i] = S_i;
                else if (S[i + 2] > S_s)
                    Res[i] = S_s;
                else
                    Res[i] = ECG[i + 2];
            }
            for (i = 0; i < 1100; i++) {
                fprintf(output_f, "%f\n", Res[i]);
            }
        }
    }
    
    double end = omp_get_wtime();
    printf("\n ------------- TIMING :: Parallel Version A -------------- ");
    printf("\nStart = %.16g\nend = %.16g\nDiff_time = %.16g\n", start, end, end - start);
    

    Another, somewhat less complicated, solution would be this one:

    /*Parallel Version B*/
    /*
     * Single parallel region in two phases:
     *   1. a work-shared loop computes Res;
     *   2. two sections write S (to new_file) and Res (to output_f)
     *      concurrently.
     * Every loop uses a block-local index, so the two sections do not race
     * on a shared counter, and the FILE handles are listed in shared()
     * because default(none) requires every referenced variable to have an
     * explicit data-sharing attribute.
     */
    double start = omp_get_wtime();

    #pragma omp parallel default(none) shared(S, Res, ECG, b, new_file, output_f) private(min, max, m, S_i, S_s)
    {
        /* Each iteration writes only Res[k]; the scratch scalars are
         * private per thread. */
    #pragma omp for
        for (int k = 0; k < 1100; k++) {
            m = compute_m(S + k, 4);
            find_min_max(S + k, 4, &min, &max);

            S_i = inf(m, min, b);
            S_s = sup(m, max, b);

            if (S[k + 2] < S_i) {
                Res[k] = S_i;
            } else if (S[k + 2] > S_s) {
                Res[k] = S_s;
            } else {
                Res[k] = ECG[k + 2];
            }
        }
        /* The implicit barrier at the end of the loop construct guarantees
         * that Res is complete before any section starts reading it. */

    #pragma omp sections
        {
    #pragma omp section
            {
                for (int row = 0; row < 1100; row++) {
                    for (int col = row; col < row + 4; col++) {
                        fprintf(new_file, "%f  ", S[col]);
                    }
                    fprintf(new_file, "\n");
                }
            }
    #pragma omp section
            {
                for (int k = 0; k < 1100; k++) {
                    fprintf(output_f, "%f\n", Res[k]);
                }
            }
        }
    }

    double end = omp_get_wtime();
    printf("\n ------------- TIMING :: Parallel Version B -------------- ");
    printf("\nStart = %.16g\nend = %.16g\nDiff_time = %.16g\n", start, end, end - start);
    

    In the first version the calculation happens in parallel with writing out S, in the second version the calculations happen first, before S and Res are written to file in parallel. I wouldn't bet on which one is faster, so just try it out on your hardware.

    These can still be slower than the serial version, because spawning threads always has some overhead.