Search code examples
cmpiopenmphpcreduction

How to implement MPI_MINLOC in OpenMP?


How can I compute a global minimum and an index attached to the minimum value in OpenMP using the C language?

Can I obtain a real benefit with few threads?

// thread-private result
double mymin = DBL_MAX;
double myloc = SIZE_MAX;

// bottleneck parallel part
#pragma omp for reduction(min:gmin)
for (size_t i=0; i<n; ++i) {
    if (v[i] < mymin) {
        mymin = v[i];
        gmin = v[i];
        myloc = i;
    }
}

if(gmin == mymin) {
// find global result
  #pragma omp single
  {
     gloc = myloc;
  }
}

Solution

  • Yes, you can obtain a benefit for large problems using OpenMP here. The benefit will be realized when the input is large enough to be memory-bandwidth-limited and multiple threads provide higher memory bandwidth. This will almost certainly be limited to inputs that do not fit in the last level of cache.

    I've included a simple example program on how to do this. This implementation uses old and widely-available features. It may be possible to use OpenMP 5.0 user-defined reductions but I doubt it this provides a noticeable benefit in performance and you'll likely find many implementations do not support this feature.

    Note that my example program does not time repeated tests nor do any statistics on the timings. Thus, you should not consider it a highly scientific test. However, it demonstrates a clear benefit on my dual-core laptop for an array of 100M doubles, as expected.

    $ make minloc && ./minloc 100000000
    icc -O3 -qopenmp -std=c11 minloc.c -o minloc
    MINLOC of 100000000 elements with 4 threads
    OpenMP: dt=0.076681, gmin=0.000000, gloc=4022958
    Sequential: dt=0.157333, gmin=0.000000, gloc=4022958
    SUCCESS
    

    Relevant Excerpt

            // thread-private result
            double mymin = DBL_MAX;
            double myloc = SIZE_MAX;
    
            // bottleneck parallel part
            #pragma omp for
            for (size_t i=0; i<n; ++i) {
                if (v[i] < mymin) {
                    mymin = v[i];
                    myloc = i;
                }
            }
    
            // write thread-private results to shared
            tmin[me] = mymin;
            tloc[me] = myloc;
            #pragma omp barrier
    
            // find global result
            #pragma omp master
            {
                for (int i=0; i<nt; ++i) {
                    if (tmin[i] < gmin) {
                        gmin = tmin[i];
                        gloc = tloc[i];
                    }
                }
            }
    

    Complete Example Program

    #include <stdio.h>
    #include <stdlib.h>
    
    #include <float.h>
    
    #include <omp.h>
    
    int main(int argc, char* argv[])
    {
        size_t n = (argc > 1) ? atol(argv[1]) : 100;
    
        double * v = malloc(n * sizeof(double));
        if (v==NULL) abort();
    
        // g = global
        double gmin = DBL_MAX;
        size_t gloc = SIZE_MAX;
    
        const int mt = omp_get_max_threads();
    
        printf("MINLOC of %zu elements with %d threads\n", n, mt);
    
        // t = thread
        double * tmin = malloc(mt * sizeof(double));
        size_t * tloc = malloc(mt * sizeof(size_t));
        if (tmin==NULL || tloc==NULL) abort();
    
        for (int i=0; i<mt; ++i) {
            tmin[i] = DBL_MAX;
            tloc[i] = SIZE_MAX;
        }
    
        double dt = 0.0;
    
        #pragma omp parallel firstprivate(n) shared(v, tmin, tloc, gmin, gloc, dt)
        {
            const int me = omp_get_thread_num();
            const int nt = omp_get_num_threads();
    
            unsigned int seed = (unsigned int)me;
            srand(seed);
            #pragma omp for
            for (size_t i=0; i<n; ++i) {
                // this is not a _good_ random number generator, but it does not matter for this use case
                double r = (double)rand_r(&seed) / (double)RAND_MAX;
                v[i] = r;
            }
    
            double t0 = 0.0;
    
            #pragma omp barrier
            #pragma omp master
            {
                t0 = omp_get_wtime();
            }
    
            // thread-private result
            double mymin = DBL_MAX;
            double myloc = SIZE_MAX;
    
            // bottleneck parallel part
            #pragma omp for
            for (size_t i=0; i<n; ++i) {
                if (v[i] < mymin) {
                    mymin = v[i];
                    myloc = i;
                }
            }
    
            // write thread-private results to shared
            tmin[me] = mymin;
            tloc[me] = myloc;
            #pragma omp barrier
    
            // find global result
            #pragma omp master
            {
                for (int i=0; i<nt; ++i) {
                    if (tmin[i] < gmin) {
                        gmin = tmin[i];
                        gloc = tloc[i];
                    }
                }
            }
    
            #pragma omp barrier
            #pragma omp master
            {
                double t1 = omp_get_wtime();
                dt = t1 - t0;
            }
    
    #if 0
            #pragma omp critical
            {
                printf("%d: mymin=%f, myloc=%zu\n", me, mymin, myloc);
                fflush(stdout);
            }
    #endif
        }
    
        printf("OpenMP: dt=%f, gmin=%f, gloc=%zu\n", dt, gmin, gloc);
        fflush(stdout);
    
        // sequential reference timing
        {
            double t0 = omp_get_wtime();
    
            double mymin = DBL_MAX;
            double myloc = SIZE_MAX;
    
            for (size_t i=0; i<n; ++i) {
                if (v[i] < mymin) {
                    mymin = v[i];
                    myloc = i;
                }
            }
    
            gmin = mymin;
            gloc = myloc;
    
            double t1 = omp_get_wtime();
            dt = t1 - t0;
        }
    
        printf("Sequential: dt=%f, gmin=%f, gloc=%zu\n", dt, gmin, gloc);
        fflush(stdout);
    
        // debug printing for toy inputs
        if (n<100) {
            for (size_t i=0; i<n; ++i) {
                printf("v[%zu]=%f\n", i , v[i]);
            }
            fflush(stdout);
        }
    
        free(v);
    
        printf("SUCCESS\n");
    
        return 0;
    }