c - Dynamically linking a PGI OpenACC-enabled library with gcc

Previously, I asked a question regarding the creation of a static library with PGI and linking it to a program that is built with gcc: c - Linking a PGI OpenACC-enabled library with gcc

Now I have the same question, but for dynamic linking: how can I build a program with gcc when my library is built as a shared library with PGI?

Also, consider the following requirements:

I want both of them to recognize the same OpenMP pragmas and routines. For instance, when I use an OpenMP critical region in the library, the whole program should be serialized at that section.
OpenACC pragmas are used in the library that was built with PGI.
The library must be loaded completely dynamically by my application, i.e., using dlopen to open the library and dlsym to look up its functions.
I also want my threads to be able to access the GPU simultaneously for data transfers and/or computations. For more details, see the following code snippets.

For instance, building and running the following library and main code produces this error: call to cuMemcpyHtoDAsync returned error 1: Invalid value

Note: When building the following code, I intentionally used LibGOMP (-lgomp) instead of PGI's OpenMP library (-lpgmp) in both cases, library and main.

Lib code:

#include <stdio.h>
#include <stdlib.h>
#include <openacc.h>
#include <omp.h>

/*
 * Sum the n elements of a inside an OpenACC compute region.
 *
 * The whole body is an unnamed OpenMP critical section, so concurrent callers
 * are serialized: only one thread at a time prints the device count, stages
 * the data and runs the loop.
 *
 * n: number of elements in a.
 * a: array of n doubles; copied in with "enter data copyin" before the loop
 *    and released with "exit data delete" afterwards. Only read, never written.
 *
 * Returns the accumulated sum (0 when the loop body never executes).
 */
double calculate_sum(int n, double *a) {
    double sum = 0;
    int i;

    #pragma omp critical
    {
        printf("Num devices: %d\n", acc_get_num_devices(acc_device_nvidia));

        #pragma acc enter data copyin(a[0:n])

        /*
         * The reduction is stated explicitly: without it, every iteration
         * updates the same scalar and a correct result depends on the
         * compiler detecting the reduction on its own.
         */
        #pragma acc parallel loop reduction(+:sum)
        for (i = 0; i < n; i++) {
            sum += a[i];
        }

        #pragma acc exit data delete(a[0:n])
    }

    return sum;
}


/*
 * Query how many NVIDIA devices the OpenACC runtime reports and, when at
 * least one exists, print the count together with the current device number.
 *
 * index: unused; kept so the signature stays compatible with existing callers.
 *
 * Returns the value of acc_get_num_devices(acc_device_nvidia).
 */
int ret_num_dev(int index) {
    (void) index;

    int dev = acc_get_num_devices(acc_device_nvidia);
    /*
     * dev is a device COUNT; comparing it with the acc_device_nvidia type
     * constant (as the previous code did) only matched by coincidence.
     */
    if (dev > 0)
        printf("Num devices: %d - Current device: %d\n", dev,
               acc_get_device_num(acc_device_nvidia));
    return dev;
}

I built the library with the following commands:

pgcc -acc -ta=nvidia:nordc -fPIC -c libmyacc.c

pgcc -shared -Wl,-soname,libctest.so.1 -o libmyacc.so -L/opt/pgi/linux86-64/16.5/lib -L/usr/lib64 -L/usr/lib/gcc/x86_64-redhat-linux/4.8.5 -laccapi -laccg -laccn -laccg2 -ldl -lcudadevice -lgomp -lnuma -lpthread -lnspgc -lpgc -lm -lgcc -lc -lgcc libmyacc.o

Main code:

#include <stdio.h>
#include <stdlib.h>
#include <omp.h>
#include <dlfcn.h>


#define N 1000

// Lazily initialised state shared by every thread of the program:
// the dlopen handle, a "load finished" flag, and the resolved entry point.
static void *lib_handle = NULL;
static int lib_loaded = 0;
static double (*calculate_sum2)(int , double *);

/*
 * Load the shared library on first use, then build an N-element array
 * (a[i] = i), pass it to the library's calculate_sum and print the result.
 *
 * May be called concurrently from an OpenMP parallel region: the one-time
 * load is guarded by a named critical section. Exits the process with
 * status 1 if the library or symbol cannot be resolved, or if the array
 * cannot be allocated.
 */
void call_lib_so() {

    // Every caller passes through this critical section, so a thread either
    // performs the load itself or observes a fully initialised pointer.
    // The flag is set LAST: setting it first would let another thread skip
    // the load and call through a still-NULL function pointer.
    #pragma omp critical(call_lib_so_init)
    {
        if (lib_loaded == 0) {
            char *error;

            lib_handle = dlopen("/home/millad/temp/gcc-pgi/libmyacc.so", RTLD_NOW);
            if (!lib_handle) {
                fprintf(stderr, "%s\n", dlerror());
                exit(1);
            }

            // Clear any stale error so the check after dlsym reports only
            // a failure of this lookup.
            dlerror();
            calculate_sum2 = (double (*)(int , double *)) dlsym(lib_handle, "calculate_sum");
            if ((error = dlerror()) != NULL)  {
                fprintf(stderr, "%s\n", error);
                exit(1);
            }

            lib_loaded = 1;
        }
    }

    // execute the function per call, each call on its own private array
    int n = N;
    double *a = malloc(sizeof *a * (size_t) n);
    if (a == NULL) {
        fprintf(stderr, "%s\n", "out of memory");
        exit(1);
    }
    for (int i = 0; i < n; i++)
        a[i] = 1.0 * i;

    double sum = (*calculate_sum2)(n, a);
    free(a);
    printf("-------- SUM: %.3f\n", sum);

    // The library is intentionally never dlclose()d: the handle is reused by
    // every later call for the lifetime of the process.
}


extern double calculate_sum(int n, double *a);

/*
 * Program entry point: invokes call_lib_so() from inside an OpenMP parallel
 * region, so the call is made once per thread of the team.
 *
 * NOTE: the parallel region only takes effect when the compiler has OpenMP
 * enabled (e.g. gcc -fopenmp); otherwise the pragma is ignored and the call
 * runs once on the main thread.
 *
 * Returns 0.
 */
int main() {

    // The array that used to be allocated and filled here was never read or
    // freed; call_lib_so() builds its own input, so it has been removed.

    // access and run OpenACC region with all threads
    #pragma omp parallel
    {
        call_lib_so();
    }

    return 0;
}

I built my main code with the following command, using gcc as described by Mat in my previous question:

gcc f1.c -L/opt/pgi/linux86-64/16.5/lib -L/usr/lib64 -L/usr/lib/gcc/x86_64-redhat-linux/4.8.5 -L. -laccapi -laccg -laccn -laccg2 -ldl -lcudadevice -lgomp -lnuma -lpthread -lnspgc -lpgc -lm -lgcc -lc -lgcc -lmyacc

Am I doing something wrong? Are the above steps correct?

Solution

Your code works correctly for me. I tried to use what you listed but needed to remove the "libctest.so", change the location where dlopen gets the so, and add "-DN=1024" on the gcc compilation line. After that, it compiled and ran fine.

% pgcc -acc -ta=nvidia:nordc -fPIC -c libmyacc.c -V16.5                            
% pgcc -shared -o libmyacc.so -L/opt/pgi/linux86-64/16.5/lib -L/usr/lib64 -L/usr/lib/gcc/x86_64-redhat-linux/4.8.5 -laccapi -laccg -laccn -laccg2 -ldl -lcudadevice -lgomp -lnuma -lpthread -lnspgc -lpgc -lm -lgcc -lc -lgcc libmyacc.o -V16.5
% gcc f1.c -L/proj/pgi/linux86-64/16.5/lib -L/usr/lib64 -L/usr/lib/gcc/x86_64-redhat-linux/4.8.5 -L. -laccapi -laccg -laccn -laccg2 -ldl -lcudadevice -lgomp -lnuma -lpthread -lnspgc -lpgc -lm -lgcc -lc -lgcc -lmyacc -DN=1024
% ./a.out
Num devices: 8
-------- SUM: 523776.000