I am trying to do the following: assume a system with heterogeneous processing units (PUs), including CPUs, GPUs, and Intel Xeon Phis. The GPU devices can also have different characteristics, so splitting the workload across these devices is not as simple as N/num_devices.
omp_set_num_threads(system->getPUCount());
#pragma omp parallel
{
unsigned int cpu_thread_id = omp_get_thread_num();
unsigned int num_cpu_threads = omp_get_num_threads();
Each thread iterates in a loop until the end of the data is reached.
PU pu = listOfPUs[cpu_thread_id];
//threads are active until all data is processed
while (finish_0 < N) {
//the my_start and my_finish are private to a specific device.
int my_start = 0;
int my_finish = 0;
I have set a constant chunk_size for each PU, and I create as many CPU threads as there are PUs, meaning that each CPU thread controls one of the PUs. Each thread determines the start and end of its own chunk of the data in a critical section:
#pragma omp critical (chunkdetermination_0)
{
start_0 = finish_0;
finish_0 = start_0 + pu.getChunkSize();
if(finish_0 > N)
finish_0 = N;
my_start = start_0;
my_finish = finish_0;
}
Now I check the type of the PU, and execute the corresponding kernel.
if(pu.getType() == GPU) {
int myN = my_finish-my_start;
CudaSafeCall(cudaSetDevice(pu.getId()));
unsigned int nbytes_per_kernel = sizeof(double)*myN;
//memory allocation
CudaSafeCall(cudaMalloc((void**)&d_a, nbytes_per_kernel));
CudaSafeCall(cudaMalloc((void**)&d_c, nbytes_per_kernel));
CudaSafeCall(cudaMemset(d_a, 0, nbytes_per_kernel));
CudaSafeCall(cudaMemset(d_c, 0, nbytes_per_kernel));
//data transfer
CudaSafeCall(cudaMemcpy(d_a, a+my_start, nbytes_per_kernel, cudaMemcpyHostToDevice));
CudaSafeCall(cudaMemcpy(d_c, c+my_start, nbytes_per_kernel, cudaMemcpyHostToDevice));
//block and grid values
dim3 gpu_threads(128);
dim3 gpu_blocks(myN/gpu_threads.x);
if( myN % gpu_threads.x != 0 ) gpu_blocks.x+=1;
//execute kernel
kernel_0<<<gpu_blocks,gpu_threads>>>( d_a, d_c, myN);
//data transfer device to host
CudaSafeCall(cudaMemcpy(c+my_start, d_c, nbytes_per_kernel, cudaMemcpyDeviceToHost));
//synchronize device
CudaSafeCall(cudaDeviceSynchronize());
//free device memory
CudaSafeCall(cudaFree(d_a));
CudaSafeCall(cudaFree(d_c));
}
When I test this code with one GPU, it works fine. However, when I test it with two GPUs, it does not work. I have also tried using CUDA streams, but unfortunately could not get that to work either.
Any suggestions on what I am doing wrong, or on how I should solve this problem?
Here is the complete example:
#include <omp.h>
#include <stdio.h>
#include <vector>
#include <iostream>
#include <sys/time.h>
#include <float.h>
#include <limits.h>
using namespace std;
#define CPU 0
#define GPU 1
#define MIC 2
class PU
{
public:
PU(int puId, int puType)
{
id = puId;
type = puType;
}
int getId() {
return id;
}
void setId(int puId) {
id = puId;
}
int getType() {
return type;
}
char * getTypeAsString() {
if(type == CPU)
return (char *) "CPU";
else if (type == GPU)
return (char *) "GPU";
else
return (char *) "MIC";
}
void setType(int puType) {
type = puType;
}
int getChunkSize() {
return chunkSize;
}
void setChunkSize(int puChunkSize) {
chunkSize = puChunkSize;
}
private:
int id;
int type;
int chunkSize;
};
class System
{
public:
System() {
numOfPUs = 0;
//Adding PU0 of type GPU to the system
PU * pu0 = new PU(0, GPU);
pu0->setChunkSize(262144);
listOfPUs.push_back(*pu0);
numOfPUs ++;
//Adding PU1 of type GPU to the system
PU * pu1 = new PU(1, GPU);
pu1->setChunkSize(262144);
listOfPUs.push_back(*pu1);
numOfPUs ++;
}
vector<PU> getPUs() {
return listOfPUs;
}
int getPUCount() {
return numOfPUs;
}
private:
vector<PU> listOfPUs;
int numOfPUs;
};
#define N 2097152
//********************** CUDA Error checker **********************
#define CUDA_ERROR_CHECK
#define CudaSafeCall( err ) __cudaSafeCall( err, __FILE__, __LINE__ )
#define CudaCheckError() __cudaCheckError( __FILE__, __LINE__ )
inline void __cudaSafeCall( cudaError err, const char *file, const int line )
{
#ifdef CUDA_ERROR_CHECK
if ( cudaSuccess != err )
{
fprintf( stderr, "cudaSafeCall() failed at %s:%i : %s\n",
file, line, cudaGetErrorString( err ) );
exit( -1 );
}
#endif
return;
}
inline void __cudaCheckError( const char *file, const int line )
{
#ifdef CUDA_ERROR_CHECK
cudaError err = cudaGetLastError();
if ( cudaSuccess != err )
{
fprintf( stderr, "cudaCheckError() failed at %s:%i : %s\n",
file, line, cudaGetErrorString( err ) );
exit( -1 );
}
// More careful checking. However, this will affect performance.
// Comment away if needed.
err = cudaDeviceSynchronize();
if( cudaSuccess != err )
{
fprintf( stderr, "cudaCheckError() with sync failed at %s:%i : %s\n",
file, line, cudaGetErrorString( err ) );
exit( -1 );
}
#endif
return;
}
//********************** CUDA Error checker **********************
__global__ void kernel_0(double * a, double * c, int len)
{
int idx = threadIdx.x + blockIdx.x * blockDim.x;
if (idx < len)
{
c[idx] = a[idx];
}
}
/*
* Start of Generated Code
* This code enables execution on both host CPUs and accelerating devices
*/
void hybrid_function_0 (double *a, double *c)
{
System * system = new System();
//device variable declarations
double *d_a;
double *d_c;
//start and finish determine the chunk size of a device
int start_0 = 0;
int finish_0 = 0;
vector<PU> listOfPUs = system->getPUs();
printf("Num devices = %d\n", system->getPUCount());
omp_set_num_threads(system->getPUCount());
//one thread of the CPU controls one GPU device. The rest of CPU threads will be used to process data.
#pragma omp parallel
{
unsigned int cpu_thread_id = omp_get_thread_num();
unsigned int num_cpu_threads = omp_get_num_threads();
PU pu = listOfPUs[cpu_thread_id];
//threads are active until all data is processed
while (finish_0 < N) {
//the my_start and my_finish are private to a specific device.
int my_start = 0;
int my_finish = 0;
//the determination of chunks should be performed sequentially, in order to avoid two or more devices processing the same data.
#pragma omp critical (chunkdetermination_0)
{
start_0 = finish_0;
finish_0 = start_0 + pu.getChunkSize();
if(finish_0 > N)
finish_0 = N;
my_start = start_0;
my_finish = finish_0;
}
//devices with id less than nDevices are GPU devices. The host CPU has id = nDevices
if(pu.getType() == GPU) {
int myN = my_finish-my_start;
printf("device_id\t%d\tpu_id\t%d\ttype\t%s\tprocessing\t%d-%d (%lu KB)\n", cpu_thread_id, pu.getId(), pu.getTypeAsString(), my_start, my_finish, sizeof(double)*myN/1000);
CudaSafeCall(cudaSetDevice(pu.getId()));
unsigned int nbytes_per_kernel = sizeof(double)*myN;
//memory allocation
CudaSafeCall(cudaMalloc((void**)&d_a, nbytes_per_kernel));
CudaSafeCall(cudaMalloc((void**)&d_c, nbytes_per_kernel));
CudaSafeCall(cudaMemset(d_a, 0, nbytes_per_kernel));
CudaSafeCall(cudaMemset(d_c, 0, nbytes_per_kernel));
//data transfer
CudaSafeCall(cudaMemcpy(d_a, a+my_start, nbytes_per_kernel, cudaMemcpyHostToDevice));
CudaSafeCall(cudaMemcpy(d_c, c+my_start, nbytes_per_kernel, cudaMemcpyHostToDevice));
//block and grid values
dim3 gpu_threads(128);
dim3 gpu_blocks(myN/gpu_threads.x);
if( myN % gpu_threads.x != 0 ) gpu_blocks.x+=1;
//execute kernel
kernel_0<<<gpu_blocks,gpu_threads>>>( d_a, d_c, myN);
//data transfer device to host
CudaSafeCall(cudaMemcpy(c+my_start, d_c, nbytes_per_kernel, cudaMemcpyDeviceToHost));
//synchronize device
CudaSafeCall(cudaDeviceSynchronize());
//free device memory
CudaSafeCall(cudaFree(d_a));
CudaSafeCall(cudaFree(d_c));
}
//execute on host
else if (pu.getType() == CPU) {
omp_set_num_threads(omp_get_max_threads());
#pragma omp parallel for
for (int i = my_start; i < my_finish; i++)
{
c[i] = a[i];
}
}
//execute on MIC
else if (pu.getType() == MIC) {
#pragma offload target(mic: cpu_thread_id) in(a[my_start:my_finish]) in(c[my_start:my_finish]) out(c[my_start:my_finish])
{
#pragma omp parallel for
for (int i = my_start; i < my_finish; i++)
{
c[i] = a[i];
}
}
}
}
}
}
/*
* End of Generated Code
*/
int main()
{
double *a, *b, *c;
double scalar;
/* Allocate memory on host */
a = (double*)malloc(sizeof(double)*N);
b = (double*)malloc(sizeof(double)*N);
c = (double*)malloc(sizeof(double)*N);
// omp_set_num_threads(omp_get_max_threads());
printf("OMP Max threads %d\n", omp_get_max_threads());
#pragma omp parallel
{
#pragma omp master
printf("OMP Num threads %d\n", omp_get_num_threads());
}
// #pragma omp parallel for
// for(int i = 0; i < 10; i++) {
// printf("I am thread %d\n", omp_get_thread_num());
// }
//initialization of variables
#pragma omp parallel for
for (int j=0; j<N; j++) {
a[j] = 1.0;
b[j] = 2.0;
c[j] = 0.0;
}
#pragma omp parallel for
for (int j = 0; j < N; j++)
a[j] = 2.0E0 * a[j];
scalar=3.0f;
printf("%s\n", "COPY Started");
hybrid_function_0(a, c);
printf("%s\n", "COPY Finished");
return 0;
}
You can compile it with:
nvcc mini.cu -o mini -Xcompiler "-fopenmp"
When assigning two or more GPUs to run this code, I receive different error messages, and sometimes it just hangs and does nothing. When I check the GPU status through nvidia-smi, it shows that the GPUs are executing this program, but the utilization is 0%.
Some of the errors include:
cudaSafeCall() failed at mini.cu:221 : invalid argument
cudaSafeCall() failed at mini.cu:221 : driver shutting down
I would imagine that the device variable declarations within hybrid_function_0 should be moved to within the main OpenMP parallel section, something like this:
#pragma omp parallel
{
unsigned int cpu_thread_id = omp_get_thread_num();
unsigned int num_cpu_threads = omp_get_num_threads();
//device variable declarations
double *d_a;
double *d_c;
...
As it stands, there is a memory race between threads: the shared device pointer values can be overwritten by allocations another thread makes on a different device, which leads to the sort of invalid argument errors you report in API calls that use those device pointers as arguments.
I would expect that the error could occur at any of the cudaMemset, cudaMemcpy, or cudaFree calls, or at the kernel launch in that function, and that the error might move from place to place depending on the behaviour of the competing threads.
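As a rough, untested sketch (reusing the names from your code, keeping the chunking loop as posted, and omitting the memsets and the host-to-device copy of c for brevity), the parallel region with thread-private device pointers could look something like this; the only additions of mine are the pointer declarations and a small guard for the case where another thread grabs the last chunk first:

#pragma omp parallel
{
    unsigned int cpu_thread_id = omp_get_thread_num();
    PU pu = listOfPUs[cpu_thread_id];
    //thread-private device pointers: each thread only ever sees the
    //allocations it made on the device it selected with cudaSetDevice
    double *d_a = NULL;
    double *d_c = NULL;
    while (finish_0 < N) {
        int my_start = 0;
        int my_finish = 0;
        #pragma omp critical (chunkdetermination_0)
        {
            start_0 = finish_0;
            finish_0 = start_0 + pu.getChunkSize();
            if(finish_0 > N)
                finish_0 = N;
            my_start = start_0;
            my_finish = finish_0;
        }
        //another thread may have taken the last chunk between the while
        //check and the critical section, so skip empty chunks
        if (my_finish <= my_start)
            break;
        if(pu.getType() == GPU) {
            int myN = my_finish - my_start;
            CudaSafeCall(cudaSetDevice(pu.getId()));
            unsigned int nbytes_per_kernel = sizeof(double)*myN;
            //these allocations now write into this thread's own pointers
            CudaSafeCall(cudaMalloc((void**)&d_a, nbytes_per_kernel));
            CudaSafeCall(cudaMalloc((void**)&d_c, nbytes_per_kernel));
            CudaSafeCall(cudaMemcpy(d_a, a+my_start, nbytes_per_kernel, cudaMemcpyHostToDevice));
            dim3 gpu_threads(128);
            dim3 gpu_blocks(myN/gpu_threads.x);
            if( myN % gpu_threads.x != 0 ) gpu_blocks.x+=1;
            kernel_0<<<gpu_blocks,gpu_threads>>>( d_a, d_c, myN);
            CudaSafeCall(cudaMemcpy(c+my_start, d_c, nbytes_per_kernel, cudaMemcpyDeviceToHost));
            CudaSafeCall(cudaDeviceSynchronize());
            CudaSafeCall(cudaFree(d_a));
            CudaSafeCall(cudaFree(d_c));
        }
    }
}

The only substantive change is where d_a and d_c are declared; with one pair of pointers per thread, a cudaMalloc performed by one thread can no longer clobber the pointer another thread is about to pass to a call on a different device.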