cuda, openmp, thrust

Multi-GPU CUDA Thrust


I have CUDA C++ code that uses Thrust and currently works properly on a single GPU. I'd now like to modify it for multi-GPU. I have a host function that includes a number of Thrust calls that sort, copy, calculate differences etc. on device arrays. I want each GPU to run this sequence of Thrust calls on its own (independent) set of arrays at the same time. I've read that Thrust functions that return values are synchronous, but can I use OpenMP to have each host thread call a function (containing Thrust calls) that runs on a separate GPU?

For example (coded in browser):

#pragma omp parallel for 
for (int dev=0; dev<Ndev; dev++){
   cudaSetDevice(dev);
   runthrustfunctions(dev);
}

void runthrustfunctions(int dev){
  /* lots of Thrust functions running on device arrays stored on the corresponding GPU */
  // for example, these are just a few of the lines:

 thrust::device_ptr<double> pos_ptr = thrust::device_pointer_cast(particle[dev].pos);
 thrust::device_ptr<int> list_ptr = thrust::device_pointer_cast(particle[dev].list);
 thrust::sequence(list_ptr,list_ptr+length);
 thrust::sort_by_key(pos_ptr, pos_ptr+length,list_ptr);
 thrust::device_vector<double> temp(length);
 thrust::gather(list_ptr,list_ptr+length,pos_ptr,temp.begin());   
 thrust::copy(temp.begin(), temp.end(), pos_ptr);

}

I think I also need the structure "particle[0]" to be stored on GPU 0, particle[1] on GPU 1, etc., and my guess is that this is not possible. An option might be to use a "switch" with separate code for each GPU case.
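For example, I imagine the per-device allocation would need to look roughly like this (again coded in browser; the Particle struct, Ndev and length are just placeholders for what I actually have), though I don't know if it actually works:

struct Particle {
  double *pos;   // device pointer, should live on GPU dev
  int    *list;  // device pointer, should live on GPU dev
};

std::vector<Particle> particle(Ndev);
for (int dev = 0; dev < Ndev; dev++){
   cudaSetDevice(dev);                                      // make GPU dev current
   cudaMalloc(&particle[dev].pos,  length*sizeof(double));  // allocated on GPU dev
   cudaMalloc(&particle[dev].list, length*sizeof(int));
}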

I'd like to know if this is a correct approach or if there is a better way. Thanks.


Solution

  • Yes, you can combine thrust and OpenMP.

    Here's a complete worked example with results:

    $ cat t340.cu
    #include <omp.h>
    #include <stdio.h> 
    #include <stdlib.h>
    #include <thrust/host_vector.h>
    #include <thrust/device_vector.h>
    #include <thrust/sort.h>
    #include <thrust/copy.h>
    #include <time.h>
    #include <sys/time.h>
    
    #define DSIZE 200000000
    
    using namespace std;
    
    int main(int argc, char *argv[])
    {
        timeval t1, t2;
        int num_gpus = 0;   // number of CUDA GPUs
    
        printf("%s Starting...\n\n", argv[0]);
    
        // determine the number of CUDA capable GPUs
        cudaGetDeviceCount(&num_gpus);
    
        if (num_gpus < 1)
        {
            printf("no CUDA capable devices were detected\n");
            return 1;
        }
    
        // display CPU and GPU configuration
        printf("number of host CPUs:\t%d\n", omp_get_num_procs());
        printf("number of CUDA devices:\t%d\n", num_gpus);
    
        for (int i = 0; i < num_gpus; i++)
        {
            cudaDeviceProp dprop;
            cudaGetDeviceProperties(&dprop, i);
            printf("   %d: %s\n", i, dprop.name);
        }
    
        printf("initialize data\n");
    
    
        // initialize data
        typedef thrust::device_vector<int> dvec;
        typedef dvec *p_dvec;
        std::vector<p_dvec> dvecs;
    
        for(unsigned int i = 0; i < num_gpus; i++) {
          cudaSetDevice(i);
          p_dvec temp = new dvec(DSIZE);
          dvecs.push_back(temp);
          }
    
        thrust::host_vector<int> data(DSIZE);
        thrust::generate(data.begin(), data.end(), rand);
    
        // copy data
        for (unsigned int i = 0; i < num_gpus; i++) {
          cudaSetDevice(i);
          thrust::copy(data.begin(), data.end(), (*(dvecs[i])).begin());
          }
    
        printf("start sort\n");
        gettimeofday(&t1,NULL);
    
        // run as many CPU threads as there are CUDA devices
        omp_set_num_threads(num_gpus);  // create as many CPU threads as there are CUDA devices
        #pragma omp parallel
        {
            unsigned int cpu_thread_id = omp_get_thread_num();
            cudaSetDevice(cpu_thread_id);
            thrust::sort((*(dvecs[cpu_thread_id])).begin(), (*(dvecs[cpu_thread_id])).end());
            cudaDeviceSynchronize();
        }
        gettimeofday(&t2,NULL);
        printf("finished\n");
        unsigned long et = ((t2.tv_sec * 1000000)+t2.tv_usec) - ((t1.tv_sec * 1000000) + t1.tv_usec);
        cudaError_t err = cudaGetLastError();  // capture once; calling it again would reset the error
        if (cudaSuccess != err)
            printf("%s\n", cudaGetErrorString(err));
        printf("sort time = %fs\n", (float)et/(float)(1000000));
        // check results
        thrust::host_vector<int> result(DSIZE);
        thrust::sort(data.begin(), data.end());
        for (int i = 0; i < num_gpus; i++)
        {
            cudaSetDevice(i);
            thrust::copy((*(dvecs[i])).begin(), (*(dvecs[i])).end(), result.begin());
            for (int j = 0; j < DSIZE; j++)
              if (data[j] != result[j]) { printf("mismatch on device %d at index %d, host: %d, device: %d\n", i, j, data[j], result[j]); return 1;}
        }
        printf("Success\n");
        return 0;
    
    }
    $ nvcc -Xcompiler -fopenmp -O3 -arch=sm_20 -o t340 t340.cu -lgomp
    $ CUDA_VISIBLE_DEVICES="0" ./t340
    ./t340 Starting...
    
    number of host CPUs:    12
    number of CUDA devices: 1
       0: Tesla M2050
    initialize data
    start sort
    finished
    sort time = 0.398922s
    Success
    $ ./t340
    ./t340 Starting...
    
    number of host CPUs:    12
    number of CUDA devices: 4
       0: Tesla M2050
       1: Tesla M2070
       2: Tesla M2050
       3: Tesla M2070
    initialize data
    start sort
    finished
    sort time = 0.460058s
    Success
    $
    

    We can see that when I restrict the program to using a single device, the sort operation takes about 0.4 seconds. Then, when I allow it to use all 4 devices (repeating the same sort on all 4 devices), the overall operation takes only 0.46 seconds, even though we're doing 4 times as much work.

    For this particular case, I happened to be using CUDA 5.0 with thrust v1.7 and gcc 4.4.6 (RHEL 6.2).
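
    As a rough, untested sketch, the same pattern should map onto the per-device arrays in the question (assuming each particle[dev].pos and particle[dev].list was allocated with cudaMalloc while device dev was current), with each OpenMP host thread selecting its own device before making its Thrust calls:

    #pragma omp parallel num_threads(Ndev)
    {
        int dev = omp_get_thread_num();
        cudaSetDevice(dev);                  // each host thread drives its own GPU
        // wrap the raw per-device pointers so Thrust dispatches to the device path
        thrust::device_ptr<double> pos_ptr  = thrust::device_pointer_cast(particle[dev].pos);
        thrust::device_ptr<int>    list_ptr = thrust::device_pointer_cast(particle[dev].list);
        thrust::sequence(list_ptr, list_ptr + length);
        thrust::sort_by_key(pos_ptr, pos_ptr + length, list_ptr);
        cudaDeviceSynchronize();             // wait for this GPU's work before the thread exits
    }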