Search code examples
c++algorithmparallel-processingcudathrust

Thrust CUDA find maximum per each group(segment)


My data like

value = [1, 2, 3, 4, 5, 6]
key =   [0, 1, 0, 2, 1, 2]

I need to now maximum(value and index) per each group(key). So the result should be

max = [3, 5, 6]
index = [2, 4, 5]
key = [0, 1, 2]

How can I get it with cuda thrust? I can do sort -> reduce_by_key but it's not really efficient. In my case vector size > 10M and key space ~ 1K(starts from 0 without gaps).


Solution

  • Since the original question focused on thrust, I didn't have any suggestions other than what I mentioned in the comments,

    However, based on further dialog in the comments, I thought I would post an answer that covers both CUDA and thrust.

    The thrust method uses a sort_by_key operation to group like keys together, followed by a reduce_by_key operation to find the max + index for each key-group.

    The CUDA method uses a custom atomic approach I describe here to find a 32-bit max plus 32-bit index (for each key-group).

    The CUDA method is substantially (~10x) faster, for this specific test case. I used a vector size of 10M and a key size of 10K for this test.

    My test platform was CUDA 8RC, RHEL 7, and Tesla K20X GPU. K20X is a member of the Kepler generation which has much faster global atomics than previous GPU generations.

    Here's a fully worked example, covering both cases, and providing a timing comparison:

    $ cat t1234.cu
    #include <iostream>
    #include <thrust/copy.h>
    #include <thrust/reduce.h>
    #include <thrust/sort.h>
    #include <thrust/device_vector.h>
    #include <thrust/iterator/zip_iterator.h>
    #include <thrust/sequence.h>
    #include <thrust/functional.h>
    #include <cstdlib>
    
    #include <time.h>
    #include <sys/time.h>
    #define USECPSEC 1000000ULL
    
    unsigned long long dtime_usec(unsigned long long start){
    
      timeval tv;
      gettimeofday(&tv, 0);
      return ((tv.tv_sec*USECPSEC)+tv.tv_usec)-start;
    }
    
    const size_t ksize = 10000;
    const size_t vsize = 10000000;
    const int nTPB = 256;
    
    struct my_max_func
    {
    
      template <typename T1, typename T2>
      __host__ __device__
      T1 operator()(const T1 t1, const T2 t2){
        T1 res;
        if (thrust::get<0>(t1) > thrust::get<0>(t2)){
          thrust::get<0>(res) = thrust::get<0>(t1);
          thrust::get<1>(res) = thrust::get<1>(t1);}
        else {
          thrust::get<0>(res) = thrust::get<0>(t2);
          thrust::get<1>(res) = thrust::get<1>(t2);}
        return res;
        }
    };
    
    typedef union  {
      float floats[2];                 // floats[0] = maxvalue
      int ints[2];                     // ints[1] = maxindex
      unsigned long long int ulong;    // for atomic update
    } my_atomics;
    
    
    __device__ unsigned long long int my_atomicMax(unsigned long long int* address, float val1, int val2)
    {
        my_atomics loc, loctest;
        loc.floats[0] = val1;
        loc.ints[1] = val2;
        loctest.ulong = *address;
        while (loctest.floats[0] <  val1)
          loctest.ulong = atomicCAS(address, loctest.ulong,  loc.ulong);
        return loctest.ulong;
    }
    
    
    __global__ void my_max_idx(const float *data, const int *keys,const int ds, my_atomics *res)
    {
    
        int idx = (blockDim.x * blockIdx.x) + threadIdx.x;
        if (idx < ds)
          my_atomicMax(&(res[keys[idx]].ulong), data[idx],idx);
    }
    
    
    int main(){
    
      float *h_vals = new float[vsize];
      int   *h_keys = new int[vsize];
      for (int i = 0; i < vsize; i++) {h_vals[i] = rand(); h_keys[i] = rand()%ksize;}
    // thrust method
      thrust::device_vector<float> d_vals(h_vals, h_vals+vsize);
      thrust::device_vector<int> d_keys(h_keys, h_keys+vsize);
      thrust::device_vector<int> d_keys_out(ksize);
      thrust::device_vector<float> d_vals_out(ksize);
      thrust::device_vector<int> d_idxs(vsize);
      thrust::device_vector<int> d_idxs_out(ksize);
    
      thrust::sequence(d_idxs.begin(), d_idxs.end());
      cudaDeviceSynchronize();
      unsigned long long et = dtime_usec(0);
    
      thrust::sort_by_key(d_keys.begin(), d_keys.end(), thrust::make_zip_iterator(thrust::make_tuple(d_vals.begin(), d_idxs.begin())));
      thrust::reduce_by_key(d_keys.begin(), d_keys.end(), thrust::make_zip_iterator(thrust::make_tuple(d_vals.begin(),d_idxs.begin())), d_keys_out.begin(), thrust::make_zip_iterator(thrust::make_tuple(d_vals_out.begin(), d_idxs_out.begin())), thrust::equal_to<int>(), my_max_func());
      cudaDeviceSynchronize();
      et = dtime_usec(et);
      std::cout << "Thrust time: " << et/(float)USECPSEC << "s" << std::endl;
    
    // cuda method
    
      float *vals;
      int *keys;
      my_atomics *results;
      cudaMalloc(&keys, vsize*sizeof(int));
      cudaMalloc(&vals, vsize*sizeof(float));
      cudaMalloc(&results, ksize*sizeof(my_atomics));
    
      cudaMemset(results, 0, ksize*sizeof(my_atomics)); // works because vals are all positive
      cudaMemcpy(keys, h_keys, vsize*sizeof(int), cudaMemcpyHostToDevice);
      cudaMemcpy(vals, h_vals, vsize*sizeof(float), cudaMemcpyHostToDevice);
      et = dtime_usec(0);
    
      my_max_idx<<<(vsize+nTPB-1)/nTPB, nTPB>>>(vals, keys, vsize, results);
      cudaDeviceSynchronize();
      et = dtime_usec(et);
      std::cout << "CUDA time: " << et/(float)USECPSEC << "s" << std::endl;
    
    // verification
    
      my_atomics *h_results = new my_atomics[ksize];
      cudaMemcpy(h_results, results, ksize*sizeof(my_atomics), cudaMemcpyDeviceToHost);
      for (int i = 0; i < ksize; i++){
        if (h_results[i].floats[0] != d_vals_out[i]) {std::cout << "value mismatch at index: " << i << " thrust: " << d_vals_out[i] << " CUDA: " << h_results[i].floats[0] << std::endl; return -1;}
        if (h_results[i].ints[1] != d_idxs_out[i]) {std::cout << "index mismatch at index: " << i << " thrust: " << d_idxs_out[i] << " CUDA: " << h_results[i].ints[1] << std::endl; return -1;}
        }
    
      std::cout << "Success!" << std::endl;
      return 0;
    }
    
    $ nvcc -arch=sm_35 -o t1234 t1234.cu
    $ ./t1234
    Thrust time: 0.026593s
    CUDA time: 0.002451s
    Success!
    $