Search code examples
c++cudathrust

sorting 1-bit array using radix sort?


I have a bool array "a flag array stored as char*" I want to efficiently sort it using radix sort, what I understand is that radix sort complexity is O(#bits N) which will be O(8N) for char* and O(N) for a bool* (if it exists)

I've tried thrust::stable_sort_by_key, on my device sorting 2^21 char pointer takes 1.7 ms another test was by using a compare function for the first bit only:

struct OBCmp {
    __host__ __device__
    bool operator()(const char& o1, const char& o2) {
        return (o1 & 1) < (o2 & 1);
    }
};

the result time using thrust::stable_sort_by_key(d_vec.begin(), d_vec.end(), d_vec_values.begin(), OBCmp ()); was 9 ms

but I'm almost sure that it has used another algorithm "not radix sort", and my guess is that if I can make radix sort to use only the first bit, the time should be around 0.3 ms

any hint how to do this? "even manually without thrust"


Solution

  • I suggest counting ones (or zeroes) and then creating the solution vector based on that. Here's a worked example using thrust:

    $ cat t427.cu
    #include <thrust/device_vector.h>
    #include <thrust/host_vector.h>
    #include <thrust/sort.h>
    #include <thrust/count.h>
    #include <stdio.h>
    #include <stdlib.h>
    
    #define DSIZE (1<<21)
    int main(){
      cudaEvent_t start, stop;
      float et;
      cudaEventCreate(&start); cudaEventCreate(&stop);
      thrust::host_vector<char> h_data(DSIZE);
      for (int i = 0; i < DSIZE; i++) h_data[i] = rand()%2;
      thrust::device_vector<char> d_data = h_data;
      cudaEventRecord(start);
      thrust::sort(d_data.begin(), d_data.end());
      cudaEventRecord(stop);
      cudaEventSynchronize(stop);
      cudaEventElapsedTime(&et, start, stop);
      printf("Sort time: %fms\n", et);
      thrust::device_vector<char> d2_data = h_data;
      thrust::device_vector<char> d3_data(DSIZE);
      cudaEventRecord(start);
      int zeroes = thrust::count(d2_data.begin(), d2_data.end(), 0);
      thrust::fill(d3_data.begin(), d3_data.begin()+zeroes, 0);
      thrust::fill(d3_data.begin() + zeroes, d3_data.end(), 1);
      cudaEventRecord(stop);
      cudaEventSynchronize(stop);
      cudaEventElapsedTime(&et, start, stop);
      printf("count/fill time: %fms\n", et);
      if (thrust::equal(d_data.begin(), d_data.end(), d3_data.begin())) printf("Success!\n");
      else printf("Failure\n");
      return 0;
    }
    
    $ nvcc -O3 -arch=sm_20 -o t427 t427.cu
    $ ./t427
    Sort time: 1.450560ms
    count/fill time: 0.302656ms
    Success!
    $