I have a bool array "a flag array stored as char*" I want to efficiently sort it using radix sort, what I understand is that radix sort complexity is O(#bits N) which will be O(8N) for char* and O(N) for a bool* (if it exists)
I've tried thrust::stable_sort_by_key, on my device sorting 2^21 char pointer takes 1.7 ms another test was by using a compare function for the first bit only:
struct OBCmp {
__host__ __device__
bool operator()(const char& o1, const char& o2) {
return (o1 & 1) < (o2 & 1);
}
};
the result time using thrust::stable_sort_by_key(d_vec.begin(), d_vec.end(), d_vec_values.begin(), OBCmp ());
was 9 ms
but I'm almost sure that it has used another algorithm "not radix sort", and my guess is that if I can make radix sort to use only the first bit, the time should be around 0.3 ms
any hint how to do this? "even manually without thrust"
I suggest counting ones (or zeroes) and then creating the solution vector based on that. Here's a worked example using thrust:
$ cat t427.cu
#include <thrust/device_vector.h>
#include <thrust/host_vector.h>
#include <thrust/sort.h>
#include <thrust/count.h>
#include <stdio.h>
#include <stdlib.h>
#define DSIZE (1<<21)
int main(){
cudaEvent_t start, stop;
float et;
cudaEventCreate(&start); cudaEventCreate(&stop);
thrust::host_vector<char> h_data(DSIZE);
for (int i = 0; i < DSIZE; i++) h_data[i] = rand()%2;
thrust::device_vector<char> d_data = h_data;
cudaEventRecord(start);
thrust::sort(d_data.begin(), d_data.end());
cudaEventRecord(stop);
cudaEventSynchronize(stop);
cudaEventElapsedTime(&et, start, stop);
printf("Sort time: %fms\n", et);
thrust::device_vector<char> d2_data = h_data;
thrust::device_vector<char> d3_data(DSIZE);
cudaEventRecord(start);
int zeroes = thrust::count(d2_data.begin(), d2_data.end(), 0);
thrust::fill(d3_data.begin(), d3_data.begin()+zeroes, 0);
thrust::fill(d3_data.begin() + zeroes, d3_data.end(), 1);
cudaEventRecord(stop);
cudaEventSynchronize(stop);
cudaEventElapsedTime(&et, start, stop);
printf("count/fill time: %fms\n", et);
if (thrust::equal(d_data.begin(), d_data.end(), d3_data.begin())) printf("Success!\n");
else printf("Failure\n");
return 0;
}
$ nvcc -O3 -arch=sm_20 -o t427 t427.cu
$ ./t427
Sort time: 1.450560ms
count/fill time: 0.302656ms
Success!
$