Search code examples
c++sortingcudathrust

Sorting statically allocated array using Thrust


In my code, I have a statically allocated array in global memory (i.e., allocated using __device__), which I want to sort using thrust::sort, which isn't working. All of the examples on this topic are using CUDA runtime allocated arrays (using cudaMalloc). Is there any way I can sort a statically allocated array?

I guess it has something to do with statically allocated memory not being accessible from the host. Using cudaMalloc-allocated arrays, it is working fine. However, I want to avoid using this type of allocation since static allocation allows for easier access to the data from device code (doesn't it?).

Minimal (not-) working example:

#include <stdio.h>
#include <thrust/device_ptr.h>
#include <thrust/sort.h>

#define N 4

typedef struct element {
  int key;
  int value;
  __host__ __device__ bool operator<(element e) const
  { return key > e.key; }
} element;

__device__ element array[N];

__global__ void init() {
  for (int i = 0; i < N; ++i) {
    array[N - i - 1].key = i;
  }
}

__global__ void print_array() {
  for (int i = 0; i < N; ++i) {
    printf("%d ", array[i].key);
  }
  printf("\n");
}

int main(void) {
  thrust::device_ptr<element> array_first(array);

  init<<<1,1>>>();

  printf("unsorted: ");
  print_array<<<1, 1>>>();
  cudaDeviceSynchronize();

  thrust::sort(array_first, array_first + N);

  printf("sorted: ");
  print_array<<<1, 1>>>();
  cudaDeviceSynchronize();
}

Solution

  • Use cudaGetSymbolAddress to take the address of the array variable from a __host__ function:

    void* array_ptr = 0;
    cudaGetSymbolAddress(&array_ptr, array);
    thrust::device_ptr<element> array_first(reinterpret_cast<element*>(array_ptr));
    

    Here's the complete program:

    #include <stdio.h>
    #include <thrust/device_ptr.h>
    #include <thrust/sort.h>
    
    #define N 4
    
    typedef struct element {
      int key;
      int value;
      __host__ __device__ bool operator<(element e) const
      { return key > e.key; }
    } element;
    
    __device__ element array[N];
    
    __global__ void init() {
      for (int i = 0; i < N; ++i) {
        array[N - i - 1].key = i;
      }
    }
    
    __global__ void print_array() {
      for (int i = 0; i < N; ++i) {
        printf("%d ", array[i].key);
      }
      printf("\n");
    }
    
    int main(void) {
      cudaError_t error;
    
      void* array_ptr = 0;
      if(error = cudaGetSymbolAddress(&array_ptr, array))
      {
        throw thrust::system_error(error, thrust::cuda_category());
      }
    
      thrust::device_ptr<element> array_first(reinterpret_cast<element*>(array_ptr));
    
      init<<<1,1>>>();
    
      printf("unsorted: ");
      print_array<<<1, 1>>>();
    
      if(error = cudaDeviceSynchronize())
      {
        throw thrust::system_error(error, thrust::cuda_category());
      }
    
      thrust::sort(array_first, array_first + N);
    
      if(error = cudaDeviceSynchronize())
      {
        throw thrust::system_error(error, thrust::cuda_category());
      }
    
      printf("sorted: ");
      print_array<<<1, 1>>>();
    
      if(error = cudaDeviceSynchronize())
      {
        throw thrust::system_error(error, thrust::cuda_category());
      }
    
      return 0;
    }
    

    Here's the output on my system:

    $ nvcc test.cu -run
    unsorted: 3 2 1 0 
    sorted: 3 2 1 0 
    

    The sorted output is the same as the unsorted output, but I guess that is intentional given the way the data is generated and the definition of element::operator<.