Sorting statically allocated array using Thrust

In my code, I have a statically allocated array in global memory (i.e., allocated using __device__), which I want to sort using thrust::sort, which isn't working. All of the examples on this topic are using CUDA runtime allocated arrays (using cudaMalloc). Is there any way I can sort a statically allocated array?

I guess it has something to do with statically allocated memory not being accessible from the host. Using cudaMalloc-allocated arrays, it is working fine. However, I want to avoid using this type of allocation since static allocation allows for easier access to the data from device code (doesn't it?).

Minimal (not-) working example:

#include <stdio.h>
#include <thrust/device_ptr.h>
#include <thrust/sort.h>

#define N 4

typedef struct element {
  int key;
  int value;
  __host__ __device__ bool operator<(element e) const
  { return key > e.key; }
} element;

__device__ element array[N];

__global__ void init() {
  for (int i = 0; i < N; ++i) {
    array[N - i - 1].key = i;
  }
}

__global__ void print_array() {
  for (int i = 0; i < N; ++i) {
    printf("%d ", array[i].key);
  }
  printf("\n");
}

int main(void) {
  thrust::device_ptr<element> array_first(array);

  init<<<1,1>>>();

  printf("unsorted: ");
  print_array<<<1, 1>>>();
  cudaDeviceSynchronize();

  thrust::sort(array_first, array_first + N);

  printf("sorted: ");
  print_array<<<1, 1>>>();
  cudaDeviceSynchronize();
}

Solution

Use cudaGetSymbolAddress to take the address of the array variable from a __host__ function:

void* array_ptr = 0;
cudaGetSymbolAddress(&array_ptr, array);
thrust::device_ptr<element> array_first(reinterpret_cast<element*>(array_ptr));

Here's the complete program:

#include <stdio.h>
#include <thrust/device_ptr.h>
#include <thrust/sort.h>

#define N 4

typedef struct element {
  int key;
  int value;
  __host__ __device__ bool operator<(element e) const
  { return key > e.key; }
} element;

__device__ element array[N];

__global__ void init() {
  for (int i = 0; i < N; ++i) {
    array[N - i - 1].key = i;
  }
}

__global__ void print_array() {
  for (int i = 0; i < N; ++i) {
    printf("%d ", array[i].key);
  }
  printf("\n");
}

int main(void) {
  cudaError_t error;

  void* array_ptr = 0;
  if(error = cudaGetSymbolAddress(&array_ptr, array))
  {
    throw thrust::system_error(error, thrust::cuda_category());
  }

  thrust::device_ptr<element> array_first(reinterpret_cast<element*>(array_ptr));

  init<<<1,1>>>();

  printf("unsorted: ");
  print_array<<<1, 1>>>();

  if(error = cudaDeviceSynchronize())
  {
    throw thrust::system_error(error, thrust::cuda_category());
  }

  thrust::sort(array_first, array_first + N);

  if(error = cudaDeviceSynchronize())
  {
    throw thrust::system_error(error, thrust::cuda_category());
  }

  printf("sorted: ");
  print_array<<<1, 1>>>();

  if(error = cudaDeviceSynchronize())
  {
    throw thrust::system_error(error, thrust::cuda_category());
  }

  return 0;
}

Here's the output on my system:

$ nvcc test.cu -run
unsorted: 3 2 1 0 
sorted: 3 2 1 0

The sorted output is the same as the unsorted output, but I guess that is intentional given the way the data is generated and the definition of element::operator<.