Search code examples
sortingcudathrust

thrust sort by key on device throws error


I have a device float array and I tried to sort it by key with THRUST with this function:

#include <thrust/sort.h>
#include <thrust/host_vector.h>
#include <thrust/device_vector.h>
#include <thrust/device_ptr.h>
#include <thrust/device_malloc.h>
#include <thrust/device_free.h>
#include <thrust/copy.h>
#include <thrust/fill.h>
#include <thrust/sequence.h>    

template <typename T>
__host__ T* deepCopyDeviceArray(T* dev_array, int arraysize)
{
// performs a deep copy of a device array and returns the copy's device pointer

cudaError_t cudaStatus;

T* dev_copiedArray;

cudaStatus = cudaMalloc((void**)&dev_copiedArray, (arraysize * sizeof(T)));
if (cudaStatus != cudaSuccess) {
    fprintf(stderr, "deep copy cudaMalloc failed!");
}

cudaStatus = cudaMemcpy(dev_copiedArray, dev_array, (arraysize * sizeof(T)), cudaMemcpyDeviceToDevice);
if (cudaStatus != cudaSuccess) {
    fprintf(stderr, "deep copy cudaMemcpy failed!");
}

return dev_copiedArray;
}



template <typename T>
int* sortByKeyOnDevice(T* dev_keys, int len, const int* valuesarray)
{
// sorts keysarray and returns the sorted indices
T* dev_keys2 = deepCopyDeviceArray(dev_keys, len); // make deep copy to evade change of original keys

// make deep copy of values and copy it to device
int* dev_values;
cudaMalloc((void **) &dev_values, len);
cudaMemcpy(dev_values, valuesarray, len * sizeof(int), cudaMemcpyHostToDevice);

// create device pointers
thrust::device_ptr<T> dev_ptr_keys = thrust::device_pointer_cast(dev_keys2);
thrust::device_ptr<int> dev_ptr_values = thrust::device_pointer_cast(dev_values);

thrust::sort_by_key(dev_ptr_keys, dev_ptr_keys + len, dev_ptr_values);

//thrust::device_free(dev_ptr_keys);
cudaFree(dev_keys2);
return dev_values; // return only indices of sorted array
}

int main()
{
int len = 10;
float* array1 = new float[len]; for (int i=0;i<len;i++) array1[i] = rand();

float* dev_array1;
cudaMalloc(&dev_array1, len * sizeof(float));
cudaMemcpy(dev_array1, array1, (len * sizeof(float)), cudaMemcpyHostToDevice);

int* valuesarray = new int[len]; for (int i=0; i<len; i++) valuesarray[i] = i;
int* dev_values;

dev_values = sortByKeyOnDevice(dev_array1, len, valuesarray);

int* values = new int[len];
cudaMemcpy(values, dev_values, (len * sizeof(int)), cudaMemcpyDeviceToHost); // or use dev_values in a kernel for further calculations
}

Executing this main, THRUST throws the exception: "thrust::system::system_error at memory location 0x00DAF5D4."

I don't want to use thrust device vectors because they are unnecessary here. According to the THRUST documentation, device_ptr can be used in the above way (I refer to this question).

What am I doing wrong?


Solution

  • Whenever you are having trouble with a CUDA code, you should put proper cuda error checking in place on every CUDA API call and kernel call (you need not do this for thrust calls, they have their own error reporting mechanism). You can also run your code with cuda-memcheck which will display API errors even if you haven't explicitly checked for them.

    If you had done that you would have discovered this line of code is reporting an API error (invalid parameter):

    cudaMemcpy(dev_values, valuesarray, len * sizeof(int), cudaMemcpyHostToDevice);
    

    Looking at the line before it, your size parameter is incorrect:

    cudaMalloc((void **) &dev_values, len);
    

    It should be:

    cudaMalloc((void **) &dev_values, len*sizeof(int));
    

    With that change, your code compiles and runs for me without any errors.

    As an aside, when posting code, please properly format (indent) to make it easier for others to read.