memory location error: thrust::stable_sort when using big array and user-defined comparison operator

I'm running this code to sort big array of IPs using thrust stable_sort and user defined operator to compare the IPs. this code is working for arrays less than 50000 IPs, but I got a memory error for big arrays. here is the code I used:

#include "cuda_runtime.h"
#include "device_launch_parameters.h"
#include <thrust/host_vector.h>
#include <thrust/device_vector.h>
#include <thrust/sort.h>
#include <stdio.h>
#include <time.h>
#include <device_functions.h>
template<typename T>
struct vector_less
{
    typedef T first_argument_type;
    typedef T second_argument_type;
    typedef bool result_type;
    __host__ __device__ bool operator()(const T &lhs, const T &rhs) const {
        if (lhs[0] == rhs[0])
        if (lhs[1] == rhs[1])
        if (lhs[2] == rhs[2])
            return lhs[3] < rhs[3];
        else
            return lhs[2] < rhs[2];
        else
            return lhs[1] < rhs[1];
        else
            return lhs[0] < rhs[0];
    }
}; 

__global__ void prepare_ips_list(unsigned char ** dev_sorted_Ips, unsigned char * ip_b1, unsigned char * ip_b2, unsigned char * ip_b3, unsigned char * ip_b4, unsigned int searchedIpsSize)
{
    int thread = threadIdx.x + blockIdx.x * blockDim.x;
    if (thread < searchedIpsSize)
    {
        dev_sorted_Ips[thread] = new unsigned char[4];
        dev_sorted_Ips[thread][0] = ip_b1[thread];
        dev_sorted_Ips[thread][1] = ip_b2[thread];
        dev_sorted_Ips[thread][2] = ip_b3[thread];
        dev_sorted_Ips[thread][3] = ip_b4[thread];
    }

}


int main()
{
    const int size = 1000000;

    unsigned char * ip_b1 = new unsigned char[size];
    unsigned char * ip_b2 = new unsigned char[size];;
    unsigned char * ip_b3 = new unsigned char[size];;
    unsigned char * ip_b4 = new unsigned char[size];;

    unsigned char * dev_ip_b1;
    unsigned char * dev_ip_b2;
    unsigned char * dev_ip_b3;
    unsigned char * dev_ip_b4;

    unsigned char ** dev_sortedIps;

    for (int i = 0; i < size; i++)
    {
        ip_b1[i] = rand() % 240;
        ip_b2[i] = rand() % 240;
        ip_b3[i] = rand() % 240;
        ip_b4[i] = rand() % 240;
    }

    cudaError_t cudaStatus;
    cudaStatus = cudaSetDevice(0);
    if (cudaStatus != cudaSuccess) {
        fprintf(stderr, "cudaSetDevice failed!  Do you have a CUDA-capable GPU installed?");
        goto Error;
    }

    cudaStatus = cudaMalloc((void**)&dev_ip_b1, size * sizeof(unsigned char));
    if (cudaStatus != cudaSuccess) {
        fprintf(stderr, "cudaMalloc failed!");
        goto Error;
    }
    cudaStatus = cudaMemcpy(dev_ip_b1, ip_b1, size * sizeof(unsigned char), cudaMemcpyHostToDevice);
    if (cudaStatus != cudaSuccess) {
        fprintf(stderr, "cudaMemcpy failed!");
        goto Error;
    }
    cudaStatus = cudaMalloc((void**)&dev_ip_b2, size * sizeof(unsigned char));
    if (cudaStatus != cudaSuccess) {
        fprintf(stderr, "cudaMalloc failed!");
        goto Error;
    }
    cudaStatus = cudaMemcpy(dev_ip_b2, ip_b2, size * sizeof(unsigned char), cudaMemcpyHostToDevice);
    if (cudaStatus != cudaSuccess) {
        fprintf(stderr, "cudaMemcpy failed!");
        goto Error;
    }
    cudaStatus = cudaMalloc((void**)&dev_ip_b3, size * sizeof(unsigned char));
    if (cudaStatus != cudaSuccess) {
        fprintf(stderr, "cudaMalloc failed!");
        goto Error;
    }
    cudaStatus = cudaMemcpy(dev_ip_b3, ip_b3, size * sizeof(unsigned char), cudaMemcpyHostToDevice);
    if (cudaStatus != cudaSuccess) {
        fprintf(stderr, "cudaMemcpy failed!");
        goto Error;
    }
    cudaStatus = cudaMalloc((void**)&dev_ip_b4, size * sizeof(unsigned char));
    if (cudaStatus != cudaSuccess) {
        fprintf(stderr, "cudaMalloc failed!");
        goto Error;
    }
    cudaStatus = cudaMemcpy(dev_ip_b4, ip_b4, size * sizeof(unsigned char), cudaMemcpyHostToDevice);
    if (cudaStatus != cudaSuccess) {
        fprintf(stderr, "cudaMemcpy failed!");
        goto Error;
    }

    cudaStatus = cudaMalloc((void**)&dev_sortedIps, size * sizeof(unsigned char *));
    if (cudaStatus != cudaSuccess) {
        fprintf(stderr, "cudaMalloc failed!");
        goto Error;
    }

    int resetThreads = size;
    int resetBlocks = 1;
    if (size > 1024)
    {
        resetThreads = 1024;
        resetBlocks = size / 1024;
        if (size % 1024 > 0)
            resetBlocks++;
    }

    prepare_ips_list << <resetBlocks, resetThreads >> >(dev_sortedIps, dev_ip_b1, dev_ip_b2, dev_ip_b3, dev_ip_b4, size);



    thrust::device_ptr<unsigned char *> sorted_list_ptr1(dev_sortedIps);
    thrust::stable_sort(sorted_list_ptr1, sorted_list_ptr1 + size, vector_less<unsigned char *>());

    cudaStatus = cudaGetLastError();
    if (cudaStatus != cudaSuccess) {
        fprintf(stderr, "launch failed: %s\n", cudaGetErrorString(cudaStatus));
        goto Error;
    }

    // cudaDeviceSynchronize waits for the kernel to finish, and returns
    // any errors encountered during the launch.
    cudaStatus = cudaDeviceSynchronize();
    if (cudaStatus != cudaSuccess) {
        fprintf(stderr, "cudaDeviceSynchronize returned error code %d after launching !\n", cudaStatus);
        goto Error;
    }

    return 0;

Error:
    cudaFree(dev_ip_b1);
    cudaFree(dev_ip_b2);
    cudaFree(dev_ip_b3);
    cudaFree(dev_ip_b4);
    cudaFree(dev_sortedIps);
}

the error I got is : Microsoft C++ exception: thrust::system::system_error at memory location

how to solve this problem with big arrays? should I use another technique to achieve this sorting such as dividing and sorting for the parts then merging?

Solution

The proximal issue is that in-kernel malloc and new are limited in the size of the device heap that they have available to allocate from. This limit can be raised. Please read the documentation.

A few other suggestions:

You're not doing any error checking after your kernel (before the first thrust call). You should do error checking on the kernel, then you would discover that your kernel is what is failing, and thrust is merely reporting the error for you. Avoid the confusion. Do rigorous, proper cuda error checking any time you are having trouble with a CUDA code.
As a good practice, it's not a bad idea, at least for debugging purposes, to test any pointer return by new or malloc for NULL. This is how the API informs you that an allocation failure occurred.

The code below demonstrates a possible workaround for the proximal issue, by adjusting the device heap for the input size. It also demonstrates possible ways to address the other two suggestions:

#include "cuda_runtime.h"
#include "device_launch_parameters.h"
#include <thrust/host_vector.h>
#include <thrust/device_vector.h>
#include <thrust/sort.h>
#include <stdio.h>
#include <time.h>
#include <stdlib.h>
#include <device_functions.h>
#include <assert.h>

template<typename T>
struct vector_less
{
    typedef T first_argument_type;
    typedef T second_argument_type;
    typedef bool result_type;
    __host__ __device__ bool operator()(const T &lhs, const T &rhs) const {
        if (lhs[0] == rhs[0])
        if (lhs[1] == rhs[1])
        if (lhs[2] == rhs[2])
            return lhs[3] < rhs[3];
        else
            return lhs[2] < rhs[2];
        else
            return lhs[1] < rhs[1];
        else
            return lhs[0] < rhs[0];
    }
};

__global__ void prepare_ips_list(unsigned char ** dev_sorted_Ips, unsigned char * ip_b1, unsigned char * ip_b2, unsigned char * ip_b3, unsigned char * ip_b4, unsigned int searchedIpsSize)
{
    int thread = threadIdx.x + blockIdx.x * blockDim.x;
    if (thread < searchedIpsSize)
    {
        dev_sorted_Ips[thread] = new unsigned char[4];
        if (dev_sorted_Ips[thread] == NULL) assert(0);
        dev_sorted_Ips[thread][0] = ip_b1[thread];
        dev_sorted_Ips[thread][1] = ip_b2[thread];
        dev_sorted_Ips[thread][2] = ip_b3[thread];
        dev_sorted_Ips[thread][3] = ip_b4[thread];
    }

}


int main(int argc, char *argv[])
{

    int size = 50000;
    if (argc > 1) size = atoi(argv[1]);
    int chunks = size/50000 + 1;
    cudaError_t cudaStatus;
    cudaStatus = cudaDeviceSetLimit(cudaLimitMallocHeapSize, 8000000 * chunks);
    if (cudaStatus != cudaSuccess) {
        fprintf(stderr, "set device heap limit failed!");
    }
    unsigned char * ip_b1 = new unsigned char[size];
    unsigned char * ip_b2 = new unsigned char[size];;
    unsigned char * ip_b3 = new unsigned char[size];;
    unsigned char * ip_b4 = new unsigned char[size];;

    unsigned char * dev_ip_b1;
    unsigned char * dev_ip_b2;
    unsigned char * dev_ip_b3;
    unsigned char * dev_ip_b4;

    unsigned char ** dev_sortedIps;

    for (int i = 0; i < size; i++)
    {
        ip_b1[i] = rand() % 240;
        ip_b2[i] = rand() % 240;
        ip_b3[i] = rand() % 240;
        ip_b4[i] = rand() % 240;
    }

    cudaStatus = cudaSetDevice(0);
    if (cudaStatus != cudaSuccess) {
        fprintf(stderr, "cudaSetDevice failed!  Do you have a CUDA-capable GPU installed?");
    }

    cudaStatus = cudaMalloc((void**)&dev_ip_b1, size * sizeof(unsigned char));
    if (cudaStatus != cudaSuccess) {
        fprintf(stderr, "cudaMalloc failed!");
    }
    cudaStatus = cudaMemcpy(dev_ip_b1, ip_b1, size * sizeof(unsigned char), cudaMemcpyHostToDevice);
    if (cudaStatus != cudaSuccess) {
        fprintf(stderr, "cudaMemcpy failed!");
    }
    cudaStatus = cudaMalloc((void**)&dev_ip_b2, size * sizeof(unsigned char));
    if (cudaStatus != cudaSuccess) {
        fprintf(stderr, "cudaMalloc failed!");
    }
    cudaStatus = cudaMemcpy(dev_ip_b2, ip_b2, size * sizeof(unsigned char), cudaMemcpyHostToDevice);
    if (cudaStatus != cudaSuccess) {
        fprintf(stderr, "cudaMemcpy failed!");
    }
    cudaStatus = cudaMalloc((void**)&dev_ip_b3, size * sizeof(unsigned char));
    if (cudaStatus != cudaSuccess) {
        fprintf(stderr, "cudaMalloc failed!");
    }
    cudaStatus = cudaMemcpy(dev_ip_b3, ip_b3, size * sizeof(unsigned char), cudaMemcpyHostToDevice);
    if (cudaStatus != cudaSuccess) {
        fprintf(stderr, "cudaMemcpy failed!");
    }
    cudaStatus = cudaMalloc((void**)&dev_ip_b4, size * sizeof(unsigned char));
    if (cudaStatus != cudaSuccess) {
        fprintf(stderr, "cudaMalloc failed!");
    }
    cudaStatus = cudaMemcpy(dev_ip_b4, ip_b4, size * sizeof(unsigned char), cudaMemcpyHostToDevice);
    if (cudaStatus != cudaSuccess) {
        fprintf(stderr, "cudaMemcpy failed!");
    }

    cudaStatus = cudaMalloc((void**)&dev_sortedIps, size * sizeof(unsigned char *));
    if (cudaStatus != cudaSuccess) {
        fprintf(stderr, "cudaMalloc failed!");
    }

    int resetThreads = size;
    int resetBlocks = 1;
    if (size > 1024)
    {
        resetThreads = 1024;
        resetBlocks = size / 1024;
        if (size % 1024 > 0)
            resetBlocks++;
    }

    prepare_ips_list << <resetBlocks, resetThreads >> >(dev_sortedIps, dev_ip_b1, dev_ip_b2, dev_ip_b3, dev_ip_b4, size);

    cudaStatus = cudaDeviceSynchronize();
    if (cudaStatus != cudaSuccess){
      printf(" kernel fail\n");
      exit(0);}

    thrust::device_ptr<unsigned char *> sorted_list_ptr1(dev_sortedIps);
    thrust::stable_sort(sorted_list_ptr1, sorted_list_ptr1 + size, vector_less<unsigned char *>());

    cudaStatus = cudaGetLastError();
    if (cudaStatus != cudaSuccess) {
        fprintf(stderr, "launch failed: %s\n", cudaGetErrorString(cudaStatus));
    }

    // cudaDeviceSynchronize waits for the kernel to finish, and returns
    // any errors encountered during the launch.
    cudaStatus = cudaDeviceSynchronize();
    if (cudaStatus != cudaSuccess) {
        fprintf(stderr, "cudaDeviceSynchronize returned error code %d after launching !\n", cudaStatus);
    }

    return 0;

}

Note that you can test various sizes by passing the desired size as a command line parameter. I tested up to 1000000, it seemed to work fine. Eventually, for a large enough problem size, you will run out of memory on your GPU. You don't indicate what GPU you have.

I've removed the goto statements, because I am working on linux (apparently you've switched back to windows). I would suggest you come up with a different error handling process than using goto, if for no other reason than that it causes difficulty with thrust constructs.

Also note that in-kernel new or malloc is kind of "slow". You could probably speed this up for larger sizes substantially by doing your necessary allocation up-front, with a single cudaMalloc call of the appropriate size. Unfortunately this is complicated by your use of the double-pointer array dev_sorted_Ips. I would suggest that you instead flatten that to a single pointer array, allocate the necessary size once via cudaMalloc, and do the necessary array indexing in your kernel to make it work. If you profile this code, you'll discover that the vast majority of execution time for longer cases (e.g. size = 1000000) is consumed by your prepare_ips_list kernel, not the sorting operation. So the focus of your efforts for performance improvement should begin there.