Tags: c++, cuda, thrust

Is it necessary to do a cudaMalloc and cudaMemcpy with a thrust::device_vector?


I am a newbie with CUDA. I have read that it is necessary to allocate variables with cudaMalloc and then use cudaMemcpy to copy the values to the device variables. Something like this:

__global__ void suma(int *a, int *b, int *c)
{
    *c = *a + *b;
}

int suma_wrapper(int a, int b, int c,int* d_a, int* d_b, int* d_c)
{
    int size = sizeof(int);

    // Allocate space on the graphics card for the GPU (DEVICE) variables
    cudaMalloc((void**) &d_a,size);
    cudaMalloc((void**) &d_b,size);
    cudaMalloc((void**) &d_c,size);

    // Assign values to the CPU (HOST) variables
    a = 10;
    b = 11;

    //(CPU->GPU)
    cudaMemcpy(d_a,&a, size, cudaMemcpyHostToDevice);
    cudaMemcpy(d_b,&b, size, cudaMemcpyHostToDevice);


    // 1 block with 1 thread. Note that the variables used are already on the GPU
    suma<<<1,1>>>(d_a,d_b,d_c);


    //(GPU->CPU)
    cudaMemcpy(&c, d_c, size, cudaMemcpyDeviceToHost);



    cudaFree(d_a);
    cudaFree(d_b);
    cudaFree(d_c);

    return c;

}

That code works.

Now I want to use the Thrust library, and I don't know if I have to do the same thing. I have this code:

void boxcount2d_wrapper(std::vector<std::vector<short>> matriz_param, std::vector<int> &n_param, std::vector<int> &r_param)
{
    thrust::host_vector<int> n_host,r_host;
    thrust::device_vector<int> n_device,r_device;

    cudaMalloc((void**) &n_device,0); // They are empty at first
    cudaMalloc((void**) &r_device,0);

    thrust::host_vector<short> matriz_host(width*width);
    thrust::device_vector<short> matriz_device(width*width);
    cudaMemcpy(n_device,n_param, p*sizeof(int), cudaMemcpyHostToDevice);
    cudaMemcpy(r_device,r_param, p*sizeof(int), cudaMemcpyHostToDevice);

    for(auto i = 0; i < matriz_param.size(); i++)
    {
        for(auto j = 0; j < matriz_param.size(); j++)
        {
            matriz_host[i+j]  = matriz_param[i][j];
        }
    }

    cudaMalloc((void**) &matriz_device,matriz_device.size());
    cudaMemcpy(matriz_device,&matriz_host, width*width*sizeof(short), cudaMemcpyHostToDevice);

}


That code doesn't compile. I get this error at the cudaMemcpy calls:

error: no suitable conversion function from "thrust::device_vector<short, thrust::device_allocator<short>>" to "void *" exists

Does a thrust::device_vector allocate directly on the GPU? I don't know what I'm doing wrong.

I'm starting to think that it isn't necessary to allocate thrust::device_vectors manually.


Solution

  • Thrust does all CUDA API calls for you. So while you can use Thrust algorithms on manually allocated memory or pass the memory from a thrust::device_vector to a kernel, you don't need cudaMalloc and cudaMemcpy, as everything is already included in the standard C++ vector interface.
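
    For example, you can pass the memory behind a thrust::device_vector to a hand-written kernel with thrust::raw_pointer_cast. A minimal sketch (my_kernel and launch_example are hypothetical names):

    #include <thrust/device_vector.h>

    __global__ void my_kernel(int *p) { /* ...work on p... */ }

    void launch_example()
    {
        thrust::device_vector<int> v(3, 7); // three ints on the GPU, initialized to 7
        // raw_pointer_cast yields the plain device pointer behind the vector
        int *raw = thrust::raw_pointer_cast(v.data());
        my_kernel<<<1, 1>>>(raw);
        cudaDeviceSynchronize();
    }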

    The memory allocated by a thrust::device_vector lives on the GPU (if you are using one; Thrust can also be used to parallelize on the CPU). So the constructor calls cudaMalloc for you.

    For data transfer you can use thrust::device_vectors and thrust::host_vectors like normal std::vectors (e.g. constructors and operator= are implemented for the different combinations). Thrust knows what kind of memory each type of vector owns and will call cudaMemcpy for you. If this isn't explicit enough for you, you can also use thrust::copy.
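
    A minimal sketch of such transfers (the names are placeholders):

    #include <thrust/host_vector.h>
    #include <thrust/device_vector.h>
    #include <thrust/copy.h>

    void transfer_example()
    {
        thrust::host_vector<int> h(4, 42);           // 4 ints on the host
        thrust::device_vector<int> d = h;            // allocation + host -> device copy
        thrust::copy(d.begin(), d.end(), h.begin()); // explicit device -> host copy
    }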

    Your code could look the following way:

    void boxcount2d_wrapper(std::vector<std::vector<short>> matriz_param, std::vector<int> &n_param, std::vector<int> &r_param)
    {
        thrust::device_vector<int> n_device(n_param);
        thrust::device_vector<int> r_device(r_param);
    
        thrust::host_vector<short> matriz_host(width*width);
    
        for(std::size_t i = 0; i < matriz_param.size(); i++)
        {
            for(std::size_t j = 0; j < matriz_param[i].size(); j++)
            {
                matriz_host[i * width + j] = matriz_param[i][j]; // linear ("lexic") index
            }
        }
    
        thrust::device_vector<short> matriz_device(matriz_host);
        
        // ...do stuff...
    }
    

    thrust::device_vector actually even has a constructor taking a std::vector, so we don't have to waste time on an unnecessary copy to a thrust::host_vector<int> here. For performance reasons (independent of using Thrust) I would recommend not using std::vector<std::vector<T>> for matrices. Instead you should use linear memory with "lexic indexing" (lin_idx = y * width + x;, as the loop above now does). Then you could even get rid of these loops. That being said, Thrust wouldn't be my first choice for matrix operations (where the operation needs row- and/or column-indices), as they are normally more natural to write in a CUDA kernel.
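
    For illustration, a sketch of that flat layout (boxcount2d_wrapper_flat and matriz_flat are hypothetical names; width is assumed to be known, as in your code):

    #include <vector>
    #include <thrust/device_vector.h>

    void boxcount2d_wrapper_flat(const std::vector<short> &matriz_flat)
    {
        // Element (y, x) lives at matriz_flat[y * width + x].
        // The std::vector constructor replaces the copy loops entirely:
        thrust::device_vector<short> matriz_device(matriz_flat);

        // ...do stuff...
    }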