Search code examples
cudathrust

Converting thrust::iterators to and from raw pointers


I want to use Thrust library to calculate prefix sum of device array in CUDA. My array is allocated with cudaMalloc(). My requirement is as follows:

main()  
{  
     Launch kernel 1 on data allocated through cudaMalloc()  
     // This kernel will poplulate some data d.  
     Use thrust to calculate prefix sum of d.  
     Launch kernel 2 on prefix sum.  
}

I want to use Thrust somewhere between my kernels so I need method to convert pointers to device iterators and back.What is wrong in following code?

int main()                                                        
{                                                                 
    int *a;                                                   
    cudaMalloc((void**)&a,N*sizeof(int));   
    thrust::device_ptr<int> d=thrust::device_pointer_cast(a);  
    thrust::device_vector<int> v(N);                    
    thrust::exclusive_scan(a,a+N,v);                          
    return 0;                                                  
}                     

Solution

  • A complete working example from your latest edit would look like this:

    #include <thrust/device_ptr.h>
    #include <thrust/device_vector.h>
    #include <thrust/scan.h>
    #include <thrust/fill.h>
    #include <thrust/copy.h>
    #include <cstdio>
    
    int main()                                                        
    {                                                                 
        const int N = 16;
        int * a;
        cudaMalloc((void**)&a, N*sizeof(int));   
        thrust::device_ptr<int> d = thrust::device_pointer_cast(a);  
        thrust::fill(d, d+N, 2);
        thrust::device_vector<int> v(N);                    
        thrust::exclusive_scan(d, d+N, v.begin());
    
        int v_[N];
        thrust::copy(v.begin(), v.end(), v_);
        for(int i=0; i<N; i++)
            printf("%d %d\n", i, v_[i]);     
    
        return 0;                                                  
    }
    

    The things you got wrong:

    1. N not defined anywhere
    2. passing the raw device pointer a rather than the device_ptr d as the input iterator to exclusive_scan
    3. passing the device_vector v to exclusive_scan rather than the appropriate iterator v.begin()

    Attention to detail was all that is lacking to make this work. And work it does:

    $ nvcc -arch=sm_12 -o thrust_kivekset thrust_kivekset.cu 
    $ ./thrust_kivekset
    
    0 0
    1 2
    2 4
    3 6
    4 8
    5 10
    6 12
    7 14
    8 16
    9 18
    10 20
    11 22
    12 24
    13 26
    14 28
    15 30
    

    Edit:

    thrust::device_vector.data() will return a thrust::device_ptr which points to the first element of the vector. thrust::device_ptr.get() will return a raw device pointer. Therefore

    cudaMemcpy(v_, v.data().get(), N*sizeof(int), cudaMemcpyDeviceToHost);
    

    and

    thrust::copy(v, v+N, v_);
    

    are functionally equivalent in this example.