Search code examples
cudathrust

Is it possible to use thrust::device_ptr on a mapped array?


I am trying to use the thrust::copy_if function on mapped memory. However, as I get a runtime error and I am not being able to find it, before spending a lot of time in debugging, I would like to have a confirmation of the fact that it is effectively allowed to pass a pointer to a mapped memory location to the thrust::device_ptr wrapper.

Here is an example of what I mean:

int size=1024;

int* v_locked;
int* v_device;
int* stencil_device;

device_ptr<int> v_wrapper;
device_ptr<int> v_wrapper_end;
device_ptr<int> stencil_wrapper;

cudaHostAlloc((void**)&v_locked, size*sizeof(int), cudaHostAllocMapped));
cudaHostGetDevicePointer(&v_device, &v_locked, 0);

cudaMalloc((void**)&stencil_device, size*sizeof(int));
/* 
kernel assigning stencil_device elements ...
*/

v_wrapper = device_pointer_cast(v_device);
stencil_wrapper = device_pointer_cast(stencil_device);

v_wrapper_end = copy_if(make_counting_iterator<int>(0), make_counting_iterator<int>(size), stencil_wrapper, v_wrapper, _1 == 1);

Is this a correct usage of mapped memory with thrust library?

Thank you.


Solution

  • Yes, it is possible.

    I believe there were several problems with your code.

    1. You don't appear to be doing any proper cuda error checking If you were, you would have detected that although your calls to cudaHostGetDevicePointer seem to compile correctly, they were not set up correctly.
    2. As mentioned above, your calls to cudaHostGetDevicePointer() were not set up correctly. The second pointer argument is passed as a single pointer (*), not double pointer (**). Refer to the documentation This call as written would throw a cuda runtime error which you can trap.
    3. Prior to your cudaHostAlloc calls, you should use the cudaSetDeviceFlags(cudaDeviceMapHost); call to enable this feature.

    Here is a sample code which seems to work correctly for me, and has the above problems fixed:

    $ cat t281.cu
    #include <iostream>
    #include <thrust/device_vector.h>
    #include <thrust/device_ptr.h>
    #include <thrust/copy.h>   
    
    #define cudaCheckErrors(msg) \
        do { \
            cudaError_t __err = cudaGetLastError(); \
            if (__err != cudaSuccess) { \
                fprintf(stderr, "Fatal error: %s (%s at %s:%d)\n", \
                    msg, cudaGetErrorString(__err), \
                    __FILE__, __LINE__); \
                fprintf(stderr, "*** FAILED - ABORTING\n"); \
                exit(1); \
            } \
        } while (0)
    
    template<typename T>
    struct is_one : thrust::unary_function<T, bool>
    {
        __host__ __device__
        bool operator()(const T &x)
        {
            return (x==1);
        }
    };
    
    int main(){
    
      int size=1024;
    
      int* v_locked;
      int* v_device;
      int* stencil_locked;
      int* stencil_device;
    
      cudaSetDeviceFlags(cudaDeviceMapHost);
      cudaCheckErrors("cudaSetDeviceFlags");
      cudaHostAlloc((void**)&v_locked, size*sizeof(int), cudaHostAllocMapped);
      cudaCheckErrors("cudaHostAlloc 1");
      cudaHostGetDevicePointer(&v_device, v_locked, 0);
      cudaCheckErrors("cudaHostGetDevicePointer 1");
      cudaHostAlloc((void**)&stencil_locked, size*sizeof(int), cudaHostAllocMapped);
      cudaCheckErrors("cudaHostAlloc 2");
      cudaHostGetDevicePointer(&stencil_device, stencil_locked, 0);
      cudaCheckErrors("cudaHostGetDevicePointer 2");
    
      for (int i = 0; i < size; i++){
        v_locked[i] = i;
        stencil_locked[i] = i%2;}
    
      thrust::device_ptr<int> v_wrapper = thrust::device_pointer_cast(v_device);
      thrust::device_ptr<int> stencil_wrapper = thrust::device_pointer_cast(stencil_device);
      thrust::device_ptr<int> v_wrapper_end = v_wrapper + size;
      thrust::device_vector<int> result(size);
      thrust::device_vector<int>::iterator result_end = copy_if(v_wrapper, v_wrapper_end, stencil_wrapper, result.begin(), is_one<int>());
      int result_size = result_end - result.begin();
      thrust::host_vector<int> h_result(result_size);
      thrust::copy_n(result.begin(), result_size, h_result.begin());
      thrust::copy_n(h_result.begin(), 10, std::ostream_iterator<int>(std::cout, " "));
      std::cout << std::endl;
      return 0;
    
    }
    $ nvcc -arch=sm_20 -o t281 t281.cu
    $ ./t281
    1 3 5 7 9 11 13 15 17 19
    $