I am trying to use the thrust::copy_if function on mapped memory. However, as I get a runtime error and I am not being able to find it, before spending a lot of time in debugging, I would like to have a confirmation of the fact that it is effectively allowed to pass a pointer to a mapped memory location to the thrust::device_ptr wrapper.
Here is an example of what I mean:
int size=1024;
int* v_locked;
int* v_device;
int* stencil_device;
device_ptr<int> v_wrapper;
device_ptr<int> v_wrapper_end;
device_ptr<int> stencil_wrapper;
cudaHostAlloc((void**)&v_locked, size*sizeof(int), cudaHostAllocMapped));
cudaHostGetDevicePointer(&v_device, &v_locked, 0);
cudaMalloc((void**)&stencil_device, size*sizeof(int));
/*
kernel assigning stencil_device elements ...
*/
v_wrapper = device_pointer_cast(v_device);
stencil_wrapper = device_pointer_cast(stencil_device);
v_wrapper_end = copy_if(make_counting_iterator<int>(0), make_counting_iterator<int>(size), stencil_wrapper, v_wrapper, _1 == 1);
Is this a correct usage of mapped memory with thrust library?
Thank you.
Yes, it is possible.
I believe there were several problems with your code.
cudaHostGetDevicePointer
seem to compile correctly, they were not set up correctly.cudaHostGetDevicePointer()
were not set up correctly. The second pointer argument is passed as a single pointer (*
), not double pointer (**
). Refer to the documentation This call as written would throw a cuda runtime error which you can trap.cudaHostAlloc
calls, you should use the cudaSetDeviceFlags(cudaDeviceMapHost);
call to enable this feature. Here is a sample code which seems to work correctly for me, and has the above problems fixed:
$ cat t281.cu
#include <iostream>
#include <thrust/device_vector.h>
#include <thrust/device_ptr.h>
#include <thrust/copy.h>
#define cudaCheckErrors(msg) \
do { \
cudaError_t __err = cudaGetLastError(); \
if (__err != cudaSuccess) { \
fprintf(stderr, "Fatal error: %s (%s at %s:%d)\n", \
msg, cudaGetErrorString(__err), \
__FILE__, __LINE__); \
fprintf(stderr, "*** FAILED - ABORTING\n"); \
exit(1); \
} \
} while (0)
template<typename T>
struct is_one : thrust::unary_function<T, bool>
{
__host__ __device__
bool operator()(const T &x)
{
return (x==1);
}
};
int main(){
int size=1024;
int* v_locked;
int* v_device;
int* stencil_locked;
int* stencil_device;
cudaSetDeviceFlags(cudaDeviceMapHost);
cudaCheckErrors("cudaSetDeviceFlags");
cudaHostAlloc((void**)&v_locked, size*sizeof(int), cudaHostAllocMapped);
cudaCheckErrors("cudaHostAlloc 1");
cudaHostGetDevicePointer(&v_device, v_locked, 0);
cudaCheckErrors("cudaHostGetDevicePointer 1");
cudaHostAlloc((void**)&stencil_locked, size*sizeof(int), cudaHostAllocMapped);
cudaCheckErrors("cudaHostAlloc 2");
cudaHostGetDevicePointer(&stencil_device, stencil_locked, 0);
cudaCheckErrors("cudaHostGetDevicePointer 2");
for (int i = 0; i < size; i++){
v_locked[i] = i;
stencil_locked[i] = i%2;}
thrust::device_ptr<int> v_wrapper = thrust::device_pointer_cast(v_device);
thrust::device_ptr<int> stencil_wrapper = thrust::device_pointer_cast(stencil_device);
thrust::device_ptr<int> v_wrapper_end = v_wrapper + size;
thrust::device_vector<int> result(size);
thrust::device_vector<int>::iterator result_end = copy_if(v_wrapper, v_wrapper_end, stencil_wrapper, result.begin(), is_one<int>());
int result_size = result_end - result.begin();
thrust::host_vector<int> h_result(result_size);
thrust::copy_n(result.begin(), result_size, h_result.begin());
thrust::copy_n(h_result.begin(), 10, std::ostream_iterator<int>(std::cout, " "));
std::cout << std::endl;
return 0;
}
$ nvcc -arch=sm_20 -o t281 t281.cu
$ ./t281
1 3 5 7 9 11 13 15 17 19
$