I am currently trying to parallelize thrust cuda code that currently runs sequentially in a main function (and therefore does not harness the power of the GPU). I have essentially taken functional code an put it into a functor that thrust::for_each can call using cuda streams. However if I define the functor using
__host__ __device__
VS2013 throws all sorts of warnings saying that I am trying to launch host functions from the device. These errors are occurring in places were I am defining a vector using
thrust::device_vector vect (size_vector);
as well as some thrust::transform functions. It specifically quotes problems with the thrust::device_malloc_allocator. If I define the functor as strictly a host functor these errors all go away, however when I use the profiler it becomes evident that only 0.01% of the device is being used leading me to believe for_each is not actually launching the thrust code in the functor.
EDIT below is some code that compiles and shows this error
#include <iostream>
#include <thrust/device_vector.h>
#include <thrust/host_vector.h>
#include <thrust/sort.h>
#include <thrust/execution_policy.h>
#include <thrust/for_each.h>
#include <thrust/sequence.h>
#include <cstdlib>
#include <ctime>
#include <vector>
#include <algorithm>
#include <memory.h>
#include <cstdio>
#include <thread>
#include <thrust/copy.h>
#include <thrust/iterator/zip_iterator.h>
#include <thrust/reduce.h>
using namespace std;
const int num_segs = 1; // number of segments to sort
const int num_vals = 5; // number of values in each segment
template <typename T>
struct sort_vector
{
T *Ddata;
T *vect3;
T *answer;
sort_vector(T *_Ddata, T *_vect3, float *a) : Ddata(_Ddata), vect3(_vect3), answer(a) {};
__host__ __device__ void operator()(int idx)
{
thrust::sort(thrust::seq, Ddata + idx*num_vals, Ddata + ((idx + 1)*num_vals));
thrust::device_ptr<float> vect3_ptr = thrust::device_pointer_cast(vect3);
thrust::device_vector<float> vect(10, 1);
thrust::device_vector<float> vect2(10, 3);
thrust::transform(thrust::device, vect.begin(), vect.end(), vect2.begin(), vect3_ptr, thrust::minus<float>());
*answer = thrust::reduce(thrust::device, Ddata + idx*num_vals, Ddata + ((idx + 1)*num_vals));
}
};
int main() {
thrust::device_vector<float> d_Ddata(num_segs*num_vals);
d_Ddata[0] = 50;
d_Ddata[1] = 9.5;
d_Ddata[2] = 30;
d_Ddata[3] = 8.1;
d_Ddata[4] = 1;
thrust::device_vector<float> d_Ddata2(num_segs*num_vals);
d_Ddata2[0] = 50;
d_Ddata2[1] = 20.5;
d_Ddata2[2] = 70;
d_Ddata2[3] = 8.1;
d_Ddata2[4] = 1;
thrust::device_vector<float> vect3(10, 0);
thrust::device_vector<float> vect4(10, 0);
cout << "original dut" << endl;
int g = 0;
while (g < num_segs*num_vals){
cout << d_Ddata[g] << endl;
g++;
}
thrust::device_vector<int> d_idxs(num_segs);
thrust::sequence(d_idxs.begin(), d_idxs.end());
thrust::device_vector<float> dv_answer(1);
thrust::device_vector<float> dv_answer2(1);
cudaStream_t s1, s2;
cudaStreamCreate(&s1);
cudaStreamCreate(&s2);
clock_t start;
double duration;
start = clock();
thrust::for_each(thrust::cuda::par.on(s1),
d_idxs.begin(),
d_idxs.end(), sort_vector<float>(thrust::raw_pointer_cast(d_Ddata.data()), thrust::raw_pointer_cast(vect3.data()), thrust::raw_pointer_cast(dv_answer.data())));
thrust::for_each(thrust::cuda::par.on(s2),
d_idxs.begin(),
d_idxs.end(), sort_vector<float>(thrust::raw_pointer_cast(d_Ddata2.data()), thrust::raw_pointer_cast(vect4.data()), thrust::raw_pointer_cast(dv_answer2.data())));
cudaStreamSynchronize(s1);
cudaStreamSynchronize(s2);
cout << "sorted dut" << endl;
int n = 0;
while (n < num_segs*num_vals){
cout << d_Ddata[n] << endl;
n++;
}
cout << "sum" << endl;
cout << dv_answer[0] << endl;
cout << dv_answer2[0] << endl;
cout << "vector subtraction" << endl;
int e = 0;
while (e < 10){
cout << vect3[e] << endl;
e++;
}
cudaStreamDestroy(s1);
cudaStreamDestroy(s2);
duration = (clock() - start) / (double)CLOCKS_PER_SEC;
cout << "time " << duration << endl;
cin.get();
return 0;
}
Is it possible that thrust::for_each cannot call __host__
functors?
Are some thrust calls innately connected to the host behind the scenes?
The only potential workaround I can see is creating a __host__ __device__
fucntor that has separate host and device defined code within it. It is also possible that I have missed something while researching this subject. Any advice would be greatly appreciated.
These errors are occurring in places were I am defining a vector
As the compiler is clearly telling you, the problem is that the constructor and all the operators defined within thrust::vector
are currently host only functions. It is illegal to try to use them in a __device__
function.
There is no solution other than not attempting to instantiate a vector within device code.