I am new to C++ and CUDA coding and have written a program that I am hoping parallelize since it is currently only using 25% of the GPU according to the NSIGHT profiler.
Below, I have written a toy program to try and implement cuda streams using thrust::for_each(), but I cannot seem to modify the array. I am accustomed to thrust::transform() where a return array is provided in the call.
All the examples I seem to find are either just using the for_each call to print, or do not do anything with the modified array.
When I run this program it just returns an array of zeros.
#include <iostream>
#include <fstream>
#include <sstream>
#include <string>
#include <vector>
#include <iterator>
#include <stdlib.h>
#include <cuda.h>
#include <cuda_runtime.h>
#include <device_launch_parameters.h>
#include <thrust/functional.h>
#include <thrust/device_vector.h>
#include <thrust/host_vector.h>
#include <thrust/reduce.h>
#include <thrust/transform_reduce.h>
#include <thrust/transform.h>
#include <thrust/sort.h>
#include <thrust/execution_policy.h>
#include <thrust/system/cuda/execution_policy.h>
#include <thrust/tuple.h>
#include <thrust/count.h>
#include <thrust/sequence.h>
#include <thrust/iterator/zip_iterator.h>
#include <thrust/for_each.h>
#include <ctime>
#include <cstdio>
#include <cassert>
using namespace std;
//define typedef for iterators for shorthand
typedef thrust::device_vector<float>::iterator normIter;
typedef thrust::device_vector<float>::iterator deltaIter;
typedef thrust::device_vector<float>::iterator gammaskIter;
typedef thrust::device_vector<float>::iterator zetmaskIter;
typedef thrust::device_vector<float>::iterator zetvalIter;
//typedef thrust::zip_iterator<tpl2intiter> idxzip;
//typedef a tuple of these iterators
typedef thrust::tuple<normIter, deltaIter, gammaskIter, zetmaskIter, zetvalIter> IteratorTuple;
//typedef the zip_iterator for this tuple
typedef thrust::zip_iterator<IteratorTuple> ZipIterator;
//structure that takes takes the absolute value of a given number
template<typename T>
struct my_function
{
cudaStream_t s;
my_function(cudaStream_t s) : s(s) {}
__host__ __device__ float operator()(thrust::tuple<float, float> x)
{
float y = thrust::get<0>(x);
return thrust::get<1>(x) = y + 5;
}
};
int main() {
clock_t start;
double duration;
start = clock();
thrust::device_vector<float> d_fraction(5);
d_fraction[0] = 1;
d_fraction[1] = 5;
d_fraction[2] = 3;
d_fraction[3] = 2;
d_fraction[4] = 4;
thrust::device_vector<float> d_fraction2(5);
d_fraction2[0] = 0.00;
d_fraction2[1] = 0.04;
d_fraction2[2] = 0.08;
d_fraction2[3] = 0.12;
d_fraction2[4] = 0.16;
cout << "original" << endl;
int f = 0;
while (f < 5){
cout << d_fraction[f] << endl;
f++;
}
cout << "original" << endl;
int y = 0;
while (y < 5){
cout << d_fraction2[y] << endl;
y++;
}
cudaStream_t s1, s2;
cudaStreamCreate(&s1);
cudaStreamCreate(&s2);
thrust::device_vector<float> result1(5);
thrust::device_vector<float> result2(5);
thrust::for_each(thrust::cuda::par.on(s1), thrust::make_zip_iterator(thrust::make_tuple(d_fraction.begin(), result1.begin())),
thrust::make_zip_iterator(thrust::make_tuple(d_fraction.end(), result1.end())), my_function<float>(s1));
thrust::for_each(thrust::cuda::par.on(s2), thrust::make_zip_iterator(thrust::make_tuple(d_fraction2.begin(), result2.begin())),
thrust::make_zip_iterator(thrust::make_tuple(d_fraction2.end(), result2.end())), my_function<float>(s2));
cudaStreamSynchronize(s1);
cudaStreamSynchronize(s2);
cout << "norm" << endl;
int i = 0;
while (i < 5){
cout << result1[i] << endl;
i++;
}
cout << "dut" << endl;
int a = 0;
while (a < 5){
cout << result2[a] << endl;
a++;
}
cudaStreamDestroy(s1);
cudaStreamDestroy(s2);
duration = (clock() - start) / (double)CLOCKS_PER_SEC;
cout << "time " << duration << endl;
cin.get();
return 0;
}
With thrust::transform
, the return value of the functor operator is the value that is assigned to the output iterator. As you've discovered, with thrust::for_each
(which takes only an input iterator), this is not the case.
Therefore when we want to use thrust::for_each
to modify the elements of a vector (as opposed to just printing something out) we must use the provided tuple (input to the functor operator) as our path to do so.
You may have been trying to do that here:
return thrust::get<1>(x) = y + 5;
but this does not work because the return value of the functor operator (in this case a float
so also not matching the iterator dereference type, which is a tuple) is not used for that purpose, and the modification of the input x
tuple is not having the desired effect because your formulation passes the tuple to the functor by value:
__host__ __device__ float operator()(thrust::tuple<float, float> x)
^
tuple passed by value
In C (or C++), when we pass a parameter by value, modifications made to that argument will not show up in the calling environment. The usual solution is to pass by reference, so that modifications made in the functor will show up in the calling environment (ie. in the input vectors, for thrust::for_each
).
If we make an attempt to do so as follows:
__host__ __device__ float operator()(thrust::tuple<float, float> &x)
we get some compile errors which are instructive:
$ nvcc t1188.cu -o t1188
/usr/local/cuda/bin/..//include/thrust/detail/function.h(60): error: function "my_function<T>::operator() [with T=float]" cannot be called with the given argument list
argument types are: (thrust::detail::tuple_of_iterator_references<float &, float &, thrust::null_type, thrust::null_type, thrust::null_type, thrust::null_type, thrust::null_type, thrust::null_type, thrust::null_type, thrust::null_type>)
NOTE ^^^^^^^^^^^^^^^^
and hopefully lead us to this formulation:
__host__ __device__ float operator()(thrust::tuple<float &, float &> x)
which works and has the desired effect:
$ ./t1188
original
1
5
3
2
4
original
0
0.04
0.08
0.12
0.16
norm
6
10
8
7
9
dut
5
5.04
5.08
5.12
5.16
time 0.488422
A few other things I noticed:
I'm not sure what you are trying to accomplish by passing the stream parameter to the functor. There is nothing you could do with it in the functor, so that is not necessary.
You have a templating definition in front of your my_function
functor which serves no purpose (the template type T
is not used anywhere in the functor that I can see). Perhaps this is just left over from your coding efforts. It's sometimes possible to avoid having to remember this tuple-of-references configuration on the input parameter by templating the type of the input parameter for the functor, but you don't seem to be doing that.