I am working on code that I would like to execute efficiently on a GPU. Most of the code has been easy to vectorize and prepare for parallel execution, and several nice examples on Stack Overflow have helped me with the standard nested iterators. However, there is one section that I have not been able to condense into an efficient Thrust construct. I have extracted that section of my code into a minimal reproducible example. Any advice or hints on how to structure this code would be appreciated.
Thanks
#include <algorithm>
#include <cmath>
#include <cstdlib>
#include <ctime>
#include <iostream>
#include <numeric>
#include <vector>

#include <thrust/device_vector.h>
#include <thrust/extrema.h>
#include <thrust/functional.h>
#include <thrust/iterator/constant_iterator.h>
#include <thrust/iterator/zip_iterator.h>
#include <thrust/reduce.h>
#include <thrust/transform.h>
#include <thrust/transform_reduce.h>
#include <thrust/tuple.h>
// Shorthand for a Thrust device vector of doubles.
using tDoubleVecDevice = thrust::device_vector<double>;
// Iterator type of tDoubleVecDevice.
using tDoubleVecDeviceIter = tDoubleVecDevice::iterator;
// Unary functor over a tuple-like argument: multiplies tuple elements 0 and 1
// and returns the floating-point remainder of that product divided by 1.0.
struct functorB {
    template <typename T>
    __host__ __device__
    double operator()(const T &my_tuple) {
        // Product of the two tuple components.
        const double product = thrust::get<0>(my_tuple) * thrust::get<1>(my_tuple);
        // Remainder with respect to 1.0 (fmod keeps the sign of the product).
        return fmod(product, 1.0);
    }
};
// Unary functor over a tuple-like argument: takes the difference of tuple
// elements 0 and 1, reduces it modulo 1.0, and returns the smaller of that
// distance and its complement to 1.0, divided by 5.0.
struct functorC {
    template <typename T>
    __host__ __device__
    double operator()(const T &my_tuple) {
        const double difference = thrust::get<0>(my_tuple) - thrust::get<1>(my_tuple);
        // Magnitude of the remainder modulo 1.0.
        const double distance = fabs(fmod(difference, 1.0));
        // The other way around the unit interval.
        const double complement = 1.0 - distance;
        const double shortest = fmin(distance, complement);
        return shortest / 5.0;
    }
};
int main(void)
{
tDoubleVecDevice resF(36);
tDoubleVecDevice freqI(36);
tDoubleVecDevice trialTs(128);
std::srand(std::time(nullptr));
for(tDoubleVecDeviceIter tIter = trialTs.begin();tIter < trialTs.end(); tIter++ ){
(*tIter) = rand() % 10 + 1.5; // make some random numbers
}
for(tDoubleVecDeviceIter rIter = resF.begin(), fIter = freqI.begin();fIter < resF.end(); rIter++ ,fIter++){
(*fIter) = rand() % 10 + 1.5; // make some random numbers
(*rIter) = rand() % 10 + 1.5; // make some random numbers
}
tDoubleVecDevice trialRs(36);
tDoubleVecDevice errorVect(128);
for( tDoubleVecDeviceIter itTrial = trialTs.begin(), itError = errorVect.begin(); itTrial != trialTs.end(); itTrial++,itError++){
thrust::transform( (thrust::make_zip_iterator(thrust::make_tuple(thrust::make_constant_iterator<double>(*itTrial), freqI.begin()))),
(thrust::make_zip_iterator(thrust::make_tuple(thrust::make_constant_iterator<double>(*itTrial)+36, freqI.end()))),
trialRs.begin() ,functorB());
(*itError) =thrust::transform_reduce(
thrust::make_zip_iterator(thrust::make_tuple(trialRs.begin(),resF.begin())),
thrust::make_zip_iterator(thrust::make_tuple(trialRs.end(),resF.end())),
functorC(),(double) 0,thrust::plus<double>()
);
}
// finds the index of the minimum element;
int minElementIndex = thrust::min_element(errorVect.begin(),errorVect.end()) - errorVect.begin();
double result = trialTs[minElementIndex];
std::cout << "result = " << result;
return 0;
}
It looks like you need to expand your trialTs, trialRs, errorVect, freqI and resF vectors to 4608 elements (128 trials × 36 frequencies). This will allow you to vectorize the loops. Derive a class from thrust::iterator_adaptor to make a cyclic iterator that expands freqI and resF into repeated sequences of the data in those vectors.
After you run your functors, use thrust::reduce_by_key to compute the error result for each 36-element trial.
Give that a try and if you get stuck I will provide some additional code.