Need help optimizing thrust cuda code with nested iterator transform_reduce operations

I am working on code I would like to execute efficiently on a GPU. Most of the code has been easy to vectorize and prepare for parallel execution. There are several nice examples on Stack Overflow that have helped me with the standard nested iterators. I have one section I have not been able to successfully condense into an efficient thrust construct. I have taken that section of my code and made a minimum reproducible example. Any advice or hint on how to structure this code would be appreciated.

Thanks

#include <algorithm>
#include <iostream>
#include <numeric>
#include <vector>
#include <ctime>

#include <thrust/reduce.h>
#include <thrust/device_vector.h>

typedef  thrust::device_vector<double> tDoubleVecDevice;
typedef  tDoubleVecDevice::iterator tDoubleVecDeviceIter;

struct functorB{

    template <typename T>
__host__ __device__
    double operator()(const T &my_tuple){  // do some math
           return ( fmod((thrust::get<0>(my_tuple) * thrust::get<1>(my_tuple)),1.0) );
    }
};
struct functorC {

    template <typename T>
__host__ __device__
    double operator()(const T &my_tuple){   // do some math
           double distance = fabs( fmod((thrust::get<0>(my_tuple) - thrust::get<1>(my_tuple)),1.0));
           return((fmin( distance, 1.0 - distance)) / (5.0));
    }
};

int main(void)
{

    tDoubleVecDevice resF(36);
    tDoubleVecDevice freqI(36);


    tDoubleVecDevice trialTs(128);

    std::srand(std::time(nullptr));
    for(tDoubleVecDeviceIter tIter = trialTs.begin();tIter < trialTs.end(); tIter++ ){
        (*tIter) = rand() % 10 + 1.5;  // make some random numbers
    }
    for(tDoubleVecDeviceIter rIter = resF.begin(), fIter = freqI.begin();fIter < resF.end(); rIter++ ,fIter++){
        (*fIter) = rand() % 10 + 1.5;  // make some random numbers
        (*rIter) = rand() % 10 + 1.5;  // make some random numbers
    }

    tDoubleVecDevice trialRs(36);
    tDoubleVecDevice errorVect(128);


     for( tDoubleVecDeviceIter itTrial = trialTs.begin(), itError = errorVect.begin(); itTrial != trialTs.end(); itTrial++,itError++){


            thrust::transform( (thrust::make_zip_iterator(thrust::make_tuple(thrust::make_constant_iterator<double>(*itTrial), freqI.begin()))),
                               (thrust::make_zip_iterator(thrust::make_tuple(thrust::make_constant_iterator<double>(*itTrial)+36, freqI.end()))),
                               trialRs.begin() ,functorB());

            (*itError) =thrust::transform_reduce(
                                             thrust::make_zip_iterator(thrust::make_tuple(trialRs.begin(),resF.begin())),
                                             thrust::make_zip_iterator(thrust::make_tuple(trialRs.end(),resF.end())),
                                             functorC(),(double) 0,thrust::plus<double>()
                                            );
     }
// finds the index of the minimum element;
    int  minElementIndex = thrust::min_element(errorVect.begin(),errorVect.end()) - errorVect.begin();
    double  result = trialTs[minElementIndex];

    std::cout << "result = " << result;

    return 0;
}

Solution

It looks like you need to expand your trialsTs,trialsRs,errorVect,freqI and resF vectors to 4608 elements. This will allow you to vectorize the loops. Derive a class from thrust::iterator_adaptor to make a cyclic iterator to expand your freqI and resF to create repeated sequences of the data in those vectors.

After you run your functors use a reduce by key transform to create your error result with each 36 element trial.

Give that a try and if you get stuck I will provide some additional code.