Search code examples
c++cudathrust

In CUDA / Thrust, how can I access a vector element's neighbor during a for-each operation?


I am trying to do some scientific simulation using Thrust library in CUDA, but I got stuck in the following operation which is basically a for-each loop:

device_vector<float> In(N);

for-each In(x) in In
      Out(x) = some_calculation(In(x-1),In(x),In(x+1));
end

I have already looked up stackoverflow.com and find some similar questions: Similar questions 1

But it seems using a transform iterator is only possible when the some_calculation function is done between 2 parameters, for transform iterator passes two parameters at most.

Then, for question 2: Similar questions 2

The discussion just ended without a conclusion.

I believe this is a simple problem because it's a natural requirements for parallel calculation. Anyone could tell me what to do?


Solution

  • Fancy iterators are the key to this sort of operation, which isn't all that intuitive in thrust. You can use the zip_iterator to create tuples of values which can then be iterated over, so for a typical f(x[i-1], x[i], x[i+1]) type function, you get something like this:

    #include <iostream>
    #include <cmath>
    #include <thrust/iterator/zip_iterator.h>
    #include <thrust/tuple.h>
    #include <thrust/transform.h>
    
    struct divided_diff {
        float dx;
        divided_diff(float _dx) : dx(_dx) {};
    
        float operator()(const thrust::tuple<float, float, float> &in) const {
            float y0 = in.get<0>();
            float y1 = in.get<1>();
            float y2 = in.get<2>();
    
            return (y0 - 2.f * y1 + y2) / (dx * dx);
        }
    };
    
    int main() {
        const int N = 10;
        const float dx = 0.1f;
        float x[N], y[N], dydx[N];
    
        for (int i = 0; i < N; ++i) {
            x[i] = dx * float(i);
            y[i] = std::sin(x[i]);
            dydx[i] = 0.f;
        }
    
        auto begin = thrust::make_zip_iterator(thrust::make_tuple(&y[0], &y[1], &y[2]));
        auto end = thrust::make_zip_iterator(thrust::make_tuple(&y[N-2], &y[N-1], &y[N]));
    
        divided_diff f(dx);
        thrust::transform(begin, end, &dydx[1], f);
    
        for (int i = 0; i < N; ++i) {
            std::cout << i << " " << dydx[i] << std::endl;
        }
    
        return 0;
    }
    

    Here the functor processes one tuple at a time, where the tuple contains the three inputs from three different starting points in the same array or iterative sequence.


    EDIT: Apparently converting a host version of this code to use device constructs was proving challenging for the originally poster, so here is a version which executes everything on the device using thrust::device_vector as the base container:

    #include <iostream>
    #include <cmath>
    #include <thrust/tuple.h>
    #include <thrust/transform.h>
    #include <thrust/iterator/zip_iterator.h>
    #include <thrust/device_vector.h>
    #include <thrust/sequence.h>
    
    struct divided_diff {
        float dx;
        divided_diff(float _dx) : dx(_dx) {};
    
        __device__
        float operator()(const thrust::tuple<float, float, float> &in) {
            float y0 = in.get<0>();
            float y1 = in.get<1>();
            float y2 = in.get<2>();
    
            return (y0 - 2.f*y1 + y2) / (dx * dx);
        }
    };
    
    struct mysinf {
        __device__
        float operator()(const float &x) { 
            return __sinf(x); 
        }
    };
    
    int main()
    {
    
        const int N = 10;
        const float dx = 0.1f;
        thrust::device_vector<float> x(N), y(N), dydx(N-2);
    
        thrust::sequence(x.begin(), x.end(), 0.f, dx); 
        thrust::transform(x.begin(), x.end(), y.begin(), mysinf());
    
        auto start  = thrust::make_zip_iterator(thrust::make_tuple(y.begin(), y.begin()+1, y.begin()+2));
        auto finish = thrust::make_zip_iterator(thrust::make_tuple(y.end()-2, y.end()-1, y.end()));
    
        divided_diff f(dx);
        thrust::transform( start, finish, dydx.begin(), f);
    
        thrust::device_vector<float>::iterator it = dydx.begin();
        for(; it != dydx.end(); ++it) {
            float val = *it;
            std::cout << val << std::endl;
        }
    
        return 0;
    }