In CUDA / Thrust, how can I access a vector element's neighbor during a for-each operation?

I am trying to do some scientific simulation using Thrust library in CUDA, but I got stuck in the following operation which is basically a for-each loop:

device_vector<float> In(N);

for-each In(x) in In
      Out(x) = some_calculation(In(x-1),In(x),In(x+1));
end

I have already looked up stackoverflow.com and find some similar questions: Similar questions 1

But it seems using a transform iterator is only possible when the some_calculation function is done between 2 parameters, for transform iterator passes two parameters at most.

Then, for question 2: Similar questions 2

The discussion just ended without a conclusion.

I believe this is a simple problem because it's a natural requirements for parallel calculation. Anyone could tell me what to do?

Solution

Fancy iterators are the key to this sort of operation, which isn't all that intuitive in thrust. You can use the zip_iterator to create tuples of values which can then be iterated over, so for a typical f(x[i-1], x[i], x[i+1]) type function, you get something like this:

#include <iostream>
#include <cmath>
#include <thrust/iterator/zip_iterator.h>
#include <thrust/tuple.h>
#include <thrust/transform.h>

struct divided_diff {
    float dx;
    divided_diff(float _dx) : dx(_dx) {};

    float operator()(const thrust::tuple<float, float, float> &in) const {
        float y0 = in.get<0>();
        float y1 = in.get<1>();
        float y2 = in.get<2>();

        return (y0 - 2.f * y1 + y2) / (dx * dx);
    }
};

int main() {
    const int N = 10;
    const float dx = 0.1f;
    float x[N], y[N], dydx[N];

    for (int i = 0; i < N; ++i) {
        x[i] = dx * float(i);
        y[i] = std::sin(x[i]);
        dydx[i] = 0.f;
    }

    auto begin = thrust::make_zip_iterator(thrust::make_tuple(&y[0], &y[1], &y[2]));
    auto end = thrust::make_zip_iterator(thrust::make_tuple(&y[N-2], &y[N-1], &y[N]));

    divided_diff f(dx);
    thrust::transform(begin, end, &dydx[1], f);

    for (int i = 0; i < N; ++i) {
        std::cout << i << " " << dydx[i] << std::endl;
    }

    return 0;
}

Here the functor processes one tuple at a time, where the tuple contains the three inputs from three different starting points in the same array or iterative sequence.

EDIT: Apparently converting a host version of this code to use device constructs was proving challenging for the originally poster, so here is a version which executes everything on the device using thrust::device_vector as the base container:

#include <iostream>
#include <cmath>
#include <thrust/tuple.h>
#include <thrust/transform.h>
#include <thrust/iterator/zip_iterator.h>
#include <thrust/device_vector.h>
#include <thrust/sequence.h>

struct divided_diff {
    float dx;
    divided_diff(float _dx) : dx(_dx) {};

    __device__
    float operator()(const thrust::tuple<float, float, float> &in) {
        float y0 = in.get<0>();
        float y1 = in.get<1>();
        float y2 = in.get<2>();

        return (y0 - 2.f*y1 + y2) / (dx * dx);
    }
};

struct mysinf {
    __device__
    float operator()(const float &x) { 
        return __sinf(x); 
    }
};

int main()
{

    const int N = 10;
    const float dx = 0.1f;
    thrust::device_vector<float> x(N), y(N), dydx(N-2);

    thrust::sequence(x.begin(), x.end(), 0.f, dx); 
    thrust::transform(x.begin(), x.end(), y.begin(), mysinf());

    auto start  = thrust::make_zip_iterator(thrust::make_tuple(y.begin(), y.begin()+1, y.begin()+2));
    auto finish = thrust::make_zip_iterator(thrust::make_tuple(y.end()-2, y.end()-1, y.end()));

    divided_diff f(dx);
    thrust::transform( start, finish, dydx.begin(), f);

    thrust::device_vector<float>::iterator it = dydx.begin();
    for(; it != dydx.end(); ++it) {
        float val = *it;
        std::cout << val << std::endl;
    }

    return 0;
}