Search code examples
parallel-processingcudagpgputhrust

How make a stride chunk iterator thrust cuda


I need a class iterator like this

https://github.com/thrust/thrust/blob/master/examples/strided_range.cu

but that this new iterator do the next sequence

[k * size_stride, k * size_stride+1, ...,k * size_stride+size_chunk-1,...]

with

k = 0,1,...,N

Example:

size_stride = 8
size_chunk = 3
N = 3

then the sequence is

[0,1,2,8,9,10,16,17,18,24,25,26]

I don't know how do this efficiently...


Solution

  • The strided range interator is basically a carefully crafted permutation iterator with a functor that gives the appropriate indices for permutation.

    Here is a modification to the strided range iterator example. The main changes were:

    1. include the chunk size as an iterator parameter
    2. modify the functor that provides the indices for the permutation iterator to spit out the desired sequence
    3. adjust the definitions of .end() iterator to provide the appropriate length of sequence.

    Worked example:

    $ cat t1280.cu
    #include <thrust/iterator/counting_iterator.h>
    #include <thrust/iterator/transform_iterator.h>
    #include <thrust/iterator/permutation_iterator.h>
    #include <thrust/functional.h>
    #include <thrust/fill.h>
    #include <thrust/device_vector.h>
    #include <thrust/copy.h>
    #include <thrust/sequence.h>
    #include <iostream>
    #include <assert.h>
    
    // this example illustrates how to make strided-chunk access to a range of values
    // examples:
    //   strided_chunk_range([0, 1, 2, 3, 4, 5, 6], 1,1) -> [0, 1, 2, 3, 4, 5, 6]
    //   strided_chunk_range([0, 1, 2, 3, 4, 5, 6], 2,1) -> [0, 2, 4, 6]
    //   strided_chunk_range([0, 1, 2, 3, 4, 5, 6], 3,2) -> [0 ,1, 3, 4, 6]
    //   ...
    
    template <typename Iterator>
    class strided_chunk_range
    {
        public:
    
        typedef typename thrust::iterator_difference<Iterator>::type difference_type;
    
        struct stride_functor : public thrust::unary_function<difference_type,difference_type>
        {
            difference_type stride;
            int chunk;
            stride_functor(difference_type stride, int chunk)
                : stride(stride), chunk(chunk) {}
    
            __host__ __device__
            difference_type operator()(const difference_type& i) const
            {
                int pos = i/chunk;
                return ((pos * stride) + (i-(pos*chunk)));
            }
        };
    
        typedef typename thrust::counting_iterator<difference_type>                   CountingIterator;
        typedef typename thrust::transform_iterator<stride_functor, CountingIterator> TransformIterator;
        typedef typename thrust::permutation_iterator<Iterator,TransformIterator>     PermutationIterator;
    
        // type of the strided_range iterator
        typedef PermutationIterator iterator;
    
        // construct strided_range for the range [first,last)
        strided_chunk_range(Iterator first, Iterator last, difference_type stride, int chunk)
            : first(first), last(last), stride(stride), chunk(chunk) {assert(chunk<=stride);}
    
        iterator begin(void) const
        {
            return PermutationIterator(first, TransformIterator(CountingIterator(0), stride_functor(stride, chunk)));
        }
    
        iterator end(void) const
        {
            int lmf = last-first;
            int nfs = lmf/stride;
            int rem = lmf-(nfs*stride);
            return begin() + (nfs*chunk) + ((rem<chunk)?rem:chunk);
        }
    
        protected:
        Iterator first;
        Iterator last;
        difference_type stride;
        int chunk;
    };
    
    int main(void)
    {
        thrust::device_vector<int> data(50);
        thrust::sequence(data.begin(), data.end());
    
        typedef thrust::device_vector<int>::iterator Iterator;
    
        // create strided_chunk_range
        std::cout << "stride 3, chunk 2, length 7" << std::endl;
        strided_chunk_range<Iterator> scr1(data.begin(), data.begin()+7, 3, 2);
        thrust::copy(scr1.begin(), scr1.end(), std::ostream_iterator<int>(std::cout, " "));  std::cout << std::endl;
        std::cout << "stride 8, chunk 3, length 50" << std::endl;
        strided_chunk_range<Iterator> scr(data.begin(), data.end(), 8, 3);
        thrust::copy(scr.begin(), scr.end(), std::ostream_iterator<int>(std::cout, " "));  std::cout << std::endl;
    
        return 0;
    }
    $ nvcc -arch=sm_35 -o t1280 t1280.cu
    $ ./t1280
    stride 3, chunk 2, length 7
    0 1 3 4 6
    stride 8, chunk 3, length 50
    0 1 2 8 9 10 16 17 18 24 25 26 32 33 34 40 41 42 48 49
    $
    
    1. This is probably not the most optimal implementation, in particular because we are doing division in the permutation functor, but it should get you started.

    2. I assume (and test for) chunk<=stride, because this seemed reasonable to me, and simplified my thought process. I'm sure it could be modified, with an appropriate example of what sequence you would like to see, for the case where chunk>stride.