
CUDA/Thrust: How to sum the columns of an interleaved array?


Using Thrust it's straightforward to sum the rows of a flattened (i.e. backed by a vector) matrix, as shown in the example here.

What I would like to do instead is sum the columns of the array.

I tried using a similar construction, i.e.:

// convert a linear index to a column index
template <typename T>
struct linear_index_to_col_index : public thrust::unary_function<T,T>
{
  T C; // number of columns

  __host__ __device__
  linear_index_to_col_index(T C) : C(C) {}

  __host__ __device__
  T operator()(T i)
  {
    return i % C;
  }
};

// allocate storage for column sums and indices
thrust::device_vector<int> col_sums(C);
thrust::device_vector<int> col_indices(C);

// compute column sums by summing values with equal column indices
thrust::reduce_by_key
  (thrust::make_transform_iterator(thrust::counting_iterator<int>(0), linear_index_to_col_index<int>(C)),
   thrust::make_transform_iterator(thrust::counting_iterator<int>(0), linear_index_to_col_index<int>(C)) + (R*C),
   array.begin(),
   col_indices.begin(),
   col_sums.begin(),
   thrust::equal_to<int>(),
   thrust::plus<int>());

However, this only sums the first column; the rest are ignored. My guess for why this happens is that, as noted in the reduce_by_key docs:

For each group of consecutive keys in the range `[keys_first, keys_last)` that are equal, `reduce_by_key` copies the first element of the group to `keys_output`. [Emphasis mine]

If my understanding is correct, because the keys produced by the row-index iterator are consecutive (i.e. indices 0 through C-1 all map to 0, then indices C through 2C-1 map to 1, and so on), each group of values sharing a key ends up being summed together.

But the column-index iterator maps indices 0 through C-1 to 0 through C-1 and then starts over: indices C through 2C-1 are again mapped to 0 through C-1, and so on, so equal keys are not consecutive.

This behavior is unintuitive to me; I expected all data points assigned to the same key to be grouped together, but that's another discussion.

In any case, my question is: How would one sum the columns of an interleaved array using Thrust?


Solution

  • These operations (summing rows, summing columns, etc.) are generally memory-bandwidth bound on the GPU. Therefore, we may want to consider how to construct an algorithm that will make optimal use of the GPU memory bandwidth. In particular we would like our underlying memory accesses generated from thrust code to be coalesced, if possible. In a nutshell, this means that adjacent GPU threads will read from adjacent locations in memory.

    The original row-summing example displays this property: adjacent threads spawned by thrust will read adjacent elements in memory. For example, if we have R rows, then we can see that the first R threads created by thrust will all be reading the first "row" of the matrix, during the reduce_by_key operation. Since the memory locations associated with the first row are all grouped together, we get coalesced access.

    One approach to solving this problem (how to sum the columns) would be to use a similar strategy to the row-summing example, but use a permutation_iterator to cause the threads that are all part of the same key sequence to read a column of data instead of a row of data. This permutation iterator would take the underlying array and also a mapping sequence. This mapping sequence is created by a transform_iterator using a special functor applied to a counting_iterator, to convert the linear (row-major) index into a column-major index, so that the first C threads will read the elements of the first column of the matrix, instead of the first row. Since the first C threads will belong to the same key-sequence, they will get summed together in the reduce_by_key operation. This is what I call method 1 in the code below.

    However, this method suffers the drawback that adjacent threads are no longer reading adjacent values in memory - we have broken coalescing, and as we shall see, the performance impact is noticeable.

    For large matrices stored in row-major order in memory (the ordering we have been discussing in this problem), a fairly optimal method of summing the columns is to have each thread sum an individual column with a for-loop. This is fairly straightforward to implement in CUDA C, and we can similarly perform this operation in Thrust with an appropriately defined functor.

I'm referring to this as method 2 in the code below. This method will only launch as many threads as there are columns in the matrix. For a matrix with a sufficiently large number of columns (say 10,000 or more) this method will saturate the GPU and efficiently use the available memory bandwidth. If you inspect the functor, you will see that it is a somewhat "unusual" adaptation of Thrust, but perfectly legal.

    Here's the code comparing both methods:

    $ cat t994.cu
    #include <thrust/device_vector.h>
    #include <thrust/reduce.h>
    #include <thrust/iterator/permutation_iterator.h>
    #include <thrust/iterator/transform_iterator.h>
    #include <thrust/iterator/counting_iterator.h>
    #include <thrust/functional.h>
    #include <thrust/sequence.h>
    #include <thrust/transform.h>
    
    #include <iostream>
    
    #define NUMR 1000
    #define NUMC 20000
    #define TEST_VAL 1
    
    #include <time.h>
    #include <sys/time.h>
    #define USECPSEC 1000000ULL
    
    long long dtime_usec(unsigned long long start){
    
      timeval tv;
      gettimeofday(&tv, 0);
      return ((tv.tv_sec*USECPSEC)+tv.tv_usec)-start;
    }
    
    
    typedef int mytype;
    
    // from a linear index in a column-major traversal of the matrix,
    // return the row-major (storage) offset of that element
    struct rm2cm_idx_functor : public thrust::unary_function<int, int>
    {
      int r;
      int c;
    
      rm2cm_idx_functor(int _r, int _c) : r(_r), c(_c) {};
    
      __host__ __device__
      int operator() (int idx)  {
        unsigned my_r = idx%r;       // row within the current column
        unsigned my_c = idx/r;       // which column is being traversed
        return (my_r * c) + my_c;    // row-major offset of element (my_r, my_c)
      }
    };
    
    
    // convert a linear index to a column index
    template <typename T>
    struct linear_index_to_col_index : public thrust::unary_function<T,T>
    {
      T R; // number of rows
    
      __host__ __device__
      linear_index_to_col_index(T R) : R(R) {}
    
      __host__ __device__
      T operator()(T i)
      {
        return i / R;
      }
    };
    
    struct sum_functor
    {
      int R;
      int C;
      mytype *arr;
    
      sum_functor(int _R, int _C, mytype *_arr) : R(_R), C(_C), arr(_arr) {};
    
      __host__ __device__
      mytype operator()(int myC){
        mytype sum = 0;
        for (int i = 0; i < R; i++) sum += arr[i*C+myC];
        return sum;
      }
    };
    
    
    
    int main(){
      int C = NUMC;
      int R = NUMR;
      thrust::device_vector<mytype> array(R*C, TEST_VAL);
    
    // method 1: permutation iterator
    
    // allocate storage for column sums and indices
      thrust::device_vector<mytype> col_sums(C);
      thrust::device_vector<int> col_indices(C);
    
    // compute column sums by summing values with equal column indices
      unsigned long long m1t = dtime_usec(0);
      thrust::reduce_by_key(thrust::make_transform_iterator(thrust::counting_iterator<int>(0), linear_index_to_col_index<int>(R)),
       thrust::make_transform_iterator(thrust::counting_iterator<int>(R*C), linear_index_to_col_index<int>(R)),
       thrust::make_permutation_iterator(array.begin(), thrust::make_transform_iterator(thrust::make_counting_iterator<int>(0), rm2cm_idx_functor(R, C))),
       col_indices.begin(),
       col_sums.begin(),
       thrust::equal_to<int>(),
       thrust::plus<int>());
      cudaDeviceSynchronize();
      m1t = dtime_usec(m1t);
      for (int i = 0; i < C; i++)
        if (col_sums[i] != R*TEST_VAL) {std::cout << "method 1 mismatch at: " << i << " was: " << col_sums[i] << " should be: " << R*TEST_VAL << std::endl; return 1;}
      std::cout << "Method1 time: " << m1t/(float)USECPSEC << "s" << std::endl;
    
    // method 2: column-summing functor
    
      thrust::device_vector<mytype> fcol_sums(C);
      thrust::sequence(fcol_sums.begin(), fcol_sums.end());  // start with column index
      unsigned long long m2t = dtime_usec(0);
      thrust::transform(fcol_sums.begin(), fcol_sums.end(), fcol_sums.begin(), sum_functor(R, C, thrust::raw_pointer_cast(array.data())));
      cudaDeviceSynchronize();
      m2t = dtime_usec(m2t);
      for (int i = 0; i < C; i++)
        if (fcol_sums[i] != R*TEST_VAL) {std::cout << "method 2 mismatch at: " << i << " was: " << fcol_sums[i] << " should be: " << R*TEST_VAL << std::endl; return 1;}
      std::cout << "Method2 time: " << m2t/(float)USECPSEC << "s" << std::endl;
      return 0;
    }
    $ nvcc -O3 -o t994 t994.cu
    $ ./t994
    Method1 time: 0.034817s
    Method2 time: 0.00082s
    $
    

    It's evident that for a sufficiently large matrix, method 2 is substantially faster than method 1.

    If you're unfamiliar with permutation iterators, take a look at the thrust quick start guide.