How could I compute the order of elements in each row of a matrix with cuda?

I am finding how could I do argsort with cuda/thrust along rows or colums of a matrix.

It means that given a matrix such as:

A = [[ 3.4257, -1.2345,  0.6232, -0.1354], 
     [-1.6639,  0.1557, -0.1763,  1.0257], 
     [0.6863,  0.0992,  1.4487,  0.0157]].

And I need to compute the order of elements in each row, so the output is:

index = [[1, 3, 2, 0],
         [0, 2, 1, 3],
         [3, 1, 0, 2]]

How could I do this please?

Solution

This is possible using thrust::sort. We need a set of row indices and a set of column indices. The row indices are to make sure the sorting order is segmented among the rows. The column indices are what will give us the result, after sorting.

zip together the value, row index, column index. Create a sort functor that orders rows first, then values. The output is the rearranged column indices.

$ cat t114.cu
#include <thrust/sort.h>
#include <thrust/iterator/zip_iterator.h>
#include <thrust/device_vector.h>
#include <thrust/host_vector.h>
#include <iostream>

using namespace thrust::placeholders;

struct my_sort_functor
{
  template <typename T1, typename T2>
  __host__ __device__
  bool operator()(const T1 &t1, const T2 &t2){
    if  (thrust::get<1>(t1) < thrust::get<1>(t2)) return true;
    if  (thrust::get<1>(t1) > thrust::get<1>(t2)) return false;
    if  (thrust::get<0>(t1) < thrust::get<0>(t2)) return true;
    return false;
  }
};

typedef float mt;
typedef int it;

int main(){

  mt A[] = { 3.4257, -1.2345,  0.6232, -0.1354,
            -1.6639,  0.1557, -0.1763,  1.0257,
             0.6863,  0.0992,  1.4487,  0.0157};
  const int rows = 3;
  const int cols = 4;
  thrust::device_vector<mt> d_A(A, A+rows*cols);
  thrust::device_vector<it> row_idx(d_A.size());
  thrust::device_vector<it> col_idx(d_A.size());
  thrust::sequence(row_idx.begin(), row_idx.end());
  thrust::sequence(col_idx.begin(), col_idx.end());
  thrust::transform(row_idx.begin(), row_idx.end(), row_idx.begin(), _1/cols);
  thrust::transform(col_idx.begin(), col_idx.end(), col_idx.begin(), _1%cols);
  auto my_zip_iterator = thrust::make_zip_iterator(thrust::make_tuple(d_A.begin(), row_idx.begin(), col_idx.begin()));
  thrust::sort(my_zip_iterator, my_zip_iterator+rows*cols, my_sort_functor());
  thrust::host_vector<it> h_col_idx = col_idx;
  thrust::copy_n(h_col_idx.begin(), rows*cols, std::ostream_iterator<it>(std::cout, ","));
  std::cout << std::endl;
}
$ nvcc -o t114 t114.cu
$ ./t114
1,3,2,0,0,2,1,3,3,1,0,2,
$

Here's another approach. This methodology does not reorder the data, but simply produces the ordering result. We only need one index and compute the row index on the fly, and the column index is only computed after the ordered result is determined.

$ cat t114.cu
#include <thrust/sort.h>
#include <thrust/device_vector.h>
#include <thrust/host_vector.h>
#include <thrust/copy.h>
#include <iostream>

using namespace thrust::placeholders;
typedef float mt;
typedef int it;

struct my_sort_functor
{
  mt *d;
  it cols;
  my_sort_functor(mt *_d, it _cols) : d(_d), cols(_cols) {};
  __host__ __device__
  bool operator()(const it &t1, const it &t2){
    it row1 = t1/cols;
    it row2 = t2/cols;
    if  (row1 < row2) return true;
    if  (row1 > row2) return false;
    if  (d[t1] < d[t2]) return true;
    return false;
  }
};

int main(){

  mt A[] = { 3.4257, -1.2345,  0.6232, -0.1354,
            -1.6639,  0.1557, -0.1763,  1.0257,
             0.6863,  0.0992,  1.4487,  0.0157};
  const int rows = 3;
  const int cols = 4;
  thrust::device_vector<mt> d_A(A, A+rows*cols);
  thrust::device_vector<it> idx(d_A.size());
  thrust::sequence(idx.begin(), idx.end());
  thrust::sort(idx.begin(), idx.end(), my_sort_functor(thrust::raw_pointer_cast(d_A.data()), cols));
  thrust::transform(idx.begin(), idx.end(), idx.begin(), _1%cols);
  thrust::host_vector<it> h_idx = idx;
  thrust::copy_n(h_idx.begin(), rows*cols, std::ostream_iterator<it>(std::cout, ","));
  std::cout << std::endl;
}
$ nvcc -o t114 t114.cu
$ ./t114
1,3,2,0,0,2,1,3,3,1,0,2,
$

I would normally lean towards the second approach as being more performant since it is moving only 1/3 the data. However it is also doing two integers divisions, so it may be a wash. We could precompute the row index in the second approach, at the expense of moving twice as much data during the sort, but avoiding the on-the-fly division ops.

If array dimensions were small enough (say row and column dimensions each less than 65536), we could even precompute the row index and column index, and encode row in the upper bits of the index, column in the lower bits, so as to only be moving a single index quantity. Even better:

$ cat t114.cu
#include <thrust/sort.h>
#include <thrust/device_vector.h>
#include <thrust/host_vector.h>
#include <thrust/sequence.h>
#include <thrust/transform.h>
#include <iostream>

using namespace thrust::placeholders;
typedef float mt;
typedef unsigned it;

struct my_sort_functor
{
  mt *d;
  it cols;
  my_sort_functor(mt *_d, it _cols) : d(_d), cols(_cols) {};
  __host__ __device__
  bool operator()(const it &t1, const it &t2){
    it row1 = t1>>16;
    it row2 = t2>>16;
    if  (row1 < row2) return true;
    if  (row1 > row2) return false;
    it col1 = t1&65535;
    it col2 = t2&65535;
    it i1 = row1*cols+col1;
    it i2 = row2*cols+col2;
    if  (d[i1] < d[i2]) return true;
    return false;
  }
};

struct my_transform_functor
{
  it cols;
  my_transform_functor(it _cols) : cols(_cols) {};
  __host__ __device__
    it operator()(const it &t1){
      it row = t1/cols;
      it col = t1 - row*cols;
      return (row << 16) + col;
    }
};

int main(){

  mt A[] = { 3.4257, -1.2345,  0.6232, -0.1354,
            -1.6639,  0.1557, -0.1763,  1.0257,
             0.6863,  0.0992,  1.4487,  0.0157};
  // assume rows and cols are each less than 65536
  const int rows = 3;
  const int cols = 4;
  thrust::device_vector<mt> d_A(A, A+rows*cols);
  thrust::device_vector<it> idx(d_A.size());
  thrust::sequence(idx.begin(), idx.end());
  thrust::transform(idx.begin(), idx.end(), idx.begin(), my_transform_functor(cols));
  thrust::sort(idx.begin(), idx.end(), my_sort_functor(thrust::raw_pointer_cast(d_A.data()), cols));
  thrust::host_vector<it> h_idx = idx;
  for (int i = 0; i < rows*cols; i++) std::cout << (h_idx[i]&65535) << ",";
  std::cout << std::endl;
}
$ nvcc -o t114 t114.cu
$ ./t114
1,3,2,0,0,2,1,3,3,1,0,2,
$

Only moving one quantity, and no division on the fly.