How to use cusparseXcoo2csr in cuSparse to convert from coo to csc?

The cuSPARSE documentation states that the function cusparseXcoo2csr

can also be used to convert the array containing the uncompressed column indices (corresponding to COO format) into an array of column pointers (corresponding to CSC format)

However, I could not get this to work. Please see the minimal code below:

CMakeLists.txt

# Build script for the single-source cuSPARSE sample.
cmake_minimum_required(VERSION 3.11)
project(sample)

# Locate the CUDA toolkit; this provides CUDA_INCLUDE_DIRS, CUDA_LIBRARIES
# and CUDA_cusparse_LIBRARY used below.
find_package(CUDA REQUIRED)

# The executable is named after the project and built from host-only C++.
add_executable(${PROJECT_NAME} main.cpp)
target_compile_features(${PROJECT_NAME}
  PUBLIC cxx_std_14
)

# CUDA headers are added as SYSTEM includes.
target_include_directories(${PROJECT_NAME}
  SYSTEM PUBLIC ${CUDA_INCLUDE_DIRS}
)

# Link the CUDA runtime libraries and cuSPARSE.
target_link_libraries(${PROJECT_NAME}
  ${CUDA_LIBRARIES}
  ${CUDA_cusparse_LIBRARY}
)

main.cpp

#include <cstdlib>
#include <iostream>
#include <vector>

#include <cuda_runtime_api.h>
#include <cusparse_v2.h>

int main(){
  // using the matrix as shown in https://docs.nvidia.com/cuda/cusparse/index.html#coo-format
  // 1 4 0 0 0
  // 0 2 3 0 0
  // 5 0 0 7 8
  // 0 0 9 0 6

  std::vector<int> row;
  std::vector<int> col;
  std::vector<double> val;

  row.emplace_back(0);
  row.emplace_back(0);
  row.emplace_back(1);
  row.emplace_back(1);
  row.emplace_back(2);
  row.emplace_back(2);
  row.emplace_back(2);
  row.emplace_back(3);
  row.emplace_back(3);

  col.emplace_back(0);
  col.emplace_back(1);
  col.emplace_back(1);
  col.emplace_back(2);
  col.emplace_back(0);
  col.emplace_back(3);
  col.emplace_back(4);
  col.emplace_back(2);
  col.emplace_back(4);

  val.emplace_back(1);
  val.emplace_back(4);
  val.emplace_back(2);
  val.emplace_back(3);
  val.emplace_back(5);
  val.emplace_back(7);
  val.emplace_back(8);
  val.emplace_back(9);
  val.emplace_back(6);

  int *d_row;
  int *d_col;
  double *d_val;

  cudaMalloc(reinterpret_cast<void **>(&d_row), row.size() * sizeof(int));
  cudaMalloc(reinterpret_cast<void **>(&d_col), col.size() * sizeof(int));
  cudaMalloc(reinterpret_cast<void **>(&d_val), val.size() * sizeof(double));

  cudaMemcpy(d_row, row.data(), sizeof(int) * row.size(), cudaMemcpyHostToDevice);
  cudaMemcpy(d_col, col.data(), sizeof(int) * col.size(), cudaMemcpyHostToDevice);
  cudaMemcpy(d_val, val.data(), sizeof(double) * val.size(), cudaMemcpyHostToDevice);

  cusparseHandle_t handle;
  cusparseCreate(&handle);

  cusparseMatDescr_t descr;
  cusparseCreateMatDescr(&descr);
  cusparseSetMatType(descr, CUSPARSE_MATRIX_TYPE_GENERAL);
  cusparseSetMatIndexBase(descr, CUSPARSE_INDEX_BASE_ZERO);

  cusparseMatDescr_t descr_out;
  cusparseCreateMatDescr(&descr_out);
  cusparseSetMatType(descr_out, CUSPARSE_MATRIX_TYPE_GENERAL);
  cusparseSetMatIndexBase(descr_out, CUSPARSE_INDEX_BASE_ZERO);

  int *d_row_csr;

  cudaMalloc(reinterpret_cast<void **>(&d_row_csr), (4 + 1) * sizeof(int));

  cusparseXcoo2csr(handle, d_row, 9, 4, d_row_csr, CUSPARSE_INDEX_BASE_ZERO);

  std::vector<int> row_csr(4 + 1);

  cudaMemcpy(row_csr.data(), d_row_csr, sizeof(int) * (4 + 1), cudaMemcpyDeviceToHost);

  std::cout << "row" << std::endl;
  for (int i : row_csr){
    std::cout << i << std::endl; // prints 0 2 4 7 9 as expected
  }

  // however when I try to compress the column the same way...
  int *d_col_csc;

  cudaMalloc(reinterpret_cast<void **>(&d_col_csc), (5 + 1) * sizeof(int));

  cusparseXcoo2csr(handle, d_col, 9, 5, d_col_csc, CUSPARSE_INDEX_BASE_ZERO);

  std::vector<int> col_csc(5 + 1);

  cudaMemcpy(col_csc.data(), d_col_csc, sizeof(int) * (5 + 1), cudaMemcpyDeviceToHost);

  std::cout << "col" << std::endl;
  for (int i : col_csc){
    std::cout << i << std::endl; // prints 0 5 3 8 6 9, shouldn't it be 0 2 4 6 7 9?
  }

  return 0;
}

As you can see, the conversion from COO to CSC does not produce the expected result. I temporarily worked around this problem by calling cusparseXcoo2csr to do the COO-to-CSR conversion and then calling cusparseDcsr2csc to convert the intermediate CSR result to CSC. That is extra computation, so I would like to know how to use cusparseXcoo2csr to convert COO to CSC directly, as indicated in the documentation.

Solution

Row compression of COO row coordinates to row pointers and column compression of COO column coordinates to column pointers are fundamentally the same operation (basically just a prefix sum by key). Although it is not explicitly documented, cusparseXcoo2csr requires the input coordinate data to be sorted.

In your example, the CSR conversion works because the coordinates you have are sorted in row order, and the CSC conversion fails because the column coordinates are not sorted in column order. If you reorder the inputs so that the column indices are sorted (permuting the row indices and values the same way), the conversion will work.