Search code examples
c++cudacusparse

How to get the diagonal of a sparse matrix in cuSparse?


I have a sparse matrix in cuSparse and I want to extract the diagonal. I can't seem to find a way to do it other than converting it back to CPU memory into Eigen SparseMatrix and use the .diagonal provided by Eigen to do it, and then copy the result back to GPU. Obviously this is pretty inefficient so I am wondering if there's a way to do it directly in the GPU. Please see below code for reference:

void CuSparseTransposeToEigenSparse(
    const int *d_row,
    const int *d_col,
    const double *d_val,
    const int num_non0,
    const int mat_row,
    const int mat_col,
    Eigen::SparseMatrix<double> &mat){
  std::vector<int> outer(mat_col + 1);
  std::vector<int> inner(num_non0);
  std::vector<double> value(num_non0);

  cudaMemcpy(
      outer.data(), d_row, sizeof(int) * (mat_col + 1), cudaMemcpyDeviceToHost);

  cudaMemcpy(
      inner.data(), d_col, sizeof(int) * num_non0, cudaMemcpyDeviceToHost);

  cudaMemcpy(
      value.data(), d_val, sizeof(double) * num_non0, cudaMemcpyDeviceToHost);

  Eigen::Map<Eigen::SparseMatrix<double>> mat_map(
      mat_row, mat_col, num_non0, outer.data(), inner.data(), value.data());

  mat = mat_map.eval();
}

int main(){

  int *d_A_row;
  int *d_A_col;
  double *d_A_val;
  int A_len;
  int num_A_non0;
  double *d_A_diag;

  // these values are filled with some computation

  // current solution
  Eigen::SparseMatrix<double> A;

  CuSparseTransposeToEigenSparse(
      d_A_row, d_A_col, d_A_val, num_A_non0, A_len, A_len, A);

  Eigen::VectorXd A_diag = A.diagonal();

  cudaMemcpy(d_A_diag, A_diag.data(), sizeof(double) * A_len, cudaMemcpyHostToDevice);

  // is there a way to fill in d_A_diag without copying back to CPU?

  return 0;
}


Solution

  • Just in case anyone is interested. I figured it out for the case of a CSR matrix. The custom kernel to do it looks like this:

    __global__ static void GetDiagFromSparseMat(const int *A_row,
                                                const int *A_col,
                                                const double *A_val,
                                                const int A_len,
                                                double *A_diag){
      const int x = blockIdx.x * blockDim.x + threadIdx.x;
    
      if (x < A_len){
        const int num_non0_row = A_row[x + 1] - A_row[x];
    
        A_diag[x] = 0.0;
    
        for (int i = 0; i < num_non0_row; i++){
          if (A_col[i + A_row[x]] == x){
            A_diag[x] = A_val[i + A_row[x]];
            break;
          }
        }
      }
    }