I want to test cusparseScsr2csc, a cuSPARSE function that converts a matrix from CSR format to CSC format (which is equivalent to transposing a CSR matrix), so I wrote the code below to test it.
the wrapper:
CSR<float> Csr2csc(int m, int n, int nnz, float *values, int *row_ptrs, int *col_inds, cusparseStatus_t &st) {
    cusparseHandle_t handle;
    cusparseAction_t copyValues = CUSPARSE_ACTION_NUMERIC;
    cusparseIndexBase_t idxBase = CUSPARSE_INDEX_BASE_ZERO;
    cusparseCreate(&handle);
    // allocate device memory and copy the input CSR arrays to the GPU
    float *csr_values;
    int *csr_row_ptrs;
    int *csr_col_inds;
    float *csc_values;
    int *csc_col_ptrs;
    int *csc_row_inds;
    cudaMalloc(&csr_values, sizeof(float) * nnz);
    cudaMalloc(&csr_row_ptrs, sizeof(int) * (m + 1));
    cudaMalloc(&csr_col_inds, sizeof(int) * nnz);
    cudaMalloc(&csc_values, sizeof(float) * nnz);
    cudaMalloc(&csc_col_ptrs, sizeof(int) * (n + 1));
    cudaMalloc(&csc_row_inds, sizeof(int) * nnz);
    cudaMemcpy(csr_values, values, sizeof(float) * nnz, cudaMemcpyHostToDevice);
    cudaMemcpy(csr_row_ptrs, row_ptrs, sizeof(int) * (m + 1), cudaMemcpyHostToDevice);
    cudaMemcpy(csr_values, col_inds, sizeof(int) * nnz, cudaMemcpyHostToDevice);
    // call the cuSPARSE conversion routine
    st = cusparseScsr2csc(handle, m, n, nnz, csr_values, csr_row_ptrs,
                          csr_col_inds, csc_values, csc_row_inds,
                          csc_col_ptrs, copyValues, idxBase);
    // copy the results from the device (GPU) back to the host (CPU);
    // the CSC arrays of A are the CSR arrays of A transposed
    vector<float> res_values;
    vector<int> res_row_ptrs, res_col_inds;
    res_row_ptrs.resize(n + 1);
    res_col_inds.resize(nnz);
    res_values.resize(nnz);
    cudaMemcpy(res_row_ptrs.data(), csc_col_ptrs, (n + 1) * sizeof(int), cudaMemcpyDeviceToHost);
    cudaMemcpy(res_values.data(), csc_values, nnz * sizeof(float), cudaMemcpyDeviceToHost);
    cudaMemcpy(res_col_inds.data(), csc_row_inds, nnz * sizeof(int), cudaMemcpyDeviceToHost);
    // return the answer
    return CSR<float>(res_values, res_row_ptrs, res_col_inds);
}
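Note that none of the CUDA runtime calls in the wrapper are checked. A minimal checking macro along these lines (a sketch only; the CUDA_CHECK name is my own, not a CUDA API) would make a failed cudaMalloc or cudaMemcpy visible immediately:

// Minimal error checking for the CUDA runtime calls above.
// (A sketch; CUDA_CHECK is an illustrative name, not part of CUDA.)
#include <cstdio>
#include <cstdlib>

#define CUDA_CHECK(call)                                              \
    do {                                                              \
        cudaError_t err_ = (call);                                    \
        if (err_ != cudaSuccess) {                                    \
            fprintf(stderr, "CUDA error \"%s\" at %s:%d\n",           \
                    cudaGetErrorString(err_), __FILE__, __LINE__);    \
            exit(EXIT_FAILURE);                                       \
        }                                                             \
    } while (0)

// usage: CUDA_CHECK(cudaMemcpy(csr_values, values, sizeof(float) * nnz, cudaMemcpyHostToDevice));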
this is the CSR class:
template<class T>
struct CSR {
    vector<T> values;
    vector<int> row_ptrs;
    vector<int> col_inds;
    CSR(vector<T> &a, vector<int> &b, vector<int> &c) : values(a), row_ptrs(b), col_inds(c) {}
    void out() {
        cout << "values = ";
        for (auto &t : values) cout << t << ' ';
        cout << "\nrow_ptrs = ";
        for (auto &t : row_ptrs) cout << t << ' ';
        cout << "\ncol_inds = ";
        for (auto &t : col_inds) cout << t << ' ';
        cout << endl;
    }
};
and this is the code in main:
int m = 4, n = 6, nnz = 8;
float values[] = { 10, 20, 30, 4, 50, 60, 70, 80 };
int row_ptrs[] = { 0, 2, 4, 7, 8 };
int col_inds[] = { 0, 1, 1, 3, 2, 3, 4, 5 };
cusparseStatus_t st;
auto res = Csr2csc(m, n, nnz, values, row_ptrs, col_inds, st);
res.out();
the CSR format in main is derived from the matrix below, which I want to transpose (A <=> values, IA <=> row_ptrs, JA <=> col_inds):

A =
10 20  0  0  0  0
 0 30  0  4  0  0
 0  0 50 60 70  0
 0  0  0  0  0 80
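To know what the correct answer should look like, the conversion can also be done on the host. Below is a simple reference implementation I sketched for checking (it is not part of the wrapper under test); it builds the CSC arrays of A, i.e. the CSR arrays of its transpose, with a counting sort over columns:

// Host-side reference CSR -> CSC conversion (my own sketch for checking,
// not part of the wrapper under test).
CSR<float> Csr2cscHost(int m, int n, int nnz,
                       const float *values, const int *row_ptrs, const int *col_inds) {
    vector<float> v(nnz);
    vector<int> cp(n + 1, 0), ri(nnz);
    for (int k = 0; k < nnz; ++k) cp[col_inds[k] + 1]++;  // count entries per column
    for (int j = 0; j < n; ++j) cp[j + 1] += cp[j];       // prefix sum -> col_ptrs
    vector<int> next(cp.begin(), cp.end() - 1);           // next free slot per column
    for (int i = 0; i < m; ++i)
        for (int k = row_ptrs[i]; k < row_ptrs[i + 1]; ++k) {
            int dst = next[col_inds[k]]++;
            v[dst] = values[k];
            ri[dst] = i;  // the CSC row index is the original row
        }
    return CSR<float>(v, cp, ri);
}
// usage: Csr2cscHost(m, n, nnz, values, row_ptrs, col_inds).out();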
The result I got from the GPU, however, was definitely wrong (nothing like the transpose of this matrix).
My video card is a GeForce MX150, and I use Visual Studio 15 2017 with CUDA 9.2.
Full code:
#include "cuda_runtime.h"
#include "device_launch_parameters.h"
#include <cuda_runtime.h>
#include <cusparse.h>
#include <iostream>
#include <vector>
#include <complex>
using namespace std;
template<class T>
struct CSR {
vector<T> values;
vector<int> row_ptrs;
vector<int> col_inds;
CSR(vector<T> &a, vector<int> &b, vector<int> &c) :values(a), row_ptrs(b), col_inds(c) {}
void out() {
cout << "valuse = ";
for (auto &t : values) cout << t << ' ';
cout << "\nrow_ptrs = ";
for (auto &t : row_ptrs) cout << t << ' ';
cout << "\ncol_inds = ";
for (auto &t : col_inds) cout << t << ' ';
cout << endl;
}
};
CSR<float> Csr2csc(int m, int n, int nnz, float *values, int *row_ptrs, int *col_inds, cusparseStatus_t &st) {
cusparseHandle_t handle;
cusparseAction_t copyValues = CUSPARSE_ACTION_NUMERIC;
cusparseIndexBase_t idxBase = CUSPARSE_INDEX_BASE_ZERO;
cusparseCreate(&handle);
float *csr_values;
int *csr_row_ptrs;
int *csr_col_inds;
float *csc_values;
int *csc_col_ptrs;
int *csc_row_inds;
cudaMalloc(&csr_values, sizeof(float) * nnz);
cudaMalloc(&csr_row_ptrs, sizeof(int) * (m + 1));
cudaMalloc(&csr_col_inds, sizeof(int) * (nnz));
cudaMalloc(&csc_values, sizeof(float) * nnz);
cudaMalloc(&csc_col_ptrs, sizeof(int) * (n + 1));
cudaMalloc(&csc_row_inds, sizeof(int) * (nnz));
cudaMemcpy(csr_values, values, sizeof(float) * nnz, cudaMemcpyHostToDevice);
cudaMemcpy(csr_row_ptrs, row_ptrs, sizeof(int) * (m + 1), cudaMemcpyHostToDevice);
cudaMemcpy(csr_values, col_inds, sizeof(int) * nnz, cudaMemcpyHostToDevice);
st = cusparseScsr2csc(handle, m, n, nnz, csr_values, csr_row_ptrs,
csr_col_inds, csc_values, csc_row_inds,
csc_col_ptrs, copyValues, idxBase);
vector<float> res_values;
vector<int> res_row_ptrs, res_col_inds;
res_row_ptrs.resize(n + 1);
res_col_inds.resize(nnz);
res_values.resize(nnz);
cudaMemcpy(res_row_ptrs.data(), csc_col_ptrs, (n + 1) * sizeof(int), cudaMemcpyDeviceToHost);
cudaMemcpy(res_values.data(), csc_values, nnz * sizeof(float), cudaMemcpyDeviceToHost);
cudaMemcpy(res_col_inds.data(), csc_row_inds, nnz * sizeof(int), cudaMemcpyDeviceToHost);
return CSR<float>(res_values, res_row_ptrs, res_col_inds);
}
int main()
{
int m = 4, n = 6, nnz = 8;
float values[] = { 10,20 ,30 ,4, 50, 60 ,70 ,80 };
int row_ptrs[] = { 0, 2 ,4 ,7, 8 };
int col_inds[] = { 0 ,1, 1, 3 ,2, 3, 4, 5 };
cusparseStatus_t st;
auto res = Csr2csc(m, n, nnz, values, row_ptrs, col_inds, st);
if (st == CUSPARSE_STATUS_SUCCESS) {
cout << "success" << endl;
res.out();
}
return 0;
}
This is the documentation page; the function cusparseScsr2csc is described in chapter 9.
I also found a note there saying that the function executes asynchronously. Maybe this is the problem, but I still don't know how to deal with it.
I tried the solution mentioned by paleonix (adding cudaDeviceSynchronize() right after the cusparseScsr2csc(...) call), but I still got the exact same wrong answer.
The main problem is here:
cudaMemcpy(csr_values, col_inds, sizeof(int) * nnz, cudaMemcpyHostToDevice);
That should be:
cudaMemcpy(csr_col_inds, col_inds, sizeof(int) * nnz, cudaMemcpyHostToDevice);
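With that one-line fix, the output for your example matrix should be the following (worked out by hand, using the fact that the CSC arrays of A are exactly the CSR arrays of A transposed):

values = 10 20 30 50 4 60 70 80
row_ptrs = 0 1 3 4 6 7 8
col_inds = 0 0 1 2 1 2 2 3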
A few other notes:
The function cusparseScsr2csc is no longer available in recent versions of CUDA; it was deprecated and later removed. I assume this might be one reason you are using CUDA 9.2. One possible replacement would be cusparseCsr2cscEx2(); a rough sketch of its usage follows.
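Assuming the same device buffers as in your wrapper, the Ex2 flow (available since CUDA 10.1) needs an explicit workspace buffer sized by a companion call. This is a sketch of the documented pattern, not code tested on your setup:

// Sketch of the replacement API, reusing the wrapper's device pointers.
size_t bufferSize = 0;
cusparseCsr2cscEx2_bufferSize(handle, m, n, nnz,
                              csr_values, csr_row_ptrs, csr_col_inds,
                              csc_values, csc_col_ptrs, csc_row_inds,
                              CUDA_R_32F, CUSPARSE_ACTION_NUMERIC,
                              CUSPARSE_INDEX_BASE_ZERO,
                              CUSPARSE_CSR2CSC_ALG1, &bufferSize);
void *buffer = nullptr;
cudaMalloc(&buffer, bufferSize);  // temporary workspace required by the API
cusparseStatus_t st = cusparseCsr2cscEx2(handle, m, n, nnz,
                                         csr_values, csr_row_ptrs, csr_col_inds,
                                         csc_values, csc_col_ptrs, csc_row_inds,
                                         CUDA_R_32F, CUSPARSE_ACTION_NUMERIC,
                                         CUSPARSE_INDEX_BASE_ZERO,
                                         CUSPARSE_CSR2CSC_ALG1, buffer);
cudaFree(buffer);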
There is no particular need for an additional cudaDeviceSynchronize() here. The cudaMemcpy calls after the cusparse call serve the same purpose: cudaMemcpy blocks the host until previously issued work on the device has completed.