Tags: c++, cuda, sparse-matrix, linear-algebra, cusolver

cuSOLVER - Device version of cusolverSpScsrlsvqr is much slower than host version


I have a sparse tridiagonal NxN matrix A built by some rule and want to solve the system Ax=b. For this I'm using cusolverSpScsrlsvqr() from the cuSolverSp module. Is it normal for the device version to be many times slower than cusolverSpScsrlsvqrHost() for large N? E.g. for N=2^14 the times are 174.1 ms for the device version and 3.5 ms for the host version. I'm on an RTX 2060.

Code:

#include <cuda_runtime.h>
#include <device_launch_parameters.h>
#include <cusolverSp.h>
#include <cusparse_v2.h>

#include <stdio.h>
#include <iostream>
#include <iomanip>
#include <chrono> 


using namespace std;

void checkCudaCusolverStatus(cusolverStatus_t status, char const* operation) {
    char const *str = "UNKNOWN STATUS";
    switch (status) {
    case CUSOLVER_STATUS_SUCCESS:
        str = "CUSOLVER_STATUS_SUCCESS";
        break;
    case CUSOLVER_STATUS_NOT_INITIALIZED:
        str = "CUSOLVER_STATUS_NOT_INITIALIZED";
        break;
    case CUSOLVER_STATUS_ALLOC_FAILED:
        str = "CUSOLVER_STATUS_ALLOC_FAILED";
        break;
    case CUSOLVER_STATUS_INVALID_VALUE:
        str = "CUSOLVER_STATUS_INVALID_VALUE";
        break;
    case CUSOLVER_STATUS_ARCH_MISMATCH:
        str = "CUSOLVER_STATUS_ARCH_MISMATCH";
        break;
    case CUSOLVER_STATUS_MAPPING_ERROR:
        str = "CUSOLVER_STATUS_MAPPING_ERROR";
        break;
    case CUSOLVER_STATUS_EXECUTION_FAILED:
        str = "CUSOLVER_STATUS_EXECUTION_FAILED";
        break;
    case CUSOLVER_STATUS_INTERNAL_ERROR:
        str = "CUSOLVER_STATUS_INTERNAL_ERROR";
        break;
    case CUSOLVER_STATUS_MATRIX_TYPE_NOT_SUPPORTED:
        str = "CUSOLVER_STATUS_MATRIX_TYPE_NOT_SUPPORTED";
        break;
    case CUSOLVER_STATUS_ZERO_PIVOT:
        str = "CUSOLVER_STATUS_ZERO_PIVOT";
        break;
    }
    cout << left << setw(30) << operation << " " << str << endl;
}

// Fills the CSR arrays (values, row pointers, column indices) of the tridiagonal matrix A
// and the right-hand side b on the device; thread 0 additionally writes aRowPtrs[0] and the
// trailing entries for the last row.
__global__ void fillAB(float *aValues, int *aRowPtrs, int *aColIdxs, float *b, int const n) {
    int i = blockDim.x * blockIdx.x + threadIdx.x;
    if (i >= n) return;
    if (i == 0) {
        float xn = 10 * (n + 1);
        aValues[n * 3] = xn;
        aRowPtrs[0] = 0;
        aRowPtrs[n + 1] = n * 3 + 1;
        aColIdxs[n * 3] = n;
        b[n] = xn * 2;
    }
    float xi = 10 * (i + 1);
    aValues[i * 3 + 0] = xi;
    aValues[i * 3 + 1] = xi + 5;
    aValues[i * 3 + 2] = xi - 5;
    aColIdxs[i * 3 + 0] = i;
    aColIdxs[i * 3 + 1] = i + 1;
    aColIdxs[i * 3 + 2] = i;
    aRowPtrs[i + 1] = 2 + (i * 3);
    b[i] = xi * 2;
}

int main() {
    int const n = (int)pow(2, 14);  // <<<<<<<<<<<<<<<<<<<<<<<<<<<<< N HERE
    int const valCount = n * 3 - 2;
    float *const aValues = new float[valCount];
    int *const aRowPtrs = new int[n + 1];
    int *const aColIdxs = new int[valCount];
    float *const b = new float[n];
    float *const x = new float[n];

    float *dev_aValues;
    int *dev_aRowPtrs;
    int *dev_aColIdxs;
    float *dev_b;
    float *dev_x;
    int aValuesSize = sizeof(float) * valCount;
    int aRowPtrsSize = sizeof(int) * (n + 1);
    int aColIdxsSize = sizeof(int) * valCount;
    int bSize = sizeof(float) * n;
    int xSize = sizeof(float) * n;
    cudaMalloc((void**)&dev_aValues, aValuesSize);
    cudaMalloc((void**)&dev_aRowPtrs, aRowPtrsSize);
    cudaMalloc((void**)&dev_aColIdxs, aColIdxsSize);
    cudaMalloc((void**)&dev_b, bSize);
    cudaMalloc((void**)&dev_x, xSize);
    fillAB<<<1024, (int)ceil(n / 1024.f)>>>(dev_aValues, dev_aRowPtrs, dev_aColIdxs, dev_b, n - 1);
    cudaMemcpy(aValues, dev_aValues, aValuesSize, cudaMemcpyDeviceToHost);
    cudaMemcpy(aRowPtrs, dev_aRowPtrs, aRowPtrsSize, cudaMemcpyDeviceToHost);
    cudaMemcpy(aColIdxs, dev_aColIdxs, aColIdxsSize, cudaMemcpyDeviceToHost);
    cudaMemcpy(b, dev_b, bSize, cudaMemcpyDeviceToHost);

    cusolverSpHandle_t handle;
    checkCudaCusolverStatus(cusolverSpCreate(&handle), "cusolverSpCreate");
    cusparseMatDescr_t aDescr;
    cusparseCreateMatDescr(&aDescr);
    cusparseSetMatIndexBase(aDescr, CUSPARSE_INDEX_BASE_ZERO);
    cusparseSetMatType(aDescr, CUSPARSE_MATRIX_TYPE_GENERAL);
    int singularity;
    cusolverStatus_t status;
    cusolverSpScsrlsvqr(handle, n, valCount, aDescr, dev_aValues, dev_aRowPtrs, dev_aColIdxs, dev_b, 0.1f, 0, dev_x, &singularity);
    cudaDeviceSynchronize();
    auto t0 = chrono::high_resolution_clock::now();
    for (int i = 0; i < 10; ++i)
        ////////////////////// CUSOLVER HERE //////////////////////
        status = cusolverSpScsrlsvqr(handle, n, valCount, aDescr, dev_aValues, dev_aRowPtrs, dev_aColIdxs, dev_b, 0.1f, 0, dev_x, &singularity);
        //status = cusolverSpScsrlsvqrHost(handle, n, valCount, aDescr, aValues, aRowPtrs, aColIdxs, b, 0.1f, 0, x, &singularity);
        ///////////////////////////////////////////////////////////
    cudaDeviceSynchronize();
    auto t1 = chrono::high_resolution_clock::now();
    checkCudaCusolverStatus(status, "cusolverSpScsrlsvqr");
    checkCudaCusolverStatus(cusolverSpDestroy(handle), "cusolverSpDestroy");
    cout << "System solved: " << setw(20) << fixed << right << setprecision(3) << (t1 - t0).count() / 10.0 / 1000000 << " ms" << endl;

    cudaMemcpy(x, dev_x, xSize, cudaMemcpyDeviceToHost);
    /*for (int i = 0; i < n; ++i) {
        cout << " " << x[i];
    }*/
    cudaFree(dev_aValues);
    cudaFree(dev_aRowPtrs);
    cudaFree(dev_aColIdxs);
    cudaFree(dev_b);
    cudaFree(dev_x);
    delete[] aValues;
    delete[] aRowPtrs;
    delete[] aColIdxs;
    delete[] b;
    delete[] x;
    cudaDeviceReset();
    return 0;
}

Solution

  • My guess is that the issue here is the tridiagonal structure of the matrix. I suspect that may eliminate certain kinds of parallelism that would otherwise benefit the GPU cusolver routine. I don't really have any justification for this statement beyond statements like this in the cusparse docs:

    For example, a tridiagonal matrix has no parallelism.

    I can't say exactly what that means, except that it suggests to me that a different approach may be warranted for a tridiagonal matrix. And cusparse does provide solvers specifically for the tridiagonal case.

    If we use one of those, we can get results that are faster than your host cusolver example on your test case. Here is an example:

    $ cat t48.cu
    #include <cuda_runtime.h>
    #include <device_launch_parameters.h>
    #include <cusolverSp.h>
    #include <cusparse_v2.h>
    
    #include <stdio.h>
    #include <iostream>
    #include <iomanip>
    #include <chrono>
    #include <cassert>
    #include <time.h>
    #include <sys/time.h>
    #define USECPSEC 1000000ULL
    
    unsigned long long dtime_usec(unsigned long long start){
    
      timeval tv;
      gettimeofday(&tv, 0);
      return ((tv.tv_sec*USECPSEC)+tv.tv_usec)-start;
    }
    #ifdef USE_DOUBLE
    #define START 3
    #define TOL 0.000001
    #define THR 0.00001
    typedef double mt;
    #else
    #define START 0
    #define TOL 0.01
    #define THR 0.1
    typedef float mt;
    #endif
    
    
    using namespace std;
    
    void checkCudaCusolverStatus(cusolverStatus_t status, char const* operation) {
        char const *str = "UNKNOWN STATUS";
        switch (status) {
        case CUSOLVER_STATUS_SUCCESS:
            str = "CUSOLVER_STATUS_SUCCESS";
            break;
        case CUSOLVER_STATUS_NOT_INITIALIZED:
            str = "CUSOLVER_STATUS_NOT_INITIALIZED";
            break;
        case CUSOLVER_STATUS_ALLOC_FAILED:
            str = "CUSOLVER_STATUS_ALLOC_FAILED";
            break;
        case CUSOLVER_STATUS_INVALID_VALUE:
            str = "CUSOLVER_STATUS_INVALID_VALUE";
            break;
        case CUSOLVER_STATUS_ARCH_MISMATCH:
            str = "CUSOLVER_STATUS_ARCH_MISMATCH";
            break;
        case CUSOLVER_STATUS_MAPPING_ERROR:
            str = "CUSOLVER_STATUS_MAPPING_ERROR";
            break;
        case CUSOLVER_STATUS_EXECUTION_FAILED:
            str = "CUSOLVER_STATUS_EXECUTION_FAILED";
            break;
        case CUSOLVER_STATUS_INTERNAL_ERROR:
            str = "CUSOLVER_STATUS_INTERNAL_ERROR";
            break;
        case CUSOLVER_STATUS_MATRIX_TYPE_NOT_SUPPORTED:
            str = "CUSOLVER_STATUS_MATRIX_TYPE_NOT_SUPPORTED";
            break;
        case CUSOLVER_STATUS_ZERO_PIVOT:
            str = "CUSOLVER_STATUS_ZERO_PIVOT";
            break;
        }
        cout << left << setw(30) << operation << " " << str << endl;
    }
    
    __global__ void fillAB(mt *aValues, int *aRowPtrs, int *aColIdxs, mt *b, int const n) {
        int i = blockDim.x * blockIdx.x + threadIdx.x;
        if (i >= n) return;
        if (i == 0) {
            mt xn = 10 * (n + 1);
            aValues[n * 3] = xn;
            aRowPtrs[0] = 0;
            aRowPtrs[n + 1] = n * 3 + 1;
            aColIdxs[n * 3] = n;
            b[n] = xn * 2;
        }
        mt xi = 10 * (i + 1);
        aValues[i * 3 + 0] = xi;
        aValues[i * 3 + 1] = xi + 5;
        aValues[i * 3 + 2] = xi - 5;
        aColIdxs[i * 3 + 0] = i;
        aColIdxs[i * 3 + 1] = i + 1;
        aColIdxs[i * 3 + 2] = i;
        aRowPtrs[i + 1] = 2 + (i * 3);
        b[i] = xi * 2;
    }
    // Unpacks the CSR values array into the three diagonals (dl, d, du) expected by gtsv2,
    // and copies b into b2 so the in-place gtsv2 solve doesn't overwrite the original RHS.
    __global__ void filld3(mt *d, mt *du, mt *dl, mt *aValues, mt *b, mt *b2, const int n){
            int i = blockDim.x*blockIdx.x+threadIdx.x;
            if ((i > 0) && (i < n-1)){
                    dl[i] = aValues[i*3 - 1];
                    d[i] = aValues[i*3];
                    du[i] = aValues[i*3+1];
            }
            if (i == 0){
                    dl[0] = 0;
                    d[0]  = aValues[0];
                    du[0] = aValues[1];}
            if (i == (n-1)){
                    dl[i] = aValues[i*3-1];
                    d[i]  = aValues[i*3];
                    du[i] = 0;}
            if (i < n) b2[i] = b[i];
    }
    
    int main() {
        int const n = (int)pow(2, 14);  // <<<<<<<<<<<<<<<<<<<<<<<<<<<<< N HERE
        int const valCount = n * 3 - 2;
        mt *const aValues = new mt[valCount];
        int *const aRowPtrs = new int[n + 1];
        int *const aColIdxs = new int[valCount];
        mt *const b = new mt[n];
        mt *const x = new mt[n];
        mt *const x2= new mt[n];
    
        mt *dev_aValues;
        int *dev_aRowPtrs;
        int *dev_aColIdxs;
        mt *dev_b;
        mt *dev_x;
        mt *dev_b2, *dev_d, *dev_dl, *dev_du;
        int aValuesSize = sizeof(mt) * valCount;
        int aRowPtrsSize = sizeof(int) * (n + 1);
        int aColIdxsSize = sizeof(int) * valCount;
        int bSize = sizeof(mt) * n;
        int xSize = sizeof(mt) * n;
        cudaMalloc((void**)&dev_aValues, aValuesSize);
        cudaMalloc((void**)&dev_aRowPtrs, aRowPtrsSize);
        cudaMalloc((void**)&dev_aColIdxs, aColIdxsSize);
        cudaMalloc((void**)&dev_b, bSize);
        cudaMalloc((void**)&dev_x, xSize);
        cudaMalloc((void**)&dev_b2, bSize);
        cudaMalloc(&dev_d,  n*sizeof(mt));
        cudaMalloc(&dev_dl, n*sizeof(mt));
        cudaMalloc(&dev_du, n*sizeof(mt));
        fillAB<<<1024, (int)ceil(n / 1024.f)>>>(dev_aValues, dev_aRowPtrs, dev_aColIdxs, dev_b, n - 1);
        filld3<<<(n+1023)/1024,1024>>>(dev_d, dev_du, dev_dl, dev_aValues, dev_b, dev_b2, n);
        cudaMemcpy(aValues, dev_aValues, aValuesSize, cudaMemcpyDeviceToHost);
        cudaMemcpy(aRowPtrs, dev_aRowPtrs, aRowPtrsSize, cudaMemcpyDeviceToHost);
        cudaMemcpy(aColIdxs, dev_aColIdxs, aColIdxsSize, cudaMemcpyDeviceToHost);
        cudaMemcpy(b, dev_b, bSize, cudaMemcpyDeviceToHost);
    
        cusolverSpHandle_t handle;
        checkCudaCusolverStatus(cusolverSpCreate(&handle), "cusolverSpCreate");
        cusparseMatDescr_t aDescr;
        cusparseCreateMatDescr(&aDescr);
        cusparseSetMatIndexBase(aDescr, CUSPARSE_INDEX_BASE_ZERO);
        cusparseSetMatType(aDescr, CUSPARSE_MATRIX_TYPE_GENERAL);
        int singularity;
        cusolverStatus_t status;
        unsigned long long dt = dtime_usec(0);
    #ifdef USE_DOUBLE
        cusolverSpDcsrlsvqr(handle, n, valCount, aDescr, dev_aValues, dev_aRowPtrs, dev_aColIdxs, dev_b, 0.1f, 0, dev_x, &singularity);
    #else
        cusolverSpScsrlsvqr(handle, n, valCount, aDescr, dev_aValues, dev_aRowPtrs, dev_aColIdxs, dev_b, 0.1f, 0, dev_x, &singularity);
    #endif
        cudaDeviceSynchronize();
        dt = dtime_usec(dt);
        std::cout << "time: " << dt/(float)USECPSEC << "s" << std::endl;
        auto t0 = chrono::high_resolution_clock::now();
        for (int i = 0; i < 10; ++i)
            ////////////////////// CUSOLVER HERE //////////////////////
    #ifdef USE_DEVICE
    #ifdef USE_DOUBLE
            status = cusolverSpDcsrlsvqr(handle, n, valCount, aDescr, dev_aValues, dev_aRowPtrs, dev_aColIdxs, dev_b, 0.1f, 0, dev_x, &singularity);
    #else
            status = cusolverSpScsrlsvqr(handle, n, valCount, aDescr, dev_aValues, dev_aRowPtrs, dev_aColIdxs, dev_b, 0.1f, 0, dev_x, &singularity);
    #endif
    #else
    #ifdef USE_DOUBLE
            status = cusolverSpDcsrlsvqrHost(handle, n, valCount, aDescr, aValues, aRowPtrs, aColIdxs, b, 0.1f, 0, x, &singularity);
    #else
            status = cusolverSpScsrlsvqrHost(handle, n, valCount, aDescr, aValues, aRowPtrs, aColIdxs, b, 0.1f, 0, x, &singularity);
    #endif
    #endif
        ///////////////////////////////////////////////////////////
        cudaDeviceSynchronize();
        auto t1 = chrono::high_resolution_clock::now();
        checkCudaCusolverStatus(status, "cusolverSpScsrlsvqr");
        checkCudaCusolverStatus(cusolverSpDestroy(handle), "cusolverSpDestroy");
        cout << "System solved: " << setw(20) << fixed << right << setprecision(6) << (t1 - t0).count() / 10.0 / 1000000 << " ms" << endl;
    
        cudaMemcpy(x, dev_x, xSize, cudaMemcpyDeviceToHost);
        /*for (int i = 0; i < n; ++i) {
            cout << " " << x[i];
        }*/
        cusparseHandle_t csphandle;
        cusparseStatus_t  cstat = cusparseCreate(&csphandle);
        assert(cstat == CUSPARSE_STATUS_SUCCESS);
        size_t bufferSize;
    #ifdef USE_DOUBLE
        cstat = cusparseDgtsv2_nopivot_bufferSizeExt(csphandle, n, 1, dev_dl, dev_d, dev_du, dev_b2, n, &bufferSize);
    #else
        cstat = cusparseSgtsv2_nopivot_bufferSizeExt(csphandle, n, 1, dev_dl, dev_d, dev_du, dev_b2, n, &bufferSize);
    #endif
        assert(cstat == CUSPARSE_STATUS_SUCCESS);
        unsigned char *dev_buffer;
        cudaMalloc(&dev_buffer, bufferSize);
        dt = dtime_usec(0);
    #ifdef USE_DOUBLE
        cstat = cusparseDgtsv2_nopivot(csphandle, n, 1, dev_dl, dev_d, dev_du, dev_b2, n, (void *)dev_buffer);
    #else
        cstat = cusparseSgtsv2_nopivot(csphandle, n, 1, dev_dl, dev_d, dev_du, dev_b2, n, (void *)dev_buffer);
    #endif
        if(cstat != CUSPARSE_STATUS_SUCCESS) {std::cout << "cusparse solve error: " << (int)cstat  << std::endl;}
        cudaDeviceSynchronize();
        dt = dtime_usec(dt);
        std::cout << "cusparse time: " << (dt*1000.f)/(float)USECPSEC << "ms" << std::endl;
        std::cout << cudaGetErrorString(cudaGetLastError()) << std::endl;
        cudaMemcpy(x2, dev_b2, xSize, cudaMemcpyDeviceToHost);
        for (int i = START; i < n; i++) if ((x[i] > THR) && (fabs((x[i] - x2[i])/x[i]) > TOL)) {std::cout << "mismatch at: " << i << " was: " << x2[i] << " should be: " << x[i] << std::endl; return 0;}
    
        for (int i = 0; i < 40; i++)
                std::cout << x2[i] << "    " << x[i] <<  std::endl;
        cudaFree(dev_aValues);
        cudaFree(dev_aRowPtrs);
        cudaFree(dev_aColIdxs);
        cudaFree(dev_b);
        cudaFree(dev_x);
        delete[] aValues;
        delete[] aRowPtrs;
        delete[] aColIdxs;
        delete[] b;
        delete[] x;
        cudaDeviceReset();
        return 0;
    }
    $ nvcc -o t48 t48.cu -lcusparse -lcusolver
    $ ./t48
    cusolverSpCreate               CUSOLVER_STATUS_SUCCESS
    time: 0.202933s
    cusolverSpScsrlsvqr            CUSOLVER_STATUS_SUCCESS
    cusolverSpDestroy              CUSOLVER_STATUS_SUCCESS
    System solved:             6.653404 ms
    cusparse time: 0.089000ms
    no error
    -11243.155273    -11242.705078
    7496.770508    7496.473145
    -3747.185303    -3747.039551
    0.685791    0.689445
    2083.059570    2082.854004
    -1892.308716    -1892.124756
    306.474457    306.447662
    1103.516846    1103.407104
    -1271.085938    -1270.961060
    334.883911    334.852417
    711.941956    711.870911
    -955.718140    -955.624390
    321.378174    321.348175
    506.580902    506.530060
    -764.231689    -764.156799
    300.298950    300.270935
    382.335785    382.299347
    -635.448120    -635.388000
    279.217651    279.191864
    300.164459    300.135559
    -542.869019    -542.817688
    259.955475    259.931702
    242.390839    242.367310
    -473.107758    -473.063080
    242.806229    242.785751
    199.916733    199.895645
    -418.663696    -418.624115
    227.637909    227.618652
    167.604431    167.586487
    -375.002411    -374.966827
    214.208069    214.189835
    142.353058    142.337738
    -339.221130    -339.187653
    202.273911    202.255341
    122.184746    122.171494
    -309.370209    -309.339600
    191.615189    191.597580
    105.783485    105.771858
    -284.096802    -284.068604
    182.047958    182.031158
    $
    

    Notes:

    1. No claims of correctness or suitability here. It's mostly your code, which I modified a bit to investigate.
    2. The results between the methods don't match exactly, but in the float case appear to be within about 1% of each other. I think some of this is due to float resolution, but there may be other factors. Without further study I wouldn't have any reason to claim one is "more correct" than the other.
    3. I used the nopivot variant of gtsv2 because the documentation seemed to suggest it would be faster in the power-of-2 size case, which is your case. And according to my testing it was faster. (For comparison, a minimal sketch of the pivoting variant's call sequence appears after these notes.)
    4. When I run the nopivot case at size 2^12 instead of 2^14, it certainly does run faster on my GPU (GTX 960). YMMV.
    5. I dropped a variety of other junk in the code as I was investigating various things, so it's a bit messy.
    6. Again, I really can't explain the cusolver case. The speculation around the tridiagonal nature of the problem is just that: speculation. Nevertheless, it seems likely to me that if the cusparse developers found a good reason to provide a separate set of solvers for the tridiagonal case, there is probably some aspect of the problem that can be exploited when that structure is known a priori. So it seems reasonable to use them, and in this case they do run faster.
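
    For reference, here is a minimal standalone sketch of the pivoting gtsv2 path (cusparseSgtsv2_bufferSizeExt / cusparseSgtsv2) mentioned in note 3. The 4x4 system and its solution (all ones) are made up purely for illustration and are not taken from the test case above; the call sequence (create handle, query workspace, solve in place) is the same one you would use on the real diagonals:

    // sketch: solve a tiny tridiagonal system with the pivoting gtsv2 variant
    // compile with: nvcc -o gtsv2_pivot gtsv2_pivot.cu -lcusparse
    #include <cuda_runtime.h>
    #include <cusparse_v2.h>
    #include <cstdio>

    int main() {
        // 4x4 system: diagonal 4, off-diagonals 1, RHS chosen so that x = (1,1,1,1).
        // dl[0] and du[m-1] lie outside the matrix and are set to 0 by convention.
        const int m = 4;
        float dl[m] = {0.f, 1.f, 1.f, 1.f};
        float d [m] = {4.f, 4.f, 4.f, 4.f};
        float du[m] = {1.f, 1.f, 1.f, 0.f};
        float b [m] = {5.f, 6.f, 6.f, 5.f};   // overwritten with the solution

        float *dev_dl, *dev_d, *dev_du, *dev_b;
        cudaMalloc(&dev_dl, m * sizeof(float));
        cudaMalloc(&dev_d,  m * sizeof(float));
        cudaMalloc(&dev_du, m * sizeof(float));
        cudaMalloc(&dev_b,  m * sizeof(float));
        cudaMemcpy(dev_dl, dl, m * sizeof(float), cudaMemcpyHostToDevice);
        cudaMemcpy(dev_d,  d,  m * sizeof(float), cudaMemcpyHostToDevice);
        cudaMemcpy(dev_du, du, m * sizeof(float), cudaMemcpyHostToDevice);
        cudaMemcpy(dev_b,  b,  m * sizeof(float), cudaMemcpyHostToDevice);

        cusparseHandle_t handle;
        cusparseCreate(&handle);

        // query workspace size, allocate it, then solve in place (dev_b becomes x)
        size_t bufferSize = 0;
        cusparseSgtsv2_bufferSizeExt(handle, m, 1, dev_dl, dev_d, dev_du, dev_b, m, &bufferSize);
        void *dev_buffer;
        cudaMalloc(&dev_buffer, bufferSize);
        cusparseStatus_t stat = cusparseSgtsv2(handle, m, 1, dev_dl, dev_d, dev_du, dev_b, m, dev_buffer);
        cudaDeviceSynchronize();

        cudaMemcpy(b, dev_b, m * sizeof(float), cudaMemcpyDeviceToHost);
        printf("status %d: x = %f %f %f %f\n", (int)stat, b[0], b[1], b[2], b[3]);

        cusparseDestroy(handle);
        cudaFree(dev_dl); cudaFree(dev_d); cudaFree(dev_du);
        cudaFree(dev_b); cudaFree(dev_buffer);
        return 0;
    }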