I have a sparse tridiagonal NxN matrix A built by some rule and want to solve the system Ax = b. For this I'm using cusolverSpScsrlsvqr() from the cuSolverSp module. Is it normal for the device version to be many times slower than cusolverSpScsrlsvqrHost() for large N? E.g., for N = 2^14 the times are 174.1 ms on the device and 3.5 ms on the host. I'm on an RTX 2060.
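To make "built by some rule" concrete: as the fillAB kernel below shows, A[i][i] = 10(i+1), A[i][i+1] = 10(i+1) + 5, A[i+1][i] = 10(i+1) - 5, and b[i] = 2 * 10(i+1). For N = 5 the system would be:

    | 10 15  0  0  0 |       |  20 |
    |  5 20 25  0  0 |       |  40 |
A = |  0 15 30 35  0 |   b = |  60 |
    |  0  0 25 40 45 |       |  80 |
    |  0  0  0 35 50 |       | 100 |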
Code:
#include <cuda_runtime.h>
#include <device_launch_parameters.h>
#include <cusolverSp.h>
#include <cusparse_v2.h>
#include <stdio.h>
#include <iostream>
#include <iomanip>
#include <chrono>
using namespace std;
void checkCudaCusolverStatus(cusolverStatus_t status, char const* operation) {
    char const *str = "UNKNOWN STATUS";
    switch (status) {
    case CUSOLVER_STATUS_SUCCESS:
        str = "CUSOLVER_STATUS_SUCCESS";
        break;
    case CUSOLVER_STATUS_NOT_INITIALIZED:
        str = "CUSOLVER_STATUS_NOT_INITIALIZED";
        break;
    case CUSOLVER_STATUS_ALLOC_FAILED:
        str = "CUSOLVER_STATUS_ALLOC_FAILED";
        break;
    case CUSOLVER_STATUS_INVALID_VALUE:
        str = "CUSOLVER_STATUS_INVALID_VALUE";
        break;
    case CUSOLVER_STATUS_ARCH_MISMATCH:
        str = "CUSOLVER_STATUS_ARCH_MISMATCH";
        break;
    case CUSOLVER_STATUS_MAPPING_ERROR:
        str = "CUSOLVER_STATUS_MAPPING_ERROR";
        break;
    case CUSOLVER_STATUS_EXECUTION_FAILED:
        str = "CUSOLVER_STATUS_EXECUTION_FAILED";
        break;
    case CUSOLVER_STATUS_INTERNAL_ERROR:
        str = "CUSOLVER_STATUS_INTERNAL_ERROR";
        break;
    case CUSOLVER_STATUS_MATRIX_TYPE_NOT_SUPPORTED:
        str = "CUSOLVER_STATUS_MATRIX_TYPE_NOT_SUPPORTED";
        break;
    case CUSOLVER_STATUS_ZERO_PIVOT:
        str = "CUSOLVER_STATUS_ZERO_PIVOT";
        break;
    }
    cout << left << setw(30) << operation << " " << str << endl;
}
// Builds A (tridiagonal, CSR) and b on the device. Thread i fills the three
// values owned by row i: its diagonal, its super-diagonal, and the
// sub-diagonal entry of row i+1. The kernel is launched with n = N - 1;
// thread 0 additionally fills the last row's diagonal entry.
__global__ void fillAB(float *aValues, int *aRowPtrs, int *aColIdxs, float *b, int const n) {
    int i = blockDim.x * blockIdx.x + threadIdx.x;
    if (i >= n) return;
    if (i == 0) {
        float xn = 10 * (n + 1);
        aValues[n * 3] = xn;
        aRowPtrs[0] = 0;
        aRowPtrs[n + 1] = n * 3 + 1;
        aColIdxs[n * 3] = n;
        b[n] = xn * 2;
    }
    float xi = 10 * (i + 1);
    aValues[i * 3 + 0] = xi;        // A[i][i]
    aValues[i * 3 + 1] = xi + 5;    // A[i][i+1]
    aValues[i * 3 + 2] = xi - 5;    // A[i+1][i]
    aColIdxs[i * 3 + 0] = i;
    aColIdxs[i * 3 + 1] = i + 1;
    aColIdxs[i * 3 + 2] = i;
    aRowPtrs[i + 1] = 2 + (i * 3);
    b[i] = xi * 2;
}
int main() {
    int const n = (int)pow(2, 14); // <<<<<<<<<<<<<<<<<<<<<<<<<<<<< N HERE
    int const valCount = n * 3 - 2;
    float *const aValues = new float[valCount];
    int *const aRowPtrs = new int[n + 1];
    int *const aColIdxs = new int[valCount];
    float *const b = new float[n];
    float *const x = new float[n];
    float *dev_aValues;
    int *dev_aRowPtrs;
    int *dev_aColIdxs;
    float *dev_b;
    float *dev_x;
    int aValuesSize = sizeof(float) * valCount;
    int aRowPtrsSize = sizeof(int) * (n + 1);
    int aColIdxsSize = sizeof(int) * valCount;
    int bSize = sizeof(float) * n;
    int xSize = sizeof(float) * n;
    cudaMalloc((void**)&dev_aValues, aValuesSize);
    cudaMalloc((void**)&dev_aRowPtrs, aRowPtrsSize);
    cudaMalloc((void**)&dev_aColIdxs, aColIdxsSize);
    cudaMalloc((void**)&dev_b, bSize);
    cudaMalloc((void**)&dev_x, xSize);
    // 1024 blocks of ceil(n/1024) threads: covers the n - 1 threads the kernel needs
    fillAB<<<1024, (int)ceil(n / 1024.f)>>>(dev_aValues, dev_aRowPtrs, dev_aColIdxs, dev_b, n - 1);
    cudaMemcpy(aValues, dev_aValues, aValuesSize, cudaMemcpyDeviceToHost);
    cudaMemcpy(aRowPtrs, dev_aRowPtrs, aRowPtrsSize, cudaMemcpyDeviceToHost);
    cudaMemcpy(aColIdxs, dev_aColIdxs, aColIdxsSize, cudaMemcpyDeviceToHost);
    cudaMemcpy(b, dev_b, bSize, cudaMemcpyDeviceToHost);
    cusolverSpHandle_t handle;
    checkCudaCusolverStatus(cusolverSpCreate(&handle), "cusolverSpCreate");
    cusparseMatDescr_t aDescr;
    cusparseCreateMatDescr(&aDescr);
    cusparseSetMatIndexBase(aDescr, CUSPARSE_INDEX_BASE_ZERO);
    cusparseSetMatType(aDescr, CUSPARSE_MATRIX_TYPE_GENERAL);
    int singularity;
    cusolverStatus_t status;
    // warm-up solve, excluded from the timed loop
    cusolverSpScsrlsvqr(handle, n, valCount, aDescr, dev_aValues, dev_aRowPtrs, dev_aColIdxs, dev_b, 0.1f, 0, dev_x, &singularity);
    cudaDeviceSynchronize();
    auto t0 = chrono::high_resolution_clock::now();
    for (int i = 0; i < 10; ++i)
        ////////////////////// CUSOLVER HERE //////////////////////
        status = cusolverSpScsrlsvqr(handle, n, valCount, aDescr, dev_aValues, dev_aRowPtrs, dev_aColIdxs, dev_b, 0.1f, 0, dev_x, &singularity);
        //status = cusolverSpScsrlsvqrHost(handle, n, valCount, aDescr, aValues, aRowPtrs, aColIdxs, b, 0.1f, 0, x, &singularity);
        ///////////////////////////////////////////////////////////
    cudaDeviceSynchronize();
    auto t1 = chrono::high_resolution_clock::now();
    checkCudaCusolverStatus(status, "cusolverSpScsrlsvqr");
    checkCudaCusolverStatus(cusolverSpDestroy(handle), "cusolverSpDestroy");
    cout << "System solved: " << setw(20) << fixed << right << setprecision(3) << (t1 - t0).count() / 10.0 / 1000000 << " ms" << endl;
    cudaMemcpy(x, dev_x, xSize, cudaMemcpyDeviceToHost);
    /*for (int i = 0; i < n; ++i) {
        cout << " " << x[i];
    }*/
    cudaFree(dev_aValues);
    cudaFree(dev_aRowPtrs);
    cudaFree(dev_aColIdxs);
    cudaFree(dev_b);
    cudaFree(dev_x);
    delete[] aValues;
    delete[] aRowPtrs;
    delete[] aColIdxs;
    delete[] b;
    delete[] x;
    cudaDeviceReset();
    return 0;
}
My guess is that the issue here is that it is a tridiagonal matrix. I suspect that may eliminate certain parallelism aspects that would otherwise benefit the GPU cusolver routine. I don't really have any justification for this statement except that the cusparse docs contain statements like this:

"For example, a tridiagonal matrix has no parallelism."

I can't say exactly what that means, except that it suggests to me that for a tridiagonal matrix a different approach may be warranted. And cusparse provides solvers (the gtsv2 family) specifically for the tridiagonal case.
If we use one of those, we can get results that are faster than your specific host cusolver example, on your test case. Here is an example:
$ cat t48.cu
#include <cuda_runtime.h>
#include <device_launch_parameters.h>
#include <cusolverSp.h>
#include <cusparse_v2.h>
#include <stdio.h>
#include <iostream>
#include <iomanip>
#include <chrono>
#include <cassert>
#include <time.h>
#include <sys/time.h>
#define USECPSEC 1000000ULL
unsigned long long dtime_usec(unsigned long long start){
    timeval tv;
    gettimeofday(&tv, 0);
    return ((tv.tv_sec*USECPSEC)+tv.tv_usec)-start;
}
#ifdef USE_DOUBLE
#define START 3
#define TOL 0.000001
#define THR 0.00001
typedef double mt;
#else
#define START 0
#define TOL 0.01
#define THR 0.1
typedef float mt;
#endif
using namespace std;
void checkCudaCusolverStatus(cusolverStatus_t status, char const* operation) {
    char const *str = "UNKNOWN STATUS";
    switch (status) {
    case CUSOLVER_STATUS_SUCCESS:
        str = "CUSOLVER_STATUS_SUCCESS";
        break;
    case CUSOLVER_STATUS_NOT_INITIALIZED:
        str = "CUSOLVER_STATUS_NOT_INITIALIZED";
        break;
    case CUSOLVER_STATUS_ALLOC_FAILED:
        str = "CUSOLVER_STATUS_ALLOC_FAILED";
        break;
    case CUSOLVER_STATUS_INVALID_VALUE:
        str = "CUSOLVER_STATUS_INVALID_VALUE";
        break;
    case CUSOLVER_STATUS_ARCH_MISMATCH:
        str = "CUSOLVER_STATUS_ARCH_MISMATCH";
        break;
    case CUSOLVER_STATUS_MAPPING_ERROR:
        str = "CUSOLVER_STATUS_MAPPING_ERROR";
        break;
    case CUSOLVER_STATUS_EXECUTION_FAILED:
        str = "CUSOLVER_STATUS_EXECUTION_FAILED";
        break;
    case CUSOLVER_STATUS_INTERNAL_ERROR:
        str = "CUSOLVER_STATUS_INTERNAL_ERROR";
        break;
    case CUSOLVER_STATUS_MATRIX_TYPE_NOT_SUPPORTED:
        str = "CUSOLVER_STATUS_MATRIX_TYPE_NOT_SUPPORTED";
        break;
    case CUSOLVER_STATUS_ZERO_PIVOT:
        str = "CUSOLVER_STATUS_ZERO_PIVOT";
        break;
    }
    cout << left << setw(30) << operation << " " << str << endl;
}
__global__ void fillAB(mt *aValues, int *aRowPtrs, int *aColIdxs, mt *b, int const n) {
    int i = blockDim.x * blockIdx.x + threadIdx.x;
    if (i >= n) return;
    if (i == 0) {
        mt xn = 10 * (n + 1);
        aValues[n * 3] = xn;
        aRowPtrs[0] = 0;
        aRowPtrs[n + 1] = n * 3 + 1;
        aColIdxs[n * 3] = n;
        b[n] = xn * 2;
    }
    mt xi = 10 * (i + 1);
    aValues[i * 3 + 0] = xi;
    aValues[i * 3 + 1] = xi + 5;
    aValues[i * 3 + 2] = xi - 5;
    aColIdxs[i * 3 + 0] = i;
    aColIdxs[i * 3 + 1] = i + 1;
    aColIdxs[i * 3 + 2] = i;
    aRowPtrs[i + 1] = 2 + (i * 3);
    b[i] = xi * 2;
}
// Unpacks the CSR values into the three diagonal arrays (dl, d, du) expected
// by cusparse gtsv2, and copies b into b2 since gtsv2 solves in place.
__global__ void filld3(mt *d, mt *du, mt *dl, mt *aValues, mt *b, mt *b2, const int n){
    int i = blockDim.x*blockIdx.x+threadIdx.x;
    if ((i > 0) && (i < n-1)){
        dl[i] = aValues[i*3 - 1];
        d[i]  = aValues[i*3];
        du[i] = aValues[i*3+1];
    }
    if (i == 0){        // first row has no sub-diagonal entry
        dl[0] = 0;
        d[0]  = aValues[0];
        du[0] = aValues[1];
    }
    if (i == (n-1)){    // last row has no super-diagonal entry
        dl[i] = aValues[i*3-1];
        d[i]  = aValues[i*3];
        du[i] = 0;
    }
    if (i < n) b2[i] = b[i];
}
int main() {
    int const n = (int)pow(2, 14); // <<<<<<<<<<<<<<<<<<<<<<<<<<<<< N HERE
    int const valCount = n * 3 - 2;
    mt *const aValues = new mt[valCount];
    int *const aRowPtrs = new int[n + 1];
    int *const aColIdxs = new int[valCount];
    mt *const b = new mt[n];
    mt *const x = new mt[n];
    mt *const x2 = new mt[n];
    mt *dev_aValues;
    int *dev_aRowPtrs;
    int *dev_aColIdxs;
    mt *dev_b;
    mt *dev_x;
    mt *dev_b2, *dev_d, *dev_dl, *dev_du;   // tridiagonal-format copies for gtsv2
    int aValuesSize = sizeof(mt) * valCount;
    int aRowPtrsSize = sizeof(int) * (n + 1);
    int aColIdxsSize = sizeof(int) * valCount;
    int bSize = sizeof(mt) * n;
    int xSize = sizeof(mt) * n;
    cudaMalloc((void**)&dev_aValues, aValuesSize);
    cudaMalloc((void**)&dev_aRowPtrs, aRowPtrsSize);
    cudaMalloc((void**)&dev_aColIdxs, aColIdxsSize);
    cudaMalloc((void**)&dev_b, bSize);
    cudaMalloc((void**)&dev_x, xSize);
    cudaMalloc((void**)&dev_b2, bSize);
    cudaMalloc(&dev_d, n*sizeof(mt));
    cudaMalloc(&dev_dl, n*sizeof(mt));
    cudaMalloc(&dev_du, n*sizeof(mt));
    fillAB<<<1024, (int)ceil(n / 1024.f)>>>(dev_aValues, dev_aRowPtrs, dev_aColIdxs, dev_b, n - 1);
    filld3<<<(n+1023)/1024,1024>>>(dev_d, dev_du, dev_dl, dev_aValues, dev_b, dev_b2, n);
    cudaMemcpy(aValues, dev_aValues, aValuesSize, cudaMemcpyDeviceToHost);
    cudaMemcpy(aRowPtrs, dev_aRowPtrs, aRowPtrsSize, cudaMemcpyDeviceToHost);
    cudaMemcpy(aColIdxs, dev_aColIdxs, aColIdxsSize, cudaMemcpyDeviceToHost);
    cudaMemcpy(b, dev_b, bSize, cudaMemcpyDeviceToHost);
    cusolverSpHandle_t handle;
    checkCudaCusolverStatus(cusolverSpCreate(&handle), "cusolverSpCreate");
    cusparseMatDescr_t aDescr;
    cusparseCreateMatDescr(&aDescr);
    cusparseSetMatIndexBase(aDescr, CUSPARSE_INDEX_BASE_ZERO);
    cusparseSetMatType(aDescr, CUSPARSE_MATRIX_TYPE_GENERAL);
    int singularity;
    cusolverStatus_t status;
    // time the first (cold) csrlsvqr solve separately; it includes one-time setup cost
    unsigned long long dt = dtime_usec(0);
#ifdef USE_DOUBLE
    cusolverSpDcsrlsvqr(handle, n, valCount, aDescr, dev_aValues, dev_aRowPtrs, dev_aColIdxs, dev_b, 0.1f, 0, dev_x, &singularity);
#else
    cusolverSpScsrlsvqr(handle, n, valCount, aDescr, dev_aValues, dev_aRowPtrs, dev_aColIdxs, dev_b, 0.1f, 0, dev_x, &singularity);
#endif
    cudaDeviceSynchronize();
    dt = dtime_usec(dt);
    std::cout << "time: " << dt/(float)USECPSEC << "s" << std::endl;
    auto t0 = chrono::high_resolution_clock::now();
    for (int i = 0; i < 10; ++i)
        ////////////////////// CUSOLVER HERE //////////////////////
#ifdef USE_DEVICE
#ifdef USE_DOUBLE
        status = cusolverSpDcsrlsvqr(handle, n, valCount, aDescr, dev_aValues, dev_aRowPtrs, dev_aColIdxs, dev_b, 0.1f, 0, dev_x, &singularity);
#else
        status = cusolverSpScsrlsvqr(handle, n, valCount, aDescr, dev_aValues, dev_aRowPtrs, dev_aColIdxs, dev_b, 0.1f, 0, dev_x, &singularity);
#endif
#else
#ifdef USE_DOUBLE
        status = cusolverSpDcsrlsvqrHost(handle, n, valCount, aDescr, aValues, aRowPtrs, aColIdxs, b, 0.1f, 0, x, &singularity);
#else
        status = cusolverSpScsrlsvqrHost(handle, n, valCount, aDescr, aValues, aRowPtrs, aColIdxs, b, 0.1f, 0, x, &singularity);
#endif
#endif
        ///////////////////////////////////////////////////////////
    cudaDeviceSynchronize();
    auto t1 = chrono::high_resolution_clock::now();
    checkCudaCusolverStatus(status, "cusolverSpScsrlsvqr");
    checkCudaCusolverStatus(cusolverSpDestroy(handle), "cusolverSpDestroy");
    cout << "System solved: " << setw(20) << fixed << right << setprecision(6) << (t1 - t0).count() / 10.0 / 1000000 << " ms" << endl;
    cudaMemcpy(x, dev_x, xSize, cudaMemcpyDeviceToHost);
    /*for (int i = 0; i < n; ++i) {
        cout << " " << x[i];
    }*/
    // now solve the same system with the cusparse tridiagonal (gtsv2) solver
    cusparseHandle_t csphandle;
    cusparseStatus_t cstat = cusparseCreate(&csphandle);
    assert(cstat == CUSPARSE_STATUS_SUCCESS);
    size_t bufferSize;
#ifdef USE_DOUBLE
    cstat = cusparseDgtsv2_nopivot_bufferSizeExt(csphandle, n, 1, dev_dl, dev_d, dev_du, dev_b2, n, &bufferSize);
#else
    cstat = cusparseSgtsv2_nopivot_bufferSizeExt(csphandle, n, 1, dev_dl, dev_d, dev_du, dev_b2, n, &bufferSize);
#endif
    assert(cstat == CUSPARSE_STATUS_SUCCESS);
    unsigned char *dev_buffer;
    cudaMalloc(&dev_buffer, bufferSize);
    dt = dtime_usec(0);
#ifdef USE_DOUBLE
    cstat = cusparseDgtsv2_nopivot(csphandle, n, 1, dev_dl, dev_d, dev_du, dev_b2, n, (void *)dev_buffer);
#else
    cstat = cusparseSgtsv2_nopivot(csphandle, n, 1, dev_dl, dev_d, dev_du, dev_b2, n, (void *)dev_buffer);
#endif
    if (cstat != CUSPARSE_STATUS_SUCCESS) {std::cout << "cusparse solve error: " << (int)cstat << std::endl;}
    cudaDeviceSynchronize();
    dt = dtime_usec(dt);
    std::cout << "cusparse time: " << (dt*1000.f)/(float)USECPSEC << "ms" << std::endl;
    std::cout << cudaGetErrorString(cudaGetLastError()) << std::endl;
    // gtsv2 solves in place: dev_b2 now holds the solution
    cudaMemcpy(x2, dev_b2, xSize, cudaMemcpyDeviceToHost);
    // compare the two solutions (relative error, above a noise threshold)
    for (int i = START; i < n; i++)
        if ((x[i] > THR) && (fabs((x[i] - x2[i])/x[i]) > TOL)) {
            std::cout << "mismatch at: " << i << " was: " << x2[i] << " should be: " << x[i] << std::endl;
            return 0;
        }
    for (int i = 0; i < 40; i++)
        std::cout << x2[i] << " " << x[i] << std::endl;
    cudaFree(dev_aValues);
    cudaFree(dev_aRowPtrs);
    cudaFree(dev_aColIdxs);
    cudaFree(dev_b);
    cudaFree(dev_x);
    cudaFree(dev_b2);
    cudaFree(dev_d);
    cudaFree(dev_dl);
    cudaFree(dev_du);
    cudaFree(dev_buffer);
    cusparseDestroy(csphandle);
    delete[] aValues;
    delete[] aRowPtrs;
    delete[] aColIdxs;
    delete[] b;
    delete[] x;
    delete[] x2;
    cudaDeviceReset();
    return 0;
}
$ nvcc -o t48 t48.cu -lcusparse -lcusolver
$ ./t48
cusolverSpCreate CUSOLVER_STATUS_SUCCESS
time: 0.202933s
cusolverSpScsrlsvqr CUSOLVER_STATUS_SUCCESS
cusolverSpDestroy CUSOLVER_STATUS_SUCCESS
System solved: 6.653404 ms
cusparse time: 0.089000ms
no error
-11243.155273 -11242.705078
7496.770508 7496.473145
-3747.185303 -3747.039551
0.685791 0.689445
2083.059570 2082.854004
-1892.308716 -1892.124756
306.474457 306.447662
1103.516846 1103.407104
-1271.085938 -1270.961060
334.883911 334.852417
711.941956 711.870911
-955.718140 -955.624390
321.378174 321.348175
506.580902 506.530060
-764.231689 -764.156799
300.298950 300.270935
382.335785 382.299347
-635.448120 -635.388000
279.217651 279.191864
300.164459 300.135559
-542.869019 -542.817688
259.955475 259.931702
242.390839 242.367310
-473.107758 -473.063080
242.806229 242.785751
199.916733 199.895645
-418.663696 -418.624115
227.637909 227.618652
167.604431 167.586487
-375.002411 -374.966827
214.208069 214.189835
142.353058 142.337738
-339.221130 -339.187653
202.273911 202.255341
122.184746 122.171494
-309.370209 -309.339600
191.615189 191.597580
105.783485 105.771858
-284.096802 -284.068604
182.047958 182.031158
$
Notes:
1. The results from the two methods in the float case appear to be within about 1% of each other. I think some of this is due to float resolution, but there may be other factors. Without further study I wouldn't have any reason to claim one is "more correct" than the other.
2. I chose the nopivot variant of gtsv2 because the documentation seemed to suggest that it would be faster in the power-of-2 size case, which is your case. And according to my testing it was faster.
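For reference, stripped of the benchmarking and comparison scaffolding above, the tridiagonal solve itself boils down to a query-allocate-solve sequence. A minimal sketch, assuming dl, d, du, and rhs are already-filled float device arrays of length n (with dl[0] = 0 and du[n-1] = 0) and omitting error checks:

cusparseHandle_t h;
cusparseCreate(&h);
size_t bufferSize;
// ask the library how much workspace the no-pivot tridiagonal solver needs
cusparseSgtsv2_nopivot_bufferSizeExt(h, n, 1, dl, d, du, rhs, n, &bufferSize);
void *buffer;
cudaMalloc(&buffer, bufferSize);
// solves in place: on exit rhs holds the solution x
cusparseSgtsv2_nopivot(h, n, 1, dl, d, du, rhs, n, buffer);
cudaDeviceSynchronize();
cudaFree(buffer);
cusparseDestroy(h);

The pivoting variant (cusparseSgtsv2 with cusparseSgtsv2_bufferSizeExt) follows the same call sequence, if your matrix should turn out to need pivoting for stability.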