I am testing some scenarios where the function dgetrf returns different results when called through cuBLAS/cuSOLVER compared to LAPACK. For example, I am looking at the LU factorization of the following matrix:
2.0 4.0 1.0 -3.0 0.0
-1.0 -2.0 2.0 4.0 0.0
4.0 2.0 -3.0 5.0 0.0
5.0 -4.0 -3.0 1.0 0.0
0.0 0.0 0.0 0.0 0.0
I first try to call dgetrf from cuBLAS/cuSOLVER as follows (warning: ugly testing code ahead!):
#include <cblas.h>
#include <time.h>
#include <stdio.h>
#include <string.h>
#include <cuda_runtime.h>
#include <cublas_v2.h>
#include <cusolverDn.h>

int main(int argc, char** argv){
    const int matrixSize = 5;
    int i, j;
    double arrA[matrixSize][matrixSize] = {
        {2.0, 4.0, 1.0, -3.0, 0.0},
        {-1.0, -2.0, 2.0, 4.0, 0.0},
        {4.0, 2.0, -3.0, 5.0, 0.0},
        {5.0, -4.0, -3.0, 1.0, 0.0},
        {0.0, 0.0, 0.0, 0.0, 0.0}
    };
    double *arrADev, *workArray;
    double **matrixArray;
    int *pivotArray;
    int *infoArray;
    double flat[matrixSize*matrixSize] = {0};
    cublasHandle_t cublasHandle;
    cublasStatus_t cublasStatus;
    cudaError_t error;
    cudaError cudaStatus;
    cusolverStatus_t cusolverStatus;
    cusolverDnHandle_t cusolverHandle;
    double *matrices[2];

    error = cudaMalloc(&arrADev, sizeof(double) * matrixSize*matrixSize);
    if (error != cudaSuccess) fprintf(stderr,"\nError: %s\n",cudaGetErrorString(error));
    error = cudaMalloc(&matrixArray, sizeof(double*) * 2);
    if (error != cudaSuccess) fprintf(stderr,"\nError: %s\n",cudaGetErrorString(error));
    error = cudaMalloc(&pivotArray, sizeof(int) * matrixSize*matrixSize);
    if (error != cudaSuccess) fprintf(stderr,"\nError: %s\n",cudaGetErrorString(error));
    error = cudaMalloc(&infoArray, sizeof(int) * matrixSize*matrixSize);
    if (error != cudaSuccess) fprintf(stderr,"\nError: %s\n",cudaGetErrorString(error));

    cublasStatus = cublasCreate(&cublasHandle);
    if (cublasStatus != CUBLAS_STATUS_SUCCESS) fprintf(stderr,"error %i\n",cublasStatus);

    //maps matrix to flat vector
    for(i=0; i<matrixSize; i++){
        for(j=0; j<matrixSize; j++){
            flat[i+j*matrixSize] = arrA[i][j];
        }
    }

    //copy matrix A to device
    cublasStatus = cublasSetMatrix(matrixSize, matrixSize, sizeof(double), flat, matrixSize, arrADev, matrixSize);
    if (cublasStatus != CUBLAS_STATUS_SUCCESS) fprintf(stderr,"error %i\n",cublasStatus);

    //save matrix address
    matrices[0] = arrADev;

    //copy matrices references to device
    error = cudaMemcpy(matrixArray, matrices, sizeof(double*)*1, cudaMemcpyHostToDevice);
    if (error != cudaSuccess) fprintf(stderr,"\nError: %s\n",cudaGetErrorString(error));

    int Lwork;
    // calculate buffer size for cuSOLVER LU factorization
    cusolverStatus = cusolverDnDgetrf_bufferSize(cusolverHandle, matrixSize, matrixSize, arrADev, matrixSize, &Lwork);
    cudaStatus = cudaMalloc((void**)&workArray, Lwork*sizeof(double));

    // cuBLAS LU factorization
    cublasStatus = cublasDgetrfBatched(cublasHandle, matrixSize, matrixArray, matrixSize, pivotArray, infoArray, 1);
    if (cublasStatus == CUBLAS_STATUS_SUCCESS)
        printf("cuBLAS DGETRF SUCCESSFUL! \n");
    else
        printf("cuBLAS DGETRF UNSUCCESSFUL! \n");

    // cuSOLVER LU factorization
    cusolverStatus = cusolverDnCreate(&cusolverHandle);
    cusolverStatus = cusolverDnDgetrf(cusolverHandle, matrixSize, matrixSize, arrADev, matrixSize, workArray, pivotArray, infoArray);
    if (cusolverStatus == CUSOLVER_STATUS_SUCCESS)
        printf("cuSOLVER DGETRF SUCCESSFUL! \n");
    else
        printf("cuSOLVER DGETRF UNSUCCESSFUL! \n");

    return 0;
}
The output from the code above is
cuBLAS DGETRF SUCCESSFUL!
cuSOLVER DGETRF SUCCESSFUL!
When I try to do the same with LAPACK (warning: more ugly code!):
#include <iostream>
#include <vector>
using namespace std;

extern "C" void dgetrf_(int* dim1, int* dim2, double* a, int* lda, int* ipiv, int* info);
extern "C" void dgetrs_(char *TRANS, int *N, int *NRHS, double *A, int *LDA, int *IPIV, double *B, int *LDB, int *INFO);

int main()
{
    char trans = 'N';
    int dim = 5;
    int LDA = dim;
    int info;

    vector<double> a,b;
    a.push_back(2.0); a.push_back(4.0); a.push_back(1.0); a.push_back(-3.0); a.push_back(0.0);
    a.push_back(-1.0); a.push_back(-2.0); a.push_back(2.0); a.push_back(4.0); a.push_back(0.0);
    a.push_back(4.0); a.push_back(2.0); a.push_back(-3.0); a.push_back(5.0); a.push_back(0.0);
    a.push_back(5.0); a.push_back(-4.0); a.push_back(-3.0); a.push_back(1.0); a.push_back(0.0);
    a.push_back(0.0); a.push_back(0.0); a.push_back(0.0); a.push_back(0.0); a.push_back(0.0);

    int ipiv[5];
    dgetrf_(&dim, &dim, &*a.begin(), &LDA, ipiv, &info);

    if (info == 0)
        printf("dgetrf successful\n");
    else
        printf("dgetrf unsuccessful\n");

    return 0;
}
The output I get is
dgetrf unsuccessful
I understand that they are different libraries, but is this behaviour expected?
When I compile your CUDA code, I get a warning that the cusolver handle is being used before its value is set. You shouldn't ignore such warnings: your usage in the sizing function is not correct, because the handle has not been created yet at that point. However, that is not the problem here.
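For reference, the fix is simply to create the handle before using it in the workspace-size query, as the corrected listing further below does:

cusolverDnHandle_t cusolverHandle;
cusolverStatus_t cusolverStatus;
int Lwork;
// create the cuSOLVER handle first...
cusolverStatus = cusolverDnCreate(&cusolverHandle);
// ...and only then use it, e.g. to query the workspace size
cusolverStatus = cusolverDnDgetrf_bufferSize(cusolverHandle, matrixSize, matrixSize, arrADev, matrixSize, &Lwork);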
I don't think there's any difference between your two test cases. You seem to be interpreting the results incorrectly.
Looking at the netlib documentation for dgetrf, we see that an info value of 5 means that U(5,5) is exactly zero, which would be problematic for future use (for example, solving a system with that factor would divide by zero). That doesn't mean the dgetrf factorization was successful or unsuccessful as you are printing out; instead it tells you something about your input data. In fact the factorization was completed, as clearly indicated in the docs.
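A minimal sketch of how the info value is interpreted, following the netlib convention (info = 0: success; info < 0: argument -info was illegal; info > 0: U(info,info) is exactly zero, but the factorization is complete):

dgetrf_(&dim, &dim, &*a.begin(), &LDA, ipiv, &info);
if (info == 0)
    printf("factorization completed, U is nonsingular\n");
else if (info < 0)
    printf("argument %d had an illegal value\n", -info);
else // info > 0: factorization completed, but U is exactly singular
    printf("U(%d,%d) is exactly zero\n", info, info);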
Likewise, we get no information about that condition simply by looking at the return value of the cuSOLVER function. In order to discover information similar to what LAPACK is reporting, it's necessary to look at the infoArray values.
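Concretely, the only addition needed in the CUDA code (also present in the full listing below) is a device-to-host copy of infoArray, followed by printing it:

// copy the device-side info values back to the host and print them
int *hinfoArray = (int *)malloc(matrixSize*matrixSize*sizeof(int));
cudaMemcpy(hinfoArray, infoArray, matrixSize*matrixSize*sizeof(int), cudaMemcpyDeviceToHost);
for (int i = 0; i < matrixSize*matrixSize; i++) printf("%d,", hinfoArray[i]);
printf("\n");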
With those changes, both of your codes report the same thing (an info value of 5):
$ cat t1556.cu
#include <time.h>
#include <stdio.h>
#include <string.h>
#include <cuda_runtime.h>
#include <cublas_v2.h>
#include <cusolverDn.h>

int main(int argc, char** argv){
    const int matrixSize = 5;
    int i, j;
    double arrA[matrixSize][matrixSize] = {
        {2.0, 4.0, 1.0, -3.0, 0.0},
        {-1.0, -2.0, 2.0, 4.0, 0.0},
        {4.0, 2.0, -3.0, 5.0, 0.0},
        {5.0, -4.0, -3.0, 1.0, 0.0},
        {0.0, 0.0, 0.0, 0.0, 0.0}
    };
    double *arrADev, *workArray;
    double **matrixArray;
    int *pivotArray;
    int *infoArray;
    double flat[matrixSize*matrixSize] = {0};
    cublasHandle_t cublasHandle;
    cublasStatus_t cublasStatus;
    cudaError_t error;
    cudaError cudaStatus;
    cusolverStatus_t cusolverStatus;
    cusolverDnHandle_t cusolverHandle;
    double *matrices[2];

    error = cudaMalloc(&arrADev, sizeof(double) * matrixSize*matrixSize);
    if (error != cudaSuccess) fprintf(stderr,"\nError: %s\n",cudaGetErrorString(error));
    error = cudaMalloc(&matrixArray, sizeof(double*) * 2);
    if (error != cudaSuccess) fprintf(stderr,"\nError: %s\n",cudaGetErrorString(error));
    error = cudaMalloc(&pivotArray, sizeof(int) * matrixSize*matrixSize);
    if (error != cudaSuccess) fprintf(stderr,"\nError: %s\n",cudaGetErrorString(error));
    error = cudaMalloc(&infoArray, sizeof(int) * matrixSize*matrixSize);
    if (error != cudaSuccess) fprintf(stderr,"\nError: %s\n",cudaGetErrorString(error));

    cublasStatus = cublasCreate(&cublasHandle);
    if (cublasStatus != CUBLAS_STATUS_SUCCESS) fprintf(stderr,"error %i\n",cublasStatus);

    //maps matrix to flat vector
    for(i=0; i<matrixSize; i++){
        for(j=0; j<matrixSize; j++){
            flat[i+j*matrixSize] = arrA[i][j];
        }
    }

    //copy matrix A to device
    cublasStatus = cublasSetMatrix(matrixSize, matrixSize, sizeof(double), flat, matrixSize, arrADev, matrixSize);
    if (cublasStatus != CUBLAS_STATUS_SUCCESS) fprintf(stderr,"error %i\n",cublasStatus);

    //save matrix address
    matrices[0] = arrADev;

    //copy matrices references to device
    error = cudaMemcpy(matrixArray, matrices, sizeof(double*)*1, cudaMemcpyHostToDevice);
    if (error != cudaSuccess) fprintf(stderr,"\nError: %s\n",cudaGetErrorString(error));

    int Lwork;
    // calculate buffer size for cuSOLVER LU factorization
    cusolverStatus = cusolverDnCreate(&cusolverHandle);
    cusolverStatus = cusolverDnDgetrf_bufferSize(cusolverHandle, matrixSize, matrixSize, arrADev, matrixSize, &Lwork);
    cudaStatus = cudaMalloc((void**)&workArray, Lwork*sizeof(double));

    // cuBLAS LU factorization
    cublasStatus = cublasDgetrfBatched(cublasHandle, matrixSize, matrixArray, matrixSize, pivotArray, infoArray, 1);
    if (cublasStatus == CUBLAS_STATUS_SUCCESS)
        printf("cuBLAS DGETRF SUCCESSFUL! \n");
    else
        printf("cuBLAS DGETRF UNSUCCESSFUL! \n");

    // cuSOLVER LU factorization
    cusolverStatus = cusolverDnDgetrf(cusolverHandle, matrixSize, matrixSize, arrADev, matrixSize, workArray, pivotArray, infoArray);
    if (cusolverStatus == CUSOLVER_STATUS_SUCCESS)
        printf("cuSOLVER DGETRF SUCCESSFUL! \n");
    else
        printf("cuSOLVER DGETRF UNSUCCESSFUL! \n");

    int *hinfoArray = (int *)malloc(matrixSize*matrixSize*sizeof(int));
    cudaMemcpy(hinfoArray, infoArray, matrixSize*matrixSize*sizeof(int), cudaMemcpyDeviceToHost);
    for (int i = 0; i < matrixSize*matrixSize; i++) printf("%d,", hinfoArray[i]);
    printf("\n");
    return 0;
}
$ nvcc -o t1556 t1556.cu -lcublas -lcusolver
t1556.cu(30): warning: variable "cudaStatus" was set but never used
$ ./t1556
cuBLAS DGETRF SUCCESSFUL!
cuSOLVER DGETRF SUCCESSFUL!
5,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
$ cat t1557.cpp
#include <iostream>
#include <vector>
#include <lapacke/lapacke.h>
using namespace std;
// extern "C" void dgetrf_(int* dim1, int* dim2, double* a, int* lda, int* ipiv, int* info);
// extern "C" void dgetrs_(char *TRANS, int *N, int *NRHS, double *A, int *LDA, int *IPIV, double *B, int *LDB, int *INFO );

int main()
{
    char trans = 'N';
    int dim = 5;
    int LDA = dim;
    int info;

    vector<double> a,b;
    a.push_back(2.0); a.push_back(4.0); a.push_back(1.0); a.push_back(-3.0); a.push_back(0.0);
    a.push_back(-1.0); a.push_back(-2.0); a.push_back(2.0); a.push_back(4.0); a.push_back(0.0);
    a.push_back(4.0); a.push_back(2.0); a.push_back(-3.0); a.push_back(5.0); a.push_back(0.0);
    a.push_back(5.0); a.push_back(-4.0); a.push_back(-3.0); a.push_back(1.0); a.push_back(0.0);
    a.push_back(0.0); a.push_back(0.0); a.push_back(0.0); a.push_back(0.0); a.push_back(0.0);

    int ipiv[5];
    LAPACK_dgetrf(&dim, &dim, &*a.begin(), &LDA, ipiv, &info);
    printf("info = %d\n", info);
    if (info == 0)
        printf("dgetrf successful\n");
    else
        printf("dgetrf unsuccessful\n");
    return 0;
}
$ g++ t1557.cpp -o t1557 -llapack
$ ./t1557
info = 5
dgetrf unsuccessful
$
I'm using the LAPACK installed by CentOS.
CentOS 7, CUDA 10.1.243, Tesla V100.