Search code examples
cudalinear-algebralapackcublascusolver

LU factorization receives different results between LAPACK and cuBLAS/cuSOLVER


I am testing out some scenarios where the function dgetrf is returned differently when used with cuBLAS/cuSOLVER compared to writing for LAPACK. For example, I am looking at LU factorization of the following matrix:

2.0 4.0 1.0 -3.0 0.0

-1.0 -2.0 2.0 4.0 0.0

4.0 2.0 -3.0 5.0 0.0

5.0 -4.0 -3.0 1.0 0.0

0.0 0.0 0.0 0.0 0.0

I first try to call dgetrf from cuBLAS/cuSOLVER as followed (warning, ugly testing code ahead!)

    #include <cblas.h>
    #include <time.h>
    #include <stdio.h>
    #include <string.h>
    #include <cuda_runtime.h>
    #include <cublas_v2.h>
    #include <cusolverDn.h>

    int main(int argc, char** argv){

        const int matrixSize = 5;

        int i, j;
        double arrA[matrixSize][matrixSize] = {
            {2.0, 4.0, 1.0, -3.0, 0.0},
            {-1.0, -2.0, 2.0, 4.0, 0.0},
            {4.0, 2.0, -3.0, 5.0, 0.0},
            {5.0, -4.0, -3.0, 1.0, 0.0},
            {0.0, 0.0, 0.0, 0.0, 0.0}
        };

        double *arrADev, *workArray;
        double **matrixArray;
        int *pivotArray;
        int *infoArray;
        double flat[matrixSize*matrixSize] = {0};
        cublasHandle_t cublasHandle;
        cublasStatus_t cublasStatus;
        cudaError_t error;

        cudaError cudaStatus;
        cusolverStatus_t cusolverStatus;
        cusolverDnHandle_t cusolverHandle;

        double *matrices[2];


        error = cudaMalloc(&arrADev,  sizeof(double) * matrixSize*matrixSize);
        if (error != cudaSuccess) fprintf(stderr,"\nError: %s\n",cudaGetErrorString(error));

        error = cudaMalloc(&matrixArray,  sizeof(double*) * 2);
        if (error != cudaSuccess) fprintf(stderr,"\nError: %s\n",cudaGetErrorString(error));

        error = cudaMalloc(&pivotArray,  sizeof(int) * matrixSize*matrixSize);
        if (error != cudaSuccess) fprintf(stderr,"\nError: %s\n",cudaGetErrorString(error));

        error = cudaMalloc(&infoArray,  sizeof(int) * matrixSize*matrixSize);
        if (error != cudaSuccess) fprintf(stderr,"\nError: %s\n",cudaGetErrorString(error));

        cublasStatus = cublasCreate(&cublasHandle);
        if (cublasStatus != CUBLAS_STATUS_SUCCESS) fprintf(stderr,"error %i\n",cublasStatus);

        //maps matrix to flat vector
        for(i=0; i<matrixSize; i++){
            for(j=0; j<matrixSize; j++){
                flat[i+j*matrixSize] = arrA[i][j];
            }
        }

        //copy matrix A to device
        cublasStatus = cublasSetMatrix(matrixSize, matrixSize, sizeof(double), flat, matrixSize, arrADev, matrixSize);
        if (cublasStatus != CUBLAS_STATUS_SUCCESS) fprintf(stderr,"error %i\n",cublasStatus);

        //save matrix address
        matrices[0] = arrADev;

        //copy matrices references to device
        error = cudaMemcpy(matrixArray, matrices, sizeof(double*)*1, cudaMemcpyHostToDevice);
        if (error != cudaSuccess) fprintf(stderr,"\nError: %s\n",cudaGetErrorString(error));

        int Lwork;
        // calculate buffer size for cuSOLVER LU factorization
        cusolverStatus = cusolverDnDgetrf_bufferSize(cusolverHandle, matrixSize, matrixSize, arrADev, matrixSize, &Lwork);
        cudaStatus = cudaMalloc((void**)&workArray, Lwork*sizeof(double));

        // cuBLAS LU factorization
        cublasStatus = cublasDgetrfBatched(cublasHandle, matrixSize, matrixArray, matrixSize, pivotArray, infoArray, 1);
        if (cublasStatus == CUBLAS_STATUS_SUCCESS)
            printf("cuBLAS DGETRF SUCCESSFUL! \n");
        else
            printf("cuBLAS DGETRF UNSUCCESSFUL! \n");

        // cuSOLVER LU factorization
        cusolverStatus = cusolverDnCreate(&cusolverHandle);
        cusolverStatus = cusolverDnDgetrf(cusolverHandle, matrixSize, matrixSize, arrADev, matrixSize, workArray, pivotArray, infoArray);
        if (cusolverStatus == CUSOLVER_STATUS_SUCCESS)
            printf("cuSOLVER DGETRF SUCCESSFUL! \n");
        else
            printf("cuSOLVER DGETRF UNSUCCESSFUL! \n");

        return 0;
    }

The output from the code above is

    cuBLAS DGETRF SUCCESSFUL!
    cuSOLVER DGETRF SUCCESSFUL!

When I try to do the same with LAPACK (warning: more ugly code!):

    #include <iostream>
    #include <vector>

    using namespace std;

    extern "C" void dgetrf_(int* dim1, int* dim2, double* a, int* lda, int* ipiv, int* info);
    extern "C" void dgetrs_(char *TRANS, int *N, int *NRHS, double *A, int *LDA, int *IPIV, double *B, int *LDB, int *INFO );

    int main()
    {
       char trans = 'N';
       int dim = 5;
       int LDA = dim;
       int info;

       vector<double> a,b;

       a.push_back(2.0); a.push_back(4.0); a.push_back(1.0); a.push_back(-3.0); a.push_back(0.0);
       a.push_back(-1.0); a.push_back(-2.0); a.push_back(2.0); a.push_back(4.0); a.push_back(0.0);
       a.push_back(4.0); a.push_back(2.0); a.push_back(-3.0); a.push_back(5.0); a.push_back(0.0);
       a.push_back(5.0); a.push_back(-4.0); a.push_back(-3.0); a.push_back(1.0); a.push_back(0.0);
       a.push_back(0.0); a.push_back(0.0); a.push_back(0.0); a.push_back(0.0); a.push_back(0.0);

       int ipiv[5];
       dgetrf_(&dim, &dim, &*a.begin(), &LDA, ipiv, &info);
       if (info == 0)
           printf("dgetrf successful\n");
       else
           printf("dgetrf unsuccessful\n");

       return 0;
    }

The output I get is

    dgetrf unsuccessful

I understand that they are different libraries, but is this behaviour expected?


Solution

  • When I compile your CUDA code, I get a warning that the cusolver handle is being used before its value is set. You shouldn't ignore such warnings, because your usage in the sizing function is not correct. However that is not the problem here.

    I don't think there's any difference between your two test cases. You seem to be interpreting the results incorrectly.

    Looking at the netlib documentation, we see that an info value of 5 mean U(5,5) is zero, which would be problematic for future use. That doesn't mean the dgetrf factorization was successful or unsuccessful as you are printing out, but instead it means something about your input data. In fact the factorization was completed, as clearly indicated in the docs.

    Likewise, we get no information about that condition simply by looking at the function return value for the cusolver function. In order to discover information similar to what is being reported by lapack, its necessary to look at the infoArray values.

    With those changes, your codes are reporting the same thing (info value of 5):

    $ cat t1556.cu
        #include <time.h>
        #include <stdio.h>
        #include <string.h>
        #include <cuda_runtime.h>
        #include <cublas_v2.h>
        #include <cusolverDn.h>
    
        int main(int argc, char** argv){
    
            const int matrixSize = 5;
    
            int i, j;
            double arrA[matrixSize][matrixSize] = {
                {2.0, 4.0, 1.0, -3.0, 0.0},
                {-1.0, -2.0, 2.0, 4.0, 0.0},
                {4.0, 2.0, -3.0, 5.0, 0.0},
                {5.0, -4.0, -3.0, 1.0, 0.0},
                {0.0, 0.0, 0.0, 0.0, 0.0}
            };
    
            double *arrADev, *workArray;
            double **matrixArray;
            int *pivotArray;
            int *infoArray;
            double flat[matrixSize*matrixSize] = {0};
            cublasHandle_t cublasHandle;
            cublasStatus_t cublasStatus;
            cudaError_t error;
    
            cudaError cudaStatus;
            cusolverStatus_t cusolverStatus;
            cusolverDnHandle_t cusolverHandle;
    
            double *matrices[2];
    
    
            error = cudaMalloc(&arrADev,  sizeof(double) * matrixSize*matrixSize);
            if (error != cudaSuccess) fprintf(stderr,"\nError: %s\n",cudaGetErrorString(error));
    
            error = cudaMalloc(&matrixArray,  sizeof(double*) * 2);
            if (error != cudaSuccess) fprintf(stderr,"\nError: %s\n",cudaGetErrorString(error));
    
            error = cudaMalloc(&pivotArray,  sizeof(int) * matrixSize*matrixSize);
            if (error != cudaSuccess) fprintf(stderr,"\nError: %s\n",cudaGetErrorString(error));
    
            error = cudaMalloc(&infoArray,  sizeof(int) * matrixSize*matrixSize);
            if (error != cudaSuccess) fprintf(stderr,"\nError: %s\n",cudaGetErrorString(error));
    
            cublasStatus = cublasCreate(&cublasHandle);
            if (cublasStatus != CUBLAS_STATUS_SUCCESS) fprintf(stderr,"error %i\n",cublasStatus);
    
            //maps matrix to flat vector
            for(i=0; i<matrixSize; i++){
                for(j=0; j<matrixSize; j++){
                    flat[i+j*matrixSize] = arrA[i][j];
                }
            }
    
            //copy matrix A to device
            cublasStatus = cublasSetMatrix(matrixSize, matrixSize, sizeof(double), flat, matrixSize, arrADev, matrixSize);
            if (cublasStatus != CUBLAS_STATUS_SUCCESS) fprintf(stderr,"error %i\n",cublasStatus);
    
            //save matrix address
            matrices[0] = arrADev;
    
            //copy matrices references to device
            error = cudaMemcpy(matrixArray, matrices, sizeof(double*)*1, cudaMemcpyHostToDevice);
            if (error != cudaSuccess) fprintf(stderr,"\nError: %s\n",cudaGetErrorString(error));
    
            int Lwork;
            // calculate buffer size for cuSOLVER LU factorization
            cusolverStatus = cusolverDnCreate(&cusolverHandle);
            cusolverStatus = cusolverDnDgetrf_bufferSize(cusolverHandle, matrixSize, matrixSize, arrADev, matrixSize, &Lwork);
            cudaStatus = cudaMalloc((void**)&workArray, Lwork*sizeof(double));
    
            // cuBLAS LU factorization
            cublasStatus = cublasDgetrfBatched(cublasHandle, matrixSize, matrixArray, matrixSize, pivotArray, infoArray, 1);
            if (cublasStatus == CUBLAS_STATUS_SUCCESS)
                printf("cuBLAS DGETRF SUCCESSFUL! \n");
            else
                printf("cuBLAS DGETRF UNSUCCESSFUL! \n");
    
            // cuSOLVER LU factorization
            cusolverStatus = cusolverDnDgetrf(cusolverHandle, matrixSize, matrixSize, arrADev, matrixSize, workArray, pivotArray, infoArray);
            if (cusolverStatus == CUSOLVER_STATUS_SUCCESS)
                printf("cuSOLVER DGETRF SUCCESSFUL! \n");
            else
                printf("cuSOLVER DGETRF UNSUCCESSFUL! \n");
            int *hinfoArray = (int *)malloc(matrixSize*matrixSize*sizeof(int));
            cudaMemcpy(hinfoArray, infoArray, matrixSize*matrixSize*sizeof(int), cudaMemcpyDeviceToHost);
            for (int i = 0; i < matrixSize*matrixSize; i++) printf("%d,", hinfoArray[i]);
            printf("\n");
            return 0;
        }
    $ nvcc -o t1556 t1556.cu -lcublas -lcusolver
    t1556.cu(30): warning: variable "cudaStatus" was set but never used
    
    $ ./t1556
    cuBLAS DGETRF SUCCESSFUL!
    cuSOLVER DGETRF SUCCESSFUL!
    5,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
    $ cat t1557.cpp
        #include <iostream>
        #include <vector>
        #include <lapacke/lapacke.h>
        using namespace std;
    
    //    extern "C" void dgetrf_(int* dim1, int* dim2, double* a, int* lda, int* ipiv, int* info);
    //    extern "C" void dgetrs_(char *TRANS, int *N, int *NRHS, double *A, int *LDA, int *IPIV, double *B, int *LDB, int *INFO );
    
        int main()
        {
           char trans = 'N';
           int dim = 5;
           int LDA = dim;
           int info;
    
           vector<double> a,b;
    
           a.push_back(2.0); a.push_back(4.0); a.push_back(1.0); a.push_back(-3.0); a.push_back(0.0);
           a.push_back(-1.0); a.push_back(-2.0); a.push_back(2.0); a.push_back(4.0); a.push_back(0.0);
           a.push_back(4.0); a.push_back(2.0); a.push_back(-3.0); a.push_back(5.0); a.push_back(0.0);
           a.push_back(5.0); a.push_back(-4.0); a.push_back(-3.0); a.push_back(1.0); a.push_back(0.0);
           a.push_back(0.0); a.push_back(0.0); a.push_back(0.0); a.push_back(0.0); a.push_back(0.0);
    
           int ipiv[5];
           LAPACK_dgetrf(&dim, &dim, &*a.begin(), &LDA, ipiv, &info);
           printf("info = %d\n", info);
           if (info == 0)
               printf("dgetrf successful\n");
           else
               printf("dgetrf unsuccessful\n");
    
           return 0;
        }
    $ g++ t1557.cpp -o t1557 -llapack
    $ ./t1557
    info = 5
    dgetrf unsuccessful
    $
    

    I'm using the lapack installed by centOS.

    centOS 7, CUDA 10.1.243, Tesla V100.