Search code examples
cudacublas

How to format the A matrix for CUBLAS routine cublasdtbsv?


I'm new to using the Cuda libraries and would like to solve symmetric banded matrix equation. I found sample code to solve this using LU Factorization. I am now trying to use cudaBlas routine cublasdtbsv to solve the equations. I was not able to find sample code for this function and have put together my own solution. The problem that I believe I'm having is that I do not understand the correct way to input the A matrix for this routine. Here is my sample code for a very simple 3x3 matrix with one right hand side. It includes the correct solution using LU Factorization and my attempt to use cublasdtbsv routine:

#include <stdio.h>
#include <stdlib.h>
#include <assert.h>
#include <cuda_runtime.h>
#include <cusolverDn.h>

void test();
void printMatrix2(int m, int n, const double* A, int lda, const char* name);
int LUFactorizationSolver2();
int TriBandedSymSolver2();

int main(int argc, char* argv[])
{
    test();

    return 0;

}

void test()
{
    LUFactorizationSolver2();
    TriBandedSymSolver2();

    return;
}

int TriBandedSymSolver2()
{
    printf("\n****      example of cublasDtbsv \n\n");

    const int n = 3;
    const int ldm = n;
    const int k = 1;// n - 1;
    const int lda = n;
    const int nrhs = 1;
    const int incx = 1;

    double M[ldm * n] = { 1.0, 0.0, 0.0
                        , 0.0, 2.0, 3.0
                        , 0.0, 3.0, 4.0
                        };

    double A[lda * n] = { 1.0, 2.0, 4.0
                        , 0.0, 3.0, 0.0
                        , 0.0, 0.0, 0.0
                        };

    double x[n * nrhs] = { 00.0
                         , 40.0
                         , 00.0
                         };

    cublasHandle_t cublasHandle = NULL;
    cudaStream_t stream = NULL;

    cublasStatus_t cublasStatus = CUBLAS_STATUS_SUCCESS;
    cudaError_t cudaStat1 = cudaSuccess;
    cudaError_t cudaStat2 = cudaSuccess;
    cudaError_t cudaStat3 = cudaSuccess;
    cudaError_t cudaStat4 = cudaSuccess;

    double* d_A = NULL; /* device copy of A */
    double* d_x = NULL; /* device copy of x */

    printf("example of tbsv \n");

    printf("A = (matlab base-1)\n");
    printMatrix2(n, n, A, lda, "A");
    printf("=====\n");

    printf("x (b) = (matlab base-1)\n");
    printMatrix2(n, nrhs, x, nrhs, "x");
    printf("=====\n");

    /* step 1: create cusolver handle, bind a stream */
    cublasStatus = cublasCreate(&cublasHandle);
    assert(CUBLAS_STATUS_SUCCESS == cublasStatus);

    /* step 2: copy A to device */
    cudaStat1 = cudaMalloc((void**)&d_A, sizeof(double) * lda * n);
    cudaStat2 = cudaMalloc((void**)&d_x, sizeof(double) * n * nrhs);
    assert(cudaSuccess == cudaStat1);
    assert(cudaSuccess == cudaStat2);

    cudaStat1 = cudaMemcpy(d_A, A, sizeof(double) * lda * n, cudaMemcpyHostToDevice);
    cudaStat2 = cudaMemcpy(d_x, x, sizeof(double) * n * nrhs, cudaMemcpyHostToDevice);
    assert(cudaSuccess == cudaStat1);
    assert(cudaSuccess == cudaStat2);

    /*
     * step 5: solve A*x = b
     *
     */
    cublasStatus = cublasDtbsv(cublasHandle
        , CUBLAS_FILL_MODE_LOWER
        , CUBLAS_OP_N
        , CUBLAS_DIAG_NON_UNIT
        , n
        , k
        , d_A
        , lda
        , d_x
        , incx
    );
    cudaStat1 = cudaDeviceSynchronize();
    assert(CUBLAS_STATUS_SUCCESS == cublasStatus);

    cudaStat1 = cudaMemcpy(x, d_x, sizeof(double) * n * nrhs, cudaMemcpyDeviceToHost);
    assert(cudaSuccess == cudaStat1);

    printf("X = (matlab base-1)\n");
    printMatrix2(n, nrhs, x, nrhs, "x");
    printf("=====\n");

    /* free resources */
    if (d_A) cudaFree(d_A);
    if (d_x) cudaFree(d_x);

    if (cublasHandle) cublasDestroy(cublasHandle);
    if (stream) cudaStreamDestroy(stream);

    cudaDeviceReset();

    return 0;

}

int LUFactorizationSolver2()
{
    printf("\n****      example of cusolverDnDgetrs \n\n");

    cusolverDnHandle_t cusolverH = NULL;
    cudaStream_t stream = NULL;

    cusolverStatus_t status = CUSOLVER_STATUS_SUCCESS;
    cudaError_t cudaStat1 = cudaSuccess;
    cudaError_t cudaStat2 = cudaSuccess;
    cudaError_t cudaStat3 = cudaSuccess;
    cudaError_t cudaStat4 = cudaSuccess;

    const int m = 3;
    const int lda = m;
    const int nrhs = 1; // number of right-hand sides
    const int ldb = m;


    double A[lda * m] =  { 1.0, 0.0, 0.0
                         , 0.0, 2.0, 3.0
                         , 0.0, 3.0, 4.0
                         };

    double B[m * nrhs] = { 00.0
                         , 40.0
                         , 00.0
                         };
    double X[m * nrhs]; /* X = A\B */
    double LU[lda * m]; /* L and U */
    int Ipiv[m];      /* host copy of pivoting sequence */
    int info = 0;     /* host copy of error info */

    double* d_A = NULL; /* device copy of A */
    double* d_B = NULL; /* device copy of B */
    int* d_Ipiv = NULL; /* pivoting sequence */
    int* d_info = NULL; /* error info */
    int  lwork = 0;     /* size of workspace */
    double* d_work = NULL; /* device workspace for getrf */

    const int pivot_on = 0; // 1;

    if (pivot_on) {
        printf("pivot is on : compute P*A = L*U \n");
    }
    else {
        printf("pivot is off: compute A = L*U (not numerically stable)\n");
    }

    printf("A = (matlab base-1)\n");
    printMatrix2(m, m, A, lda, "A");
    printf("=====\n");

    printf("B = (matlab base-1)\n");
    printMatrix2(m, nrhs, B, ldb, "B");
    printf("=====\n");

    /* step 1: create cusolver handle, bind a stream */
    status = cusolverDnCreate(&cusolverH);
    assert(CUSOLVER_STATUS_SUCCESS == status);

    cudaStat1 = cudaStreamCreateWithFlags(&stream, cudaStreamNonBlocking);
    assert(cudaSuccess == cudaStat1);

    status = cusolverDnSetStream(cusolverH, stream);
    assert(CUSOLVER_STATUS_SUCCESS == status);

    /* step 2: copy A to device */
    cudaStat1 = cudaMalloc((void**)&d_A, sizeof(double) * lda * m);
    cudaStat2 = cudaMalloc((void**)&d_B, sizeof(double) * m * nrhs);
    cudaStat2 = cudaMalloc((void**)&d_Ipiv, sizeof(int) * m);
    cudaStat4 = cudaMalloc((void**)&d_info, sizeof(int));
    assert(cudaSuccess == cudaStat1);
    assert(cudaSuccess == cudaStat2);
    assert(cudaSuccess == cudaStat3);
    assert(cudaSuccess == cudaStat4);

    cudaStat1 = cudaMemcpy(d_A, A, sizeof(double) * lda * m, cudaMemcpyHostToDevice);
    cudaStat2 = cudaMemcpy(d_B, B, sizeof(double) * m * nrhs, cudaMemcpyHostToDevice);
    assert(cudaSuccess == cudaStat1);
    assert(cudaSuccess == cudaStat2);

    /* step 3: query working space of getrf */
    status = cusolverDnDgetrf_bufferSize(
        cusolverH,
        m,
        m,
        d_A,
        lda,
        &lwork);
    assert(CUSOLVER_STATUS_SUCCESS == status);

    cudaStat1 = cudaMalloc((void**)&d_work, sizeof(double) * lwork);
    assert(cudaSuccess == cudaStat1);

    /* step 4: LU factorization */
    if (pivot_on) {
        status = cusolverDnDgetrf(
            cusolverH,
            m,
            m,
            d_A,
            lda,
            d_work,
            d_Ipiv,
            d_info);
    }
    else {
        status = cusolverDnDgetrf(
            cusolverH,
            m,
            m,
            d_A,
            lda,
            d_work,
            NULL,
            d_info);
    }
    cudaStat1 = cudaDeviceSynchronize();
    assert(CUSOLVER_STATUS_SUCCESS == status);
    assert(cudaSuccess == cudaStat1);

    if (pivot_on) {
        cudaStat1 = cudaMemcpy(Ipiv, d_Ipiv, sizeof(int) * m, cudaMemcpyDeviceToHost);
    }
    cudaStat2 = cudaMemcpy(LU, d_A, sizeof(double) * lda * m, cudaMemcpyDeviceToHost);
    cudaStat3 = cudaMemcpy(&info, d_info, sizeof(int), cudaMemcpyDeviceToHost);
    assert(cudaSuccess == cudaStat1);
    assert(cudaSuccess == cudaStat2);
    assert(cudaSuccess == cudaStat3);

    if (0 > info) {
        printf("%d-th parameter is wrong \n", -info);
        exit(1);
    }
    if (pivot_on) {
        printf("pivoting sequence, matlab base-1\n");
        for (int j = 0; j < m; j++) {
            printf("Ipiv(%d) = %d\n", j + 1, Ipiv[j]);
        }
    }
    printf("L and U = (matlab base-1)\n");
    printMatrix2(m, m, LU, lda, "LU");
    printf("=====\n");

    /*
     * step 5: solve A*X = B
     *
     */
    if (pivot_on) {
        status = cusolverDnDgetrs(
            cusolverH,
            CUBLAS_OP_N,
            m,
            nrhs, /* nrhs */
            d_A,
            lda,
            d_Ipiv,
            d_B,
            ldb,
            d_info);
    }
    else {
        status = cusolverDnDgetrs(
            cusolverH,
            CUBLAS_OP_N,
            m,
            nrhs, /* nrhs */
            d_A,
            lda,
            NULL,
            d_B,
            ldb,
            d_info);
    }
    cudaStat1 = cudaDeviceSynchronize();
    assert(CUSOLVER_STATUS_SUCCESS == status);
    assert(cudaSuccess == cudaStat1);

    cudaStat1 = cudaMemcpy(X, d_B, sizeof(double) * m * nrhs, cudaMemcpyDeviceToHost);
    assert(cudaSuccess == cudaStat1);

    printf("X = (matlab base-1)\n");
    printMatrix2(m, nrhs, X, ldb, "X");
    printf("=====\n");

    /* free resources */
    if (d_A) cudaFree(d_A);
    if (d_B) cudaFree(d_B);
    if (d_Ipiv) cudaFree(d_Ipiv);
    if (d_info) cudaFree(d_info);
    if (d_work) cudaFree(d_work);

    if (cusolverH) cusolverDnDestroy(cusolverH);
    if (stream) cudaStreamDestroy(stream);

    cudaDeviceReset();

    return 0;

}

void printMatrix2(int m, int n, const double* A, int lda, const char* name)
{
    printf("%18s", "");
    for (int col = 0; col < n; col++) { printf("%7s(*,%2d)       ", name, col + 1); }
    printf("\n");

    for (int row = 0; row < m; row++) {
        printf("%4s(%2d,*) = ", name, row + 1);
        for (int col = 0; col < n; col++) {
            double Areg = A[row + col * lda];
            printf("%20.9f", Areg);
        }
        printf("\n");
    }
    return;
}

The correct answer should be:

   0.00
-160.00
 120.00

But I get:

 0.000000000
         inf
        -inf

I'm developing this on Windows 10 using Visual Studio 2019.

What am I missing or can someone point me to a working sample for the cublasDtbsv routine?


Solution

  • CUBLAS tbsv is a banded triangular solver. It expects your M matrix to be banded and triangular. If you would like to see what that looks like, this is a good reference.

    Your M matrix is not triangular. A triangular matrix would either have the upper triangular part (not including main diagonal) or lower triangular part (not including main diagonal) as all zeros. Your M matrix does not fit that definition.

    A banded triangular matrix M might look like this:

        | 2.0 0.0 0.0 |
    M = | 1.0 1.0 0.0 |
        | 0.0 1.0 1.0 |
    

    Let's use that as our example, with a RHS of | 2.0 2.0 2.0 |, and use the suggestions given for A matrix formatting here. In that case our A matrix would look like:

    A = | 2.0 1.0 1.0 |    (the main diagonal of M)
        | 1.0 1.0 0.0 |    (the first sub-diagonal of M)
    

    In that case our A matrix has 2 rows, and therefore the leading dimension of A is given by lda = 2

    If we put all of that into your test framework, it seems to give the correct result:

    $ cat t158.cu
    #include <stdio.h>
    #include <stdlib.h>
    #include <assert.h>
    #include <cuda_runtime.h>
    #include <cublas_v2.h>
    
    void printMatrix2(int m, int n, const double* A, int lda, const char* name)
    {
        printf("%18s", "");
        for (int col = 0; col < n; col++) { printf("%7s(*,%2d)       ", name, col + 1); }
        printf("\n");
    
        for (int row = 0; row < m; row++) {
            printf("%4s(%2d,*) = ", name, row + 1);
            for (int col = 0; col < n; col++) {
                double Areg = A[row + col * lda];
                printf("%20.9f", Areg);
            }
            printf("\n");
        }
        return;
    }
    
    
    
    int main(int argc, char* argv[])
    {
        printf("\n****      example of cublasDtbsv \n\n");
    
        const int n = 3;
    //    const int ldm = n;
        const int k = 1;
        const int lda = k+1;
        const int nrhs = 1;
        const int incx = 1;
    /*
        double M[ldm * n] = { 2.0, 0.0, 0.0
                            , 1.0, 1.0, 0.0
                            , 0.0, 1.0, 1.0
                            };
    */
        double A[lda * n] = { 2.0, 1.0, 1.0
                            , 1.0, 1.0, 0.0
                            };
    
        double x[n * nrhs] = { 2.0
                             , 2.0
                             , 2.0
                             };
    
        cublasHandle_t cublasHandle = NULL;
    
        cublasStatus_t cublasStatus = CUBLAS_STATUS_SUCCESS;
        cudaError_t cudaStat1 = cudaSuccess;
        cudaError_t cudaStat2 = cudaSuccess;
    
        double* d_A = NULL; /* device copy of A */
        double* d_x = NULL; /* device copy of x */
    
        printf("example of tbsv \n");
    
        printf("A = (matlab base-1)\n");
        printMatrix2(n, n, A, lda, "A");
        printf("=====\n");
    
        printf("x (b) = (matlab base-1)\n");
        printMatrix2(n, nrhs, x, nrhs, "x");
        printf("=====\n");
    
        /* step 1: create cublas handle */
        cublasStatus = cublasCreate(&cublasHandle);
        assert(CUBLAS_STATUS_SUCCESS == cublasStatus);
    
        /* step 2: copy A to device */
        cudaStat1 = cudaMalloc((void**)&d_A, sizeof(double) * lda * n);
        cudaStat2 = cudaMalloc((void**)&d_x, sizeof(double) * n * nrhs);
        assert(cudaSuccess == cudaStat1);
        assert(cudaSuccess == cudaStat2);
    
        cudaStat1 = cudaMemcpy(d_A, A, sizeof(double) * lda * n, cudaMemcpyHostToDevice);
        cudaStat2 = cudaMemcpy(d_x, x, sizeof(double) * n * nrhs, cudaMemcpyHostToDevice);
        assert(cudaSuccess == cudaStat1);
        assert(cudaSuccess == cudaStat2);
    
        /*
         * step 5: solve A*x = b
         *
         */
        cublasStatus = cublasDtbsv(cublasHandle
            , CUBLAS_FILL_MODE_LOWER
            , CUBLAS_OP_N
            , CUBLAS_DIAG_NON_UNIT
            , n
            , k
            , d_A
            , lda
            , d_x
            , incx
        );
        cudaStat1 = cudaDeviceSynchronize();
        assert(CUBLAS_STATUS_SUCCESS == cublasStatus);
    
        cudaStat1 = cudaMemcpy(x, d_x, sizeof(double) * n * nrhs, cudaMemcpyDeviceToHost);
        assert(cudaSuccess == cudaStat1);
    
        printf("X = (matlab base-1)\n");
        printMatrix2(n, nrhs, x, nrhs, "x");
        printf("=====\n");
        /* free resources */
        if (d_A) cudaFree(d_A);
        if (d_x) cudaFree(d_x);
    
        if (cublasHandle) cublasDestroy(cublasHandle);
    
        return 0;
    }
    $ nvcc -o t158 t158.cu -lcublas
    $ ./t158
    
    ****      example of cublasDtbsv
    
    example of tbsv
    A = (matlab base-1)
                            A(*, 1)             A(*, 2)             A(*, 3)
       A( 1,*) =          2.000000000         1.000000000         1.000000000
       A( 2,*) =          1.000000000         1.000000000         0.000000000
       A( 3,*) =          1.000000000         1.000000000         0.000000000
    =====
    x (b) = (matlab base-1)
                            x(*, 1)
       x( 1,*) =          2.000000000
       x( 2,*) =          2.000000000
       x( 3,*) =          2.000000000
    =====
    X = (matlab base-1)
                            x(*, 1)
       x( 1,*) =          1.000000000
       x( 2,*) =          1.000000000
       x( 3,*) =          1.000000000
    =====
    $