Search code examples
c++ cuda lapack least-squares cusolver

Testing CUDA 11 cusolverDnDSgels()


I am trying to make sense of the cusolverDnDSgels function. If I run it with a simple 3x3 example, as in the docs, it works, but when I run it with my own data d_info returns -1. According to the docs, d_info = -i means that the i-th argument is not valid.

Below I posted the code with 3 by 3 and 4 by 3 matrices, where the former works and the latter doesn't.

As a reference I used this web site calculator https://adrianstoll.com/linear-algebra/least-squares.html

#include <stdio.h>
#include <stdlib.h>
#include <assert.h>

#include <cuda_runtime.h>
#include <cusolverDn.h>


/*
 * Print an m-by-n matrix one entry per line in the form
 * "<name>(<row>,<col>) = <value>", using 1-based row/column numbers.
 *
 * Entry (i, j) is read from A[i + j * lda]; entries are printed row by row.
 */
void printMatrix(int m, int n, const double* A, int lda, const char* name)
{
    for (int i = 0; i < m; ++i) {
        for (int j = 0; j < n; ++j) {
            printf("%s(%d,%d) = %f\n", name, i + 1, j + 1, A[i + j * lda]);
        }
    }
}

/*
 * Least-squares demo: solves A * X = B on the GPU with cusolverDnDSgels()
 * and prints the info value reported by the solver plus the solution X.
 *
 * A is m-by-n and stored column by column with leading dimension lda,
 * B is m-by-nrhs (leading dimension ldb) and X is n-by-nrhs (leading
 * dimension ldx).
 *
 * Every CUDA / cuSOLVER call is checked with assert(), so the checks are
 * only active when the program is built without NDEBUG.
 */
int main(int argc, char*argv[])
{
    // 3x3 example works fine
    int m = 3;
    int n = 3;
    double A[9] = { 1.0, 4.0, 2.0, 2.0, 5.0, 1.0, 3.0, 6.0, 1.0 };
    double B[3] = { 6.0, 15.0, 4.0 };

    // 4x3 example d_info/info_gpu returns -1
    //int m = 4;
    //int n = 3;
    //double A[12] = { 1.0, 4.0, 2.0, 2.0, 5.0, 1.0, 3.0, 6.0, 1.0, 5.0, 1.0, 2.0 };
    //double B[4] = { 6.0, 15.0, 4.0, 5.0 };

    // Host copy of the solution; sized for n == 3 and nrhs == 1.
    double X[3];

    int lda = m;
    int ldb = m;
    int ldx = n;
    int nrhs = 1;
    int niter = 0;      // iteration count reported by the solver
    int info_gpu = 0;   // host copy of the device-side info value
    size_t lwork = 0;   // workspace size in bytes

    double *d_A = NULL;
    double *d_B = NULL;
    double *d_X = NULL;
    double *d_work = NULL;
    int* d_info = NULL;

    cusolverDnHandle_t cusolverH = NULL;
    cudaError_t cudaStat = cudaSuccess;
    cusolverStatus_t cusolver_status = CUSOLVER_STATUS_SUCCESS;

    cusolver_status = cusolverDnCreate(&cusolverH);
    assert(CUSOLVER_STATUS_SUCCESS == cusolver_status);

    // Allocate space in the GPU
    cudaStat = cudaMalloc((void**)&d_A, sizeof(double) * m * n);
    assert(cudaSuccess == cudaStat);

    cudaStat = cudaMalloc((void**)&d_B, sizeof(double) * m * nrhs);
    assert(cudaSuccess == cudaStat);

    cudaStat = cudaMalloc((void**)&d_X, sizeof(double) * n * nrhs);
    assert(cudaSuccess == cudaStat);

    cudaStat = cudaMalloc((void**)&d_info, sizeof(int));
    assert(cudaSuccess == cudaStat);

    // Copy matrices into GPU space
    cudaStat = cudaMemcpy(d_A, A, sizeof(double) * m * n, cudaMemcpyHostToDevice);
    assert(cudaSuccess == cudaStat);
    cudaStat = cudaMemcpy(d_B, B, sizeof(double) * m * nrhs, cudaMemcpyHostToDevice);
    assert(cudaSuccess == cudaStat);

    // Get work buffer size (d_work is still NULL here; only lwork is filled in)
    cusolver_status = cusolverDnDSgels_bufferSize(cusolverH, m, n, nrhs, d_A, lda, d_B, ldb, d_X, ldx, d_work, &lwork);
    assert(CUSOLVER_STATUS_SUCCESS == cusolver_status);

    // Allocate workspace. The size reported by the bufferSize call is already
    // expressed in bytes, so it must not be scaled by an element size.
    cudaStat = cudaMalloc((void**)&d_work, lwork);
    assert(cudaSuccess == cudaStat);

    // Run solver
    cusolver_status = cusolverDnDSgels(cusolverH, m, n, nrhs, d_A, lda, d_B, ldb, d_X, ldx, d_work, lwork, &niter, d_info);

    // Sync threads
    cudaStat = cudaDeviceSynchronize();
    assert(cudaSuccess == cudaStat);

    // Copy GPU info
    cudaStat = cudaMemcpy(&info_gpu, d_info, sizeof(int), cudaMemcpyDeviceToHost);
    assert(cudaSuccess == cudaStat);

    // Get solved data
    cudaStat = cudaMemcpy(X, d_X, sizeof(double) * n * nrhs, cudaMemcpyDeviceToHost);
    assert(cudaSuccess == cudaStat);

    // Report info and the solution before checking the solver status, so the
    // diagnostic output is still produced when the solve failed.
    printf("after DSgels: info_gpu = %d\n", info_gpu);
    printMatrix(n, nrhs, X, ldx, "X");

    assert(CUSOLVER_STATUS_SUCCESS == cusolver_status);

    if (d_A) cudaFree(d_A);
    if (d_B) cudaFree(d_B);
    if (d_X) cudaFree(d_X);
    if (d_info) cudaFree(d_info);
    if (d_work) cudaFree(d_work);
    if (cusolverH) cusolverDnDestroy(cusolverH);
    cudaDeviceReset();
    return 0;
}

Solution

  • Unfortunately, an inconsistency in a cuSolver setting is causing this issue. You can avoid it by calling the expert API functions "cusolverDnIRSXgels" and "cusolverDnIRSXgels_bufferSize", which give the user more control.

    Thus in your code replace

        // Get work buffer size
        cusolver_status = cusolverDnDDgels_bufferSize(cusolverH, m, n, nrhs, d_A, lda, d_B, ldb, d_X, ldx, d_work, &lwork);
        assert(CUSOLVER_STATUS_SUCCESS == cusolver_status);
    
        // Allocate workspace (lwork is handed to cudaMalloc as-is, i.e. as a byte count)
        cudaStat = cudaMalloc((void**)&d_work, lwork);
        assert(cudaSuccess == cudaStat);
    
        // Run solver
        cusolver_status = cusolverDnDDgels(cusolverH, m, n, nrhs, d_A, lda, d_B, ldb, d_X, ldx, d_work, lwork, &niter, d_info);
        // Print the raw status code returned by the solver call
        printf("gels status: %d\n", int(cusolver_status));
    

    by

        // create the params and info structure for the expert interface
        cusolverDnIRSParams_t gels_irs_params;
        cusolverDnIRSParamsCreate( &gels_irs_params );
        cusolverDnIRSInfos_t gels_irs_infos;
        cusolverDnIRSInfosCreate( &gels_irs_infos );
    
        // Set the main and the low precision of the solver DSgels 
        // D is for double S for single precision thus 
        // main_precision is CUSOLVER_R_FP64, low_precision is CUSOLVER_R_FP32
        cusolverDnIRSParamsSetSolverPrecisions( gels_irs_params, CUSOLVER_R_64F, CUSOLVER_R_32F );
        // Set the refinement solver.
        cusolverDnIRSParamsSetRefinementSolver( gels_irs_params, CUSOLVER_IRS_REFINE_CLASSICAL );
        // Get work buffer size
        cusolver_status = cusolverDnIRSXgels_bufferSize(cusolverH, gels_irs_params, m, n, nrhs, &lwork);
        assert(CUSOLVER_STATUS_SUCCESS == cusolver_status);
        // Allocate workspace
        cudaStat = cudaMalloc((void**)&d_work, lwork);
        assert(cudaSuccess == cudaStat);
        // Run solver
        cusolver_status = cusolverDnIRSXgels(cusolverH, gels_irs_params, gels_irs_infos, m, n, nrhs, (void *)d_A, lda, (void *)d_B, ldb, (void *)d_X, ldx, d_work, lwork, &niter, d_info);
        printf("gels status: %d\n", int(cusolver_status));
    

    Also note that when m>n the system of equations is overdetermined, so in general you cannot pick an arbitrary RHS and expect an exact SOL to exist. For your test it is better to choose a SOL, generate RHS=A*SOL, then solve using that RHS and compare the result to SOL.

    Also note that LDX should be >= max(m,n)

    I modified your code by the following:

    #include <stdio.h>
    #include <stdlib.h>
    #include <assert.h>
    
    #include <cuda_runtime.h>
    #include <cusolverDn.h>
    
    
    #define USE_BUG
    typedef double mt;
    
    #ifndef max
    #define max(a, b) ((a) > (b) ? (a) : (b))
    #endif
    
    /*
     * Host reference product B = A * X.
     *
     *   A : m-by-n,    element (row, col) at A[row + col * lda]
     *   X : n-by-nrhs, element (col, r)   at X[col + r * ldx]
     *   B : m-by-nrhs, element (row, r)   at B[row + r * ldb]  (overwritten)
     *
     * Each output entry is accumulated in a scalar, walking the columns of A
     * in increasing order. No variable-length array is needed, so the function
     * is valid standard C++ and is a no-op when m or nrhs is 0.
     */
    void matvec(int m, int n, int nrhs, const mt* A, int lda, mt *X, int ldx, mt *B, int ldb)
    {
        for (int row = 0; row < m; row++) {
            for (int r = 0; r < nrhs; r++) {
                mt acc = 0.0;
                for (int col = 0; col < n; col++) {
                    acc += A[row + col * lda] * X[col + r * ldx];
                }
                B[row + r * ldb] = acc;
            }
        }
    }
    
    /*
     * Return the largest absolute entry-wise difference between the reference
     * solution and the computed solution.
     *
     *   ref : n-by-nrhs, element (i, r) at ref[i + r * ldr]
     *   X   : n-by-nrhs, element (i, r) at X[i + r * ldx]
     *
     * Each matrix is indexed with its own leading dimension. The absolute
     * value is computed by hand so the result never depends on which abs()
     * overload is visible (an integer overload would truncate the error).
     * A NaN difference is sticky: once seen, NaN is returned.
     */
    mt check_solution(int n, int nrhs, mt *ref, int ldr, mt *X, int ldx)
    {
        mt error = 0.0;
        for (int r = 0; r < nrhs; r++) {
            for (int i = 0; i < n; i++) {
                mt diff = ref[i + r * ldr] - X[i + r * ldx];
                if (diff < 0.0) diff = -diff;
                // diff != diff is true only for NaN; comparisons against a
                // NaN error are always false, so NaN is kept once stored.
                if (diff > error || diff != diff) error = diff;
            }
        }
        return error;
    }
    
    
    /*
     * Print an m-by-n matrix one entry per line in the form
     * "<name>(<row>,<col>) = <value>", using 1-based row/column numbers.
     *
     * Entry (i, j) is read from A[i + j * lda]; entries are printed row by row.
     */
    void printMatrix(int m, int n, const mt* A, int lda, const char* name)
    {
        for (int i = 0; i < m; ++i) {
            for (int j = 0; j < n; ++j) {
                printf("%s(%d,%d) = %f\n", name, i + 1, j + 1, A[i + j * lda]);
            }
        }
    }
    
    
    
    
    
    /*
     * Least-squares test driver.
     *
     * Picks a known solution `sol`, builds the right-hand side B = A * sol on
     * the host, solves A * X = B on the GPU and reports the solver status,
     * the info value, the iteration count and the maximum absolute error of X
     * against `sol`.
     *
     * The `#if 1` branch uses the expert interface (cusolverDnIRSXgels)
     * configured with a double main precision and a single low precision;
     * the `#else` branch keeps the plain cusolverDnDDgels call for comparison.
     *
     * The host arrays hold a single right-hand side (nrhs == 1); the
     * contiguous host<->device copies below rely on that.
     *
     * CUDA / cuSOLVER calls are checked with assert(), so the checks are only
     * active when the program is built without NDEBUG.
     */
    int main(int argc, char*argv[])
    {
    #ifndef USE_BUG
            // 3x3 example works fine
        const int m = 3;
        const int n = 3;
        mt A[m*n] = { 1.0, 4.0, 2.0, 2.0, 5.0, 1.0, 3.0, 6.0, 1.0 };
        mt sol[n] = { 6.0, 15.0, 4.0 };
    #else
        // 4x3 example d_info/info_gpu returns -1
        const int m = 4;
        const int n = 3;
        mt A[m*n] = { 1.0, 4.0, 2.0, 2.0, 5.0, 1.0, 3.0, 6.0, 1.0, 5.0, 1.0, 2.0 };
        mt sol[n] =   { 6.0, 15.0, 4.0 };
    #endif
        mt X[n];
        mt B[m];
    
        int lda = m;
        int ldb = max(m,n);
        int ldx = max(m,n);
        int nrhs = 1;
        int niter = 0;      // iteration count reported by the solver
        int info_gpu = 0;   // host copy of the device-side info value
        size_t lwork = 0;   // workspace size in bytes
    
        mt *d_A = NULL;
        mt *d_B = NULL;
        mt *d_X = NULL;
        mt *d_work = NULL;
        int* d_info = NULL;
    
        // Expert-interface handles; they stay NULL when the plain API branch
        // is compiled, which lets the cleanup code below be unconditional.
        cusolverDnIRSParams_t gels_irs_params = NULL;
        cusolverDnIRSInfos_t gels_irs_infos = NULL;
    
        // compute B = A*sol
        matvec(m, n, nrhs, A, lda, sol, ldx, B, ldb);
    
        cusolverDnHandle_t cusolverH = NULL;
        cudaError_t cudaStat = cudaSuccess;
        cusolverStatus_t cusolver_status = CUSOLVER_STATUS_SUCCESS;
    
        cusolver_status = cusolverDnCreate(&cusolverH);
        assert(CUSOLVER_STATUS_SUCCESS == cusolver_status);
    
        // Allocate space in the GPU. B and X are sized from their leading
        // dimensions (ldb/ldx rows per column), not from m/n, so the buffers
        // match the layout that is described to the solver.
        cudaStat = cudaMalloc((void**)&d_A, sizeof(mt) * lda * n);
        assert(cudaSuccess == cudaStat);
    
        cudaStat = cudaMalloc((void**)&d_B, sizeof(mt) * ldb * nrhs);
        assert(cudaSuccess == cudaStat);
    
        cudaStat = cudaMalloc((void**)&d_X, sizeof(mt) * ldx * nrhs);
        assert(cudaSuccess == cudaStat);
    
        cudaStat = cudaMalloc((void**)&d_info, sizeof(int));
        assert(cudaSuccess == cudaStat);
    
        // Zero the padded buffers so rows beyond m (for B) or n (for X) never
        // hold uninitialized data.
        cudaStat = cudaMemset(d_B, 0, sizeof(mt) * ldb * nrhs);
        assert(cudaSuccess == cudaStat);
        cudaStat = cudaMemset(d_X, 0, sizeof(mt) * ldx * nrhs);
        assert(cudaSuccess == cudaStat);
    
        // Copy matrices into GPU space
        cudaStat = cudaMemcpy(d_A, A, sizeof(mt) * m * n, cudaMemcpyHostToDevice);
        assert(cudaSuccess == cudaStat);
        cudaStat = cudaMemcpy(d_B, B, sizeof(mt) * m * nrhs, cudaMemcpyHostToDevice);
        assert(cudaSuccess == cudaStat);
    
        #if 1
        // =======================================================
        // create the params and infos structures for the expert interface
        cusolver_status = cusolverDnIRSParamsCreate( &gels_irs_params );
        assert(CUSOLVER_STATUS_SUCCESS == cusolver_status);
        cusolver_status = cusolverDnIRSInfosCreate( &gels_irs_infos );
        assert(CUSOLVER_STATUS_SUCCESS == cusolver_status);
    
        // Set the main and the low precision of the solver to mimic DSgels:
        // D is for double and S is for single precision, thus
        // the main precision is CUSOLVER_R_64F and the low precision is CUSOLVER_R_32F
        cusolver_status = cusolverDnIRSParamsSetSolverPrecisions( gels_irs_params, CUSOLVER_R_64F, CUSOLVER_R_32F );
        assert(CUSOLVER_STATUS_SUCCESS == cusolver_status);
        // Set the refinement solver.
        cusolver_status = cusolverDnIRSParamsSetRefinementSolver( gels_irs_params, CUSOLVER_IRS_REFINE_CLASSICAL );
        assert(CUSOLVER_STATUS_SUCCESS == cusolver_status);
        // Get work buffer size
        cusolver_status = cusolverDnIRSXgels_bufferSize(cusolverH, gels_irs_params, m, n, nrhs, &lwork);
        assert(CUSOLVER_STATUS_SUCCESS == cusolver_status);
        // Allocate workspace (lwork is a byte count)
        cudaStat = cudaMalloc((void**)&d_work, lwork);
        assert(cudaSuccess == cudaStat);
        // Run solver
        cusolver_status = cusolverDnIRSXgels(cusolverH, gels_irs_params, gels_irs_infos, m, n, nrhs, (void *)d_A, lda, (void *)d_B, ldb, (void *)d_X, ldx, d_work, lwork, &niter, d_info);
        printf("gels status: %d\n", int(cusolver_status));
        #else
    
        // Get work buffer size
        cusolver_status = cusolverDnDDgels_bufferSize(cusolverH, m, n, nrhs, d_A, lda, d_B, ldb, d_X, ldx, d_work, &lwork);
        assert(CUSOLVER_STATUS_SUCCESS == cusolver_status);
    
        // Allocate workspace (lwork is a byte count)
        cudaStat = cudaMalloc((void**)&d_work, lwork);
        assert(cudaSuccess == cudaStat);
    
        // Run solver
        cusolver_status = cusolverDnDDgels(cusolverH, m, n, nrhs, d_A, lda, d_B, ldb, d_X, ldx, d_work, lwork, &niter, d_info);
        printf("gels status: %d\n", int(cusolver_status));
        #endif
    
        // Sync threads
        cudaStat = cudaDeviceSynchronize();
        assert(cudaSuccess == cudaStat);
    
        // Copy GPU info
        cudaStat = cudaMemcpy(&info_gpu, d_info, sizeof(int), cudaMemcpyDeviceToHost);
        assert(cudaSuccess == cudaStat);
    
        // Get solved data (first n entries of the single solution column)
        cudaStat = cudaMemcpy(X, d_X, sizeof(mt) * n * nrhs, cudaMemcpyDeviceToHost);
        assert(cudaSuccess == cudaStat);
    
        printf("after gels: info_gpu = %d\n", info_gpu);
        printf("after gels: niter    = %d\n", niter);
        printf("after gels: error    = %e\n", check_solution(n, nrhs, sol, ldx, X, ldx));
        // Print all n solution entries rather than a hard-coded count.
        printMatrix(n, nrhs, X, ldx, "X");
    
    
        if (d_A) cudaFree(d_A);
        if (d_B) cudaFree(d_B);
        if (d_X) cudaFree(d_X);
        if (d_info) cudaFree(d_info);
        if (d_work) cudaFree(d_work);
        // Release the expert-interface objects: infos first, then the params
        // structure they were used with.
        if (gels_irs_infos) cusolverDnIRSInfosDestroy(gels_irs_infos);
        if (gels_irs_params) cusolverDnIRSParamsDestroy(gels_irs_params);
        if (cusolverH) cusolverDnDestroy(cusolverH);
        cudaDeviceReset();
        return 0;
    }
    

    compile using nvcc -o test test.cu -lcusolver