Search code examples
c++cudacusolver

wrong results from cusolverDnDDgels


I wanted to test the CUDA implementation of xgels provided with CUDA 11.1, but I cannot make it work properly. For instance, this code runs without reporting any error:

#include <armadillo>
#include <cusolverDn.h>
int main()
{
    const int m = 1000;
    const int n = 10;
    const int nrhs = 2;

    arma::mat A(m, n, arma::fill::randn);
    arma::mat B(m, nrhs, arma::fill::randn);

    A.col(0).fill(1.0);
    B += 10.0;
    const arma::mat refX = arma::solve(A, B);
    
    cusolverDnHandle_t handle;
    cusolverDnCreate(&handle);
    cusolverStatus_t status;
    const int lda = m;
    const int ldb = std::max(m, n);
    size_t l_work = 0;
    status = cusolverDnDDgels_bufferSize(
        handle,
        m, n, nrhs,
        NULL, lda,
        NULL, ldb,
        NULL, ldb,
        NULL, &l_work);
    std::cout <<"Workspace: " << l_work << "!***\n";
    //One if ok
    std::cout << "Find Workspace - 1 if ok: "<<(status == CUSOLVER_STATUS_SUCCESS)<<"!***\n";
    double* d_work;
    cudaMalloc(reinterpret_cast<void**>(&d_work), l_work);
    int* d_info;
    cudaMalloc(reinterpret_cast<void**>(&d_info), sizeof(int));
    cudaMemset(d_info, 0, sizeof(int));
    double* dA, *dB, *dX;
    cudaMalloc(reinterpret_cast<void**>(&dA), A.n_elem* sizeof(double));
    cudaMalloc(reinterpret_cast<void**>(&dB), B.n_elem* sizeof(double));
    cudaMemcpy(dA,A.memptr(), A.n_elem * sizeof(double),cudaMemcpyHostToDevice);
    cudaMemcpy(dB, B.memptr(), B.n_elem * sizeof(double), cudaMemcpyHostToDevice);
    cudaMalloc(reinterpret_cast<void**>(&dX), refX.n_elem * sizeof(double));
    cudaMemset(dX, 0, refX.n_elem * sizeof(double));
    int iter = 0;
    status = cusolverDnDDgels(handle,
        m, n, nrhs,
        dA, lda,
        dB, ldb,
        dX, ldb,
        d_work, l_work,
        &iter, d_info);
    //One if ok
    std::cout << "Solve status - 1 if ok: " << (status == CUSOLVER_STATUS_SUCCESS) << "!***\n";
    int h_info = -1;
    cudaMemcpy(&h_info, d_info,sizeof(int),cudaMemcpyDeviceToHost);
    std::cout << "Iter: " <<iter << "!\n";
    //0 if ok
    std::cout << "Info - 0 if ok :" << h_info << "!\n";
    //Comparison of the results results
    arma::mat cudaX(refX.n_rows, refX.n_cols);
    cudaMemcpy(cudaX.memptr(), dX, cudaX.n_elem * sizeof(double), cudaMemcpyDeviceToHost);
    std::cout << "Armadillo result:\n" <<refX.t() <<"\n";
    std::cout << "cusolver result:\n" << cudaX.t() << "\n";

    cudaFree(dA);
    cudaFree(dB);
    cudaFree(dX);
    cudaFree(d_work);
    cudaFree(d_info);
}

The results are unfortunately wrong, as only the first column seems ok:

Workspace: 3653888!***

Find Workspace - 1 if ok: 1!***

Solve status - 1 if ok: 1!***

Iter: -51!

Info - 0 if ok :0!

Armadillo result:

9.9965 -0.0198 0.0290 -0.0317 0.0027 -0.0197 0.0377 -0.0379 -0.0172 0.0088

9.9774 0.0485 0.0089 -0.0233 0.0054 -0.0257 0.0130 0.0080 0.0149 -0.0335

cusolver result:

9.9965 -0.0198 0.0290 -0.0317 0.0027 -0.0197 0.0377 -0.0379 -0.0172 0.0088

-0.8578 0.1884 0.5331 -0.8275 0.1992 -0.0587 1.0014 -0.0250 0.6571 -0.5516

If I run the program under cuda-memcheck, the first error I get is:

========= CUDA-MEMCHECK
========= Invalid __global__ write of size 8
=========     at 0x00001aa0 in void copy_AtoB_kernel<double>(int, int, double const *, int, double*, int)
=========     by thread (31,0,0) in block (15,0,0)
=========     Address 0xb00e9b2f8 is out of bounds
=========     Device Frame:void copy_AtoB_kernel<double>(int, int, double const *, int, double*, int) (void copy_AtoB_kernel<double>(int, int, double const *, int, double*, int) : 0x1aa0)
=========     Saved host backtrace up to driver entry point at kernel launch time
=========     Host Frame:C:\WINDOWS\system32\DriverStore\FileRepository\nv_dispi.inf_amd64_8e1b465b962975a0\nvcuda64.dll [0x751f4]
=========     Host Frame:C:\WINDOWS\system32\DriverStore\FileRepository\nv_dispi.inf_amd64_8e1b465b962975a0\nvcuda64.dll [0x75577]
=========     Host Frame:C:\WINDOWS\system32\DriverStore\FileRepository\nv_dispi.inf_amd64_8e1b465b962975a0\nvcuda64.dll [0x79cd9]
=========     Host Frame:C:\WINDOWS\system32\DriverStore\FileRepository\nv_dispi.inf_amd64_8e1b465b962975a0\nvcuda64.dll (cuProfilerStop + 0x11ce4a) [0x32e5ba]
=========     Host Frame:C:\WINDOWS\system32\DriverStore\FileRepository\nv_dispi.inf_amd64_8e1b465b962975a0\nvcuda64.dll [0x16cfe5]
=========     Host Frame:C:\WINDOWS\system32\DriverStore\FileRepository\nv_dispi.inf_amd64_8e1b465b962975a0\nvcuda64.dll (cuProfilerStop + 0xf1052) [0x3027c2]
=========     Host Frame:C:\WINDOWS\system32\DriverStore\FileRepository\nv_dispi.inf_amd64_8e1b465b962975a0\nvcuda64.dll [0x3841d]
=========     Host Frame:C:\WINDOWS\system32\DriverStore\FileRepository\nv_dispi.inf_amd64_8e1b465b962975a0\nvcuda64.dll [0x3890c]
=========     Host Frame:C:\WINDOWS\system32\DriverStore\FileRepository\nv_dispi.inf_amd64_8e1b465b962975a0\nvcuda64.dll [0x38be4]
=========     Host Frame:C:\WINDOWS\system32\DriverStore\FileRepository\nv_dispi.inf_amd64_8e1b465b962975a0\nvcuda64.dll (cuLaunchKernel + 0x234) [0x201044]
=========     Host Frame:C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v11.1\bin\cusolver64_11.dll [0x4856]
=========     Host Frame:C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v11.1\bin\cusolver64_11.dll [0x22b4]
=========     Host Frame:C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v11.1\bin\cusolver64_11.dll (cusolverDnIRSParamsSetTolInner + 0x2269) [0xda299]
=========     Host Frame:C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v11.1\bin\cusolver64_11.dll (cusolverDnIRSParamsSetTolInner + 0x29f1) [0xdaa21]
=========     Host Frame:C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v11.1\bin\cusolver64_11.dll (cusolverDnIRSParamsSetTolInner + 0xea66) [0xe6a96]
=========     Host Frame:C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v11.1\bin\cusolver64_11.dll (cusolverDnZZgesv_bufferSize + 0x5b61) [0x103d41]
=========     Host Frame:C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v11.1\bin\cusolver64_11.dll (cusolverDnIRSXgels + 0x4b6) [0x109cb6]
=========     Host Frame:C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v11.1\bin\cusolver64_11.dll (cusolverDnZZgesv_bufferSize + 0x11e4) [0xff3c4]
=========     Host Frame:C:\sw_source\SEM.Maeve\build\Release\PerformanceTest.exe (main + 0x36c) [0x292d08c]
=========     Host Frame:C:\sw_source\SEM.Maeve\build\Release\PerformanceTest.exe (__scrt_common_main_seh + 0x10c) [0x2ce4378]
=========     Host Frame:C:\WINDOWS\System32\KERNEL32.DLL (BaseThreadInitThunk + 0x14) [0x17c24]
=========     Host Frame:C:\WINDOWS\SYSTEM32\ntdll.dll (RtlUserThreadStart + 0x21) [0x6d4d1]

So it seems that the code has a memory error, but I cannot detect any error in my usage. Moreover if I comment out the call to cusolverDnDDgels the error disappears.

Any clue as to what makes this code fail?

I run the code on an RTX 2080 Ti, and it is compiled with:

> nvcc --version
nvcc: NVIDIA (R) Cuda compiler driver
Copyright (c) 2005-2020 NVIDIA Corporation
Built on Tue_Sep_15_19:12:04_Pacific_Daylight_Time_2020
Cuda compilation tools, release 11.1, V11.1.74
Build cuda_11.1.relgpu_drvr455TC455_06.29069683_0

in VisualStudio for Windows, using as options: compute_70,sm_70;compute_75,sm_75;


Solution

  • According to my testing, if you:

    1. Update to CUDA 11.1 Update 1 (so that nvcc --version reports 11.1.105)

    2. Change the lddx parameter to be equal to n:

      status = cusolverDnDDgels(handle,
       m, n, nrhs,
       dA, lda,
       dB, ldb,
       dX, n,   //change here and in the buffersize function from ldb to n
       ...
      

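    then I get matching results between cusolver and armadillo. The solution matrix X is n x nrhs, so its leading dimension must be n; with ldb (= m = 1000) cusolver places the second column of X at offset 1000 of a buffer that only holds n*nrhs = 20 elements, which is the out-of-bounds write reported by cuda-memcheck.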