I wanted to test the CUDA (cuSOLVER) implementation of xgels provided with CUDA 11.1, but I cannot make it work properly. For instance, the following code runs without reporting any error:
#include <iostream>

#include <armadillo>
#include <cusolverDn.h>
int main()
{
const int m = 1000;
const int n = 10;
const int nrhs = 2;
arma::mat A(m, n, arma::fill::randn);
arma::mat B(m, nrhs, arma::fill::randn);
A.col(0).fill(1.0);
B += 10.0;
const arma::mat refX = arma::solve(A, B);
cusolverDnHandle_t handle;
cusolverDnCreate(&handle);
cusolverStatus_t status;
const int lda = m;
const int ldb = std::max(m, n);
size_t l_work = 0;
status = cusolverDnDDgels_bufferSize(
handle,
m, n, nrhs,
NULL, lda,
NULL, ldb,
NULL, ldb,
NULL, &l_work);
std::cout <<"Workspace: " << l_work << "!***\n";
//One if ok
std::cout << "Find Workspace - 1 if ok: "<<(status == CUSOLVER_STATUS_SUCCESS)<<"!***\n";
double* d_work;
cudaMalloc(reinterpret_cast<void**>(&d_work), l_work);
int* d_info;
cudaMalloc(reinterpret_cast<void**>(&d_info), sizeof(int));
cudaMemset(d_info, 0, sizeof(int));
double* dA, *dB, *dX;
cudaMalloc(reinterpret_cast<void**>(&dA), A.n_elem* sizeof(double));
cudaMalloc(reinterpret_cast<void**>(&dB), B.n_elem* sizeof(double));
cudaMemcpy(dA,A.memptr(), A.n_elem * sizeof(double),cudaMemcpyHostToDevice);
cudaMemcpy(dB, B.memptr(), B.n_elem * sizeof(double), cudaMemcpyHostToDevice);
cudaMalloc(reinterpret_cast<void**>(&dX), refX.n_elem * sizeof(double));
cudaMemset(dX, 0, refX.n_elem * sizeof(double));
int iter = 0;
status = cusolverDnDDgels(handle,
m, n, nrhs,
dA, lda,
dB, ldb,
dX, ldb,
d_work, l_work,
&iter, d_info);
//One if ok
std::cout << "Solve status - 1 if ok: " << (status == CUSOLVER_STATUS_SUCCESS) << "!***\n";
int h_info = -1;
cudaMemcpy(&h_info, d_info,sizeof(int),cudaMemcpyDeviceToHost);
std::cout << "Iter: " <<iter << "!\n";
//0 if ok
std::cout << "Info - 0 if ok :" << h_info << "!\n";
//Comparison of the results results
arma::mat cudaX(refX.n_rows, refX.n_cols);
cudaMemcpy(cudaX.memptr(), dX, cudaX.n_elem * sizeof(double), cudaMemcpyDeviceToHost);
std::cout << "Armadillo result:\n" <<refX.t() <<"\n";
std::cout << "cusolver result:\n" << cudaX.t() << "\n";
cudaFree(dA);
cudaFree(dB);
cudaFree(dX);
cudaFree(d_work);
cudaFree(d_info);
}
Unfortunately the results are wrong: only the first column of the solution (printed below as the first row, because the output is transposed) matches the reference:
Workspace: 3653888!***
Find Workspace - 1 if ok: 1!***
Solve status - 1 if ok: 1!***
Iter: -51!
Info - 0 if ok :0!
Armadillo result:
9.9965 -0.0198 0.0290 -0.0317 0.0027 -0.0197 0.0377 -0.0379 -0.0172 0.0088
9.9774 0.0485 0.0089 -0.0233 0.0054 -0.0257 0.0130 0.0080 0.0149 -0.0335
cusolver result:
9.9965 -0.0198 0.0290 -0.0317 0.0027 -0.0197 0.0377 -0.0379 -0.0172 0.0088
-0.8578 0.1884 0.5331 -0.8275 0.1992 -0.0587 1.0014 -0.0250 0.6571 -0.5516
If I run the program under cuda-memcheck, the first error I get is:
========= CUDA-MEMCHECK
========= Invalid __global__ write of size 8
========= at 0x00001aa0 in void copy_AtoB_kernel<double>(int, int, double const *, int, double*, int)
========= by thread (31,0,0) in block (15,0,0)
========= Address 0xb00e9b2f8 is out of bounds
========= Device Frame:void copy_AtoB_kernel<double>(int, int, double const *, int, double*, int) (void copy_AtoB_kernel<double>(int, int, double const *, int, double*, int) : 0x1aa0)
========= Saved host backtrace up to driver entry point at kernel launch time
========= Host Frame:C:\WINDOWS\system32\DriverStore\FileRepository\nv_dispi.inf_amd64_8e1b465b962975a0\nvcuda64.dll [0x751f4]
========= Host Frame:C:\WINDOWS\system32\DriverStore\FileRepository\nv_dispi.inf_amd64_8e1b465b962975a0\nvcuda64.dll [0x75577]
========= Host Frame:C:\WINDOWS\system32\DriverStore\FileRepository\nv_dispi.inf_amd64_8e1b465b962975a0\nvcuda64.dll [0x79cd9]
========= Host Frame:C:\WINDOWS\system32\DriverStore\FileRepository\nv_dispi.inf_amd64_8e1b465b962975a0\nvcuda64.dll (cuProfilerStop + 0x11ce4a) [0x32e5ba]
========= Host Frame:C:\WINDOWS\system32\DriverStore\FileRepository\nv_dispi.inf_amd64_8e1b465b962975a0\nvcuda64.dll [0x16cfe5]
========= Host Frame:C:\WINDOWS\system32\DriverStore\FileRepository\nv_dispi.inf_amd64_8e1b465b962975a0\nvcuda64.dll (cuProfilerStop + 0xf1052) [0x3027c2]
========= Host Frame:C:\WINDOWS\system32\DriverStore\FileRepository\nv_dispi.inf_amd64_8e1b465b962975a0\nvcuda64.dll [0x3841d]
========= Host Frame:C:\WINDOWS\system32\DriverStore\FileRepository\nv_dispi.inf_amd64_8e1b465b962975a0\nvcuda64.dll [0x3890c]
========= Host Frame:C:\WINDOWS\system32\DriverStore\FileRepository\nv_dispi.inf_amd64_8e1b465b962975a0\nvcuda64.dll [0x38be4]
========= Host Frame:C:\WINDOWS\system32\DriverStore\FileRepository\nv_dispi.inf_amd64_8e1b465b962975a0\nvcuda64.dll (cuLaunchKernel + 0x234) [0x201044]
========= Host Frame:C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v11.1\bin\cusolver64_11.dll [0x4856]
========= Host Frame:C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v11.1\bin\cusolver64_11.dll [0x22b4]
========= Host Frame:C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v11.1\bin\cusolver64_11.dll (cusolverDnIRSParamsSetTolInner + 0x2269) [0xda299]
========= Host Frame:C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v11.1\bin\cusolver64_11.dll (cusolverDnIRSParamsSetTolInner + 0x29f1) [0xdaa21]
========= Host Frame:C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v11.1\bin\cusolver64_11.dll (cusolverDnIRSParamsSetTolInner + 0xea66) [0xe6a96]
========= Host Frame:C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v11.1\bin\cusolver64_11.dll (cusolverDnZZgesv_bufferSize + 0x5b61) [0x103d41]
========= Host Frame:C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v11.1\bin\cusolver64_11.dll (cusolverDnIRSXgels + 0x4b6) [0x109cb6]
========= Host Frame:C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v11.1\bin\cusolver64_11.dll (cusolverDnZZgesv_bufferSize + 0x11e4) [0xff3c4]
========= Host Frame:C:\sw_source\SEM.Maeve\build\Release\PerformanceTest.exe (main + 0x36c) [0x292d08c]
========= Host Frame:C:\sw_source\SEM.Maeve\build\Release\PerformanceTest.exe (__scrt_common_main_seh + 0x10c) [0x2ce4378]
========= Host Frame:C:\WINDOWS\System32\KERNEL32.DLL (BaseThreadInitThunk + 0x14) [0x17c24]
========= Host Frame:C:\WINDOWS\SYSTEM32\ntdll.dll (RtlUserThreadStart + 0x21) [0x6d4d1]
So it seems that the code has a memory error, but I cannot find any mistake in my usage. Moreover, if I comment out the call to cusolverDnDDgels, the error disappears.
Any clue as to what makes this code fail?
I run the code on an RTX 2080 Ti, and it is compiled with:
> nvcc --version
nvcc: NVIDIA (R) Cuda compiler driver
Copyright (c) 2005-2020 NVIDIA Corporation
Built on Tue_Sep_15_19:12:04_Pacific_Daylight_Time_2020
Cuda compilation tools, release 11.1, V11.1.74
Build cuda_11.1.relgpu_drvr455TC455_06.29069683_0
in Visual Studio on Windows, using the code-generation options: compute_70,sm_70;compute_75,sm_75;
According to my testing, if you:
1. Update to CUDA 11.1 Update 1 (so that nvcc --version reports 11.1.105), and
2. Change the lddx parameter to be equal to n:
status = cusolverDnDDgels(handle,
m, n, nrhs,
dA, lda,
dB, ldb,
dX, n, //change here and in the buffersize function from ldb to n
...
then I get matching results between cusolver and armadillo.