Search code examples
pythoncudapycudascikitscusolver

getrs function of cuSolver over pycuda doesn't work properly


I'm trying to make a pycuda wrapper inspired by scikits-cuda library for some operations provided in the new cuSolver library of Nvidia. I want to solve a linear system of the form AX=B by LU factorization, to perform that first use the cublasSgetrfBatched method from scikits-cuda, that give me the factorization LU; then with that factorization I want to solve the system using cusolverDnSgetrs from cuSolve that I want to wrap, when I perform the computation return status 3, the matrices that supose to give me the answer don't change, BUT the *devInfo is zero, looking in the cusolver's documentation says:

CUSOLVER_STATUS_INVALID_VALUE=An unsupported value or parameter was passed to the function (a negative vector size, for example).


libcusolver.cusolverDnSgetrs.restype=int
libcusolver.cusolverDnSgetrs.argtypes=[_types.handle,
                                   ctypes.c_char,
                                   ctypes.c_int,
                                   ctypes.c_int,
                                   ctypes.c_void_p,
                                   ctypes.c_int,
                                   ctypes.c_void_p,
                                   ctypes.c_void_p,
                                   ctypes.c_int,
                                   ctypes.c_void_p]

"""
handle is the handle pointer given by calling cusolverDnCreate() from cuSolver
LU is the LU factoriced matrix given by cublasSgetrfBatched() from scikits
P is the pivots matrix given by cublasSgetrfBatched()
B is the right hand matix from AX=B
"""
def cusolverSolveLU(handle,LU,P,B):
   rows_LU ,cols_LU = LU.shape
   rows_B, cols_B = B.shape
   B_gpu = gpuarray.to_gpu(B.astype('float32'))
   info_gpu = gpuarray.zeros(1, np.int32)

   status=libcusolver.cusolverDnSgetrs(
               handle, 'n', rows_LU, cols_B,
               int(LU.gpudata), cols_LU,
               int(P.gpudata), int(B_gpu.gpudata),
               cols_B, int(info_gpu.gpudata))
   print info_gpu   
   print status

handle= cusolverCreate() #get the initialization of cusolver
LU, P = cublasLUFactorization(...)
B = np.asarray(np.random.rand(3, 3), np.float32)
cusolverSolveLU(handle,LU,P,B)

The output:

[0]

3

What I'm doing wrong?


Solution

  • Here is a full working example of how to use the library; the result is validated against that obtained using numpy's built-in solver:

    import ctypes
    
    import numpy as np
    import pycuda.autoinit
    import pycuda.gpuarray as gpuarray
    
    CUSOLVER_STATUS_SUCCESS = 0
    
    libcusolver = ctypes.cdll.LoadLibrary('libcusolver.so')
    
    libcusolver.cusolverDnCreate.restype = int
    libcusolver.cusolverDnCreate.argtypes = [ctypes.c_void_p]
    
    def cusolverDnCreate():
        handle = ctypes.c_void_p()
        status = libcusolver.cusolverDnCreate(ctypes.byref(handle))
        if status != CUSOLVER_STATUS_SUCCESS:
            raise RuntimeError('error!')
        return handle.value
    
    libcusolver.cusolverDnDestroy.restype = int
    libcusolver.cusolverDnDestroy.argtypes = [ctypes.c_void_p]
    
    def cusolverDnDestroy(handle):
        status = libcusolver.cusolverDnDestroy(handle)
        if status != CUSOLVER_STATUS_SUCCESS:
            raise RuntimeError('error!')
    
    libcusolver.cusolverDnSgetrf_bufferSize.restype = int
    libcusolver.cusolverDnSgetrf_bufferSize.argtypes = [ctypes.c_void_p,
                                                        ctypes.c_int,
                                                        ctypes.c_int,
                                                        ctypes.c_void_p,
                                                        ctypes.c_int,
                                                        ctypes.c_void_p]
    
    def cusolverDnSgetrf_bufferSize(handle, m, n, A, lda, Lwork):
        status = libcusolver.cusolverDnSgetrf_bufferSize(handle, m, n,
                                                         int(A.gpudata),
                                                         n, ctypes.pointer(Lwork))
        if status != CUSOLVER_STATUS_SUCCESS:
            raise RuntimeError('error!')
    
    libcusolver.cusolverDnSgetrf.restype = int
    libcusolver.cusolverDnSgetrf.argtypes = [ctypes.c_void_p,
                                             ctypes.c_int,
                                             ctypes.c_int,
                                             ctypes.c_void_p,
                                             ctypes.c_int,
                                             ctypes.c_void_p,
                                             ctypes.c_void_p,
                                             ctypes.c_void_p]
    
    def cusolverDnSgetrf(handle, m, n, A, lda, Workspace, devIpiv, devInfo):
        status = libcusolver.cusolverDnSgetrf(handle, m, n, int(A.gpudata),
                                              lda,
                                              int(Workspace.gpudata),
                                              int(devIpiv.gpudata),
                                              int(devInfo.gpudata))
        if status != CUSOLVER_STATUS_SUCCESS:
            raise RuntimeError('error!')
    
    libcusolver.cusolverDnSgetrs.restype = int
    libcusolver.cusolverDnSgetrs.argtypes = [ctypes.c_void_p,
                                             ctypes.c_int,
                                             ctypes.c_int,
                                             ctypes.c_int,
                                             ctypes.c_void_p,
                                             ctypes.c_int,
                                             ctypes.c_void_p,
                                             ctypes.c_void_p,
                                             ctypes.c_int,
                                             ctypes.c_void_p]
    
    def cusolverDnSgetrs(handle, trans, n, nrhs, A, lda, devIpiv, B, ldb, devInfo):
        status = libcusolver.cusolverDnSgetrs(handle, trans, n, nrhs,
                                              int(A.gpudata), lda,
                                              int(devIpiv.gpudata), int(B.gpudata),
                                              ldb, int(devInfo.gpudata))
        if status != CUSOLVER_STATUS_SUCCESS:
            raise RuntimeError('error!')
    
    if __name__ == '__main__':
        m = 3
        n = 3
        a = np.asarray(np.random.rand(m, n), np.float32)
        a_gpu = gpuarray.to_gpu(a.T.copy())
        lda = m
        b = np.asarray(np.random.rand(m, n), np.float32)
        b_gpu = gpuarray.to_gpu(b.T.copy())
        ldb = m
    
        handle = cusolverDnCreate()
        Lwork = ctypes.c_int()
    
        cusolverDnSgetrf_bufferSize(handle, m, n, a_gpu, lda, Lwork)
        Workspace = gpuarray.empty(Lwork.value, dtype=np.float32)
        devIpiv = gpuarray.zeros(min(m, n), dtype=np.int32)
        devInfo = gpuarray.zeros(1, dtype=np.int32)
    
        cusolverDnSgetrf(handle, m, n, a_gpu, lda, Workspace, devIpiv, devInfo)
        if devInfo.get()[0] != 0:
            raise RuntimeError('error!')
        CUBLAS_OP_N = 0
        nrhs = n
        devInfo = gpuarray.zeros(1, dtype=np.int32)
        cusolverDnSgetrs(handle, CUBLAS_OP_N, n, nrhs, a_gpu, lda, devIpiv, b_gpu, ldb, devInfo)
    
        x_cusolver = b_gpu.get().T
        cusolverDnDestroy(handle)
    
        x_numpy = np.linalg.solve(a, b)
        print np.allclose(x_numpy, x_cusolver)