Search code examples
cudanvidiagpucusolver

Cholesky decomposition with CUDA


I am trying to implement Cholesky decomposition using the cuSOLVER library. I am a beginner CUDA programmer and I have always specified block-sizes and grid-sizes, but I am not able to find out how this can be set explicitly by the programmer with cuSOLVER functions.

Here is the documentation: http://docs.nvidia.com/cuda/cusolver/index.html#introduction

The QR decomposition is implemented using the cuSOLVER library (see the example here: http://docs.nvidia.com/cuda/cusolver/index.html#ormqr-example1) and even there the above two parameters are not set.

To summarize, I have the following questions

  • How can the parameters: block-size and grid-size can be set with the cuSOLVER library?
  • How is the same being done with the mentioned QR example code in the NVIDIA documentation?

Solution

  • Robert Crovella has already answered this question. Here, I'm just providing a full example showing how Cholesky decomposition can be easily performed using the potrf function provided by the cuSOLVER library.

    The Utilities.cu and Utilities.cuh files are mantained at this page and omitted here. The example implements the CPU as well as the GPU approach.

    #include "cuda_runtime.h"
    #include "device_launch_paraMeters.h"
    
    #include<iostream>
    #include <fstream>
    #include<iomanip>
    #include<stdlib.h>
    #include<stdio.h>
    #include<assert.h>
    
    #include <cusolverDn.h>
    #include <cublas_v2.h>
    #include <cuda_runtime_api.h>
    
    #include "Utilities.cuh"
    
    #define prec_save 10
    
    /******************************************/
    /* SET HERMITIAN POSITIVE DEFINITE MATRIX */
    /******************************************/
    // --- Credit to: https://math.stackexchange.com/questions/357980/how-to-generate-random-symmetric-positive-definite-matrices-using-matlab
    void setPDMatrix(double * __restrict h_A, const int N) {
    
        // --- Initialize random seed
        srand(time(NULL));
    
        double *h_A_temp = (double *)malloc(N * N * sizeof(double));
    
        for (int i = 0; i < N; i++)
            for (int j = 0; j < N; j++)
                h_A_temp[i * N + j] = (float)rand() / (float)RAND_MAX;
    
        for (int i = 0; i < N; i++)
            for (int j = 0; j < N; j++) 
                h_A[i * N + j] = 0.5 * (h_A_temp[i * N + j] + h_A_temp[j * N + i]);
    
        for (int i = 0; i < N; i++) h_A[i * N + i] = h_A[i * N + i] + N;
    
    }
    
    /************************************/
    /* SAVE REAL ARRAY FROM CPU TO FILE */
    /************************************/
    template <class T>
    void saveCPUrealtxt(const T * h_in, const char *filename, const int M) {
    
        std::ofstream outfile;
        outfile.open(filename);
        for (int i = 0; i < M; i++) outfile << std::setprecision(prec_save) << h_in[i] << "\n";
        outfile.close();
    
    }
    
    /************************************/
    /* SAVE REAL ARRAY FROM GPU TO FILE */
    /************************************/
    template <class T>
    void saveGPUrealtxt(const T * d_in, const char *filename, const int M) {
    
        T *h_in = (T *)malloc(M * sizeof(T));
    
        gpuErrchk(cudaMemcpy(h_in, d_in, M * sizeof(T), cudaMemcpyDeviceToHost));
    
        std::ofstream outfile;
        outfile.open(filename);
        for (int i = 0; i < M; i++) outfile << std::setprecision(prec_save) << h_in[i] << "\n";
        outfile.close();
    
    }
    
    /********/
    /* MAIN */
    /********/
    int main(){
    
        const int N = 1000;
    
        // --- CUDA solver initialization
        cusolverDnHandle_t solver_handle;
        cusolveSafeCall(cusolverDnCreate(&solver_handle));
    
        // --- CUBLAS initialization
        cublasHandle_t cublas_handle;
        cublasSafeCall(cublasCreate(&cublas_handle));
    
        /***********************/
        /* SETTING THE PROBLEM */
        /***********************/
        // --- Setting the host, N x N matrix
        double *h_A = (double *)malloc(N * N * sizeof(double));
        setPDMatrix(h_A, N);
    
        // --- Allocate device space for the input matrix 
        double *d_A; gpuErrchk(cudaMalloc(&d_A, N * N * sizeof(double)));
    
        // --- Move the relevant matrix from host to device
        gpuErrchk(cudaMemcpy(d_A, h_A, N * N * sizeof(double), cudaMemcpyHostToDevice));
    
        /****************************************/
        /* COMPUTING THE CHOLESKY DECOMPOSITION */
        /****************************************/
        // --- cuSOLVE input/output parameters/arrays
        int work_size = 0;
        int *devInfo;           gpuErrchk(cudaMalloc(&devInfo, sizeof(int)));
    
        // --- CUDA CHOLESKY initialization
        cusolveSafeCall(cusolverDnDpotrf_bufferSize(solver_handle, CUBLAS_FILL_MODE_LOWER, N, d_A, N, &work_size));
    
        // --- CUDA POTRF execution
        double *work;   gpuErrchk(cudaMalloc(&work, work_size * sizeof(double)));
        cusolveSafeCall(cusolverDnDpotrf(solver_handle, CUBLAS_FILL_MODE_LOWER, N, d_A, N, work, work_size, devInfo));
        int devInfo_h = 0;  gpuErrchk(cudaMemcpy(&devInfo_h, devInfo, sizeof(int), cudaMemcpyDeviceToHost));
        if (devInfo_h != 0) std::cout << "Unsuccessful potrf execution\n\n" << "devInfo = " << devInfo_h << "\n\n";
    
        // --- At this point, the lower triangular part of A contains the elements of L. 
        /***************************************/
        /* CHECKING THE CHOLESKY DECOMPOSITION */
        /***************************************/
        saveCPUrealtxt(h_A, "D:\\Project\\solveSquareLinearSystemCholeskyCUDA\\solveSquareLinearSystemCholeskyCUDA\\h_A.txt", N * N);
        saveGPUrealtxt(d_A, "D:\\Project\\solveSquareLinearSystemCholeskyCUDA\\solveSquareLinearSystemCholeskyCUDA\\d_A.txt", N * N);
    
        cusolveSafeCall(cusolverDnDestroy(solver_handle));
    
        return 0;
    
    }
    

    EDIT

    Cholesky decomposition requires that the relevant matrix is Hermitian and positive definite. Symmetric and positive definite matrices can be generated by the approach in How to generate random symmetric positive definite matrices using MATLAB?.

    The following Matlab code can be used for checking the results

    clear all
    close all
    clc
    
    warning off
    
    N = 1000;
    
    % --- Setting the problem solution
    x = ones(N, 1);
    
    load h_A.txt
    A = reshape(h_A, N, N);
    
    yMatlab = A * x;
    
    Lmatlab = chol(A, 'lower');
    
    xprime      = inv(Lmatlab) * yMatlab;
    xMatlab     = inv(Lmatlab') * xprime;
    
    fprintf('Percentage rms of solution in Matlab %f\n', 100 * sqrt(sum(sum(abs(xMatlab - x).^2)) / sum(sum(abs(x).^2))));
    
    load d_A.txt
    LCUDA = tril(reshape(d_A, N, N));
    
    fprintf('Percentage rms of Cholesky decompositions in Matlab and CUDA %f\n', 100 * sqrt(sum(sum(abs(Lmatlab - LCUDA).^2)) / sum(sum(abs(Lmatlab).^2))));
    
    load xCUDA.txt
    fprintf('Percentage rms of solution in Matlab %f\n', 100 * sqrt(sum(sum(abs(xCUDA - x).^2)) / sum(sum(abs(x).^2))));