I am stuck at a small problem. I've got to solve a linear System A * x = b
.
The matrix A
gets decomposed by an LU-factorization (LAPACK
). As result I get the factorized Matrix and the pivotarray. After that I want to solve the two linear Systems: U * x = y
and L * y = b
on the GPU with *cublasDtrsm*
. But because of the row interchanges from dgetrf
in LAPACK
I would have to pass the pivot array to cublas
. But the *cublasDtrsm*
-function doesn't offer anything for this. Without the pivot array I get wrong results.
I already searched for disabling pivoting in LAPACK
, but with regard to stability it's not possible. Is there any hint on how to solve a linear equation system with LU-factorization?
If you wanted to use this particular approach (cublas trsm after LAPACK getrf), I believe you should be able to use cublas trsm with the L,U output of LAPACK by rearranging your b vector (or matrix) to match the rearrangement order that LAPACK performed during pivoting. I believe this order is given in the formula for ipiv
in the LAPACK documentation:
IPIV
IPIV is INTEGER array, dimension (min(M,N)) The pivot indices; for 1 <= i <= min(M,N), row i of the matrix was interchanged with row IPIV(i).
Here's a sample code that demonstrates the idea for a simple 3x3 test case with a single RHS vector:
$ cat t853.cu
#include <cstdio>
#include <cstdlib>
#include <cuda_runtime.h>
#include <cublas_v2.h>
// Error-check wrapper for CUDA runtime API calls: evaluates `call` once and,
// on any result other than cudaSuccess, prints file/line/reason to stderr,
// resets the device, and terminates the process.  The do { } while(0) shell
// makes the macro behave as a single statement (safe inside if/else).
// NOTE: comments are kept outside the macro body because a // comment before
// a trailing backslash would swallow the line continuation.
#define cudacall(call) \
do \
{ \
cudaError_t err = (call); \
if(cudaSuccess != err) \
{ \
fprintf(stderr,"CUDA Error:\nFile = %s\nLine = %d\nReason = %s\n", __FILE__, __LINE__, cudaGetErrorString(err)); \
cudaDeviceReset(); \
exit(EXIT_FAILURE); \
} \
} \
while (0)
// Error-check wrapper for cuBLAS calls: evaluates `call` once and, on any
// status other than CUBLAS_STATUS_SUCCESS, prints file/line and the numeric
// status code to stderr, resets the device, and terminates the process.
// Parallel in shape to cudacall above, but for cublasStatus_t return codes
// (cuBLAS has no cudaGetErrorString equivalent, hence the raw code).
#define cublascall(call) \
do \
{ \
cublasStatus_t status = (call); \
if(CUBLAS_STATUS_SUCCESS != status) \
{ \
fprintf(stderr,"CUBLAS Error:\nFile = %s\nLine = %d\nCode = %d\n", __FILE__, __LINE__, status); \
cudaDeviceReset(); \
exit(EXIT_FAILURE); \
} \
\
} \
while(0)
// In-place LU factorization (with partial pivoting) of the n x n column-major
// matrix held in device memory at src_d, via cublasSgetrfBatched with a batch
// of one.  On return src_d holds the packed L/U factors and pivot (host array
// of length n) holds the 1-based row-interchange indices reported by cuBLAS.
// Aborts the process on any CUDA/cuBLAS error or on a singular matrix.
void LU_device(float *src_d, int n, int *pivot)
{
cublasHandle_t handle;
cublascall(cublasCreate_v2(&handle));
const int batchSize = 1;
int *P, *INFO;  // device-side pivot indices and per-matrix status
cudacall(cudaMalloc<int>(&P,n * batchSize * sizeof(int)));
cudacall(cudaMalloc<int>(&INFO,batchSize * sizeof(int)));
int lda = n;
// getrfBatched expects a device array of device pointers, one per matrix.
float *A[] = { src_d };
float **A_d;
cudacall(cudaMalloc<float*>(&A_d,sizeof(A)));
cudacall(cudaMemcpy(A_d,A,sizeof(A),cudaMemcpyHostToDevice));
cublascall(cublasSgetrfBatched(handle,n,A_d,lda,P,INFO,batchSize));
int INFOh = 0;
cudacall(cudaMemcpy(&INFOh,INFO,sizeof(int),cudaMemcpyDeviceToHost));
cudacall(cudaMemcpy(pivot,P,n*batchSize*sizeof(int),cudaMemcpyDeviceToHost));
#ifdef DEBUG_PRINT
for (int qq = 0; qq < n*batchSize; qq++) {printf("pivot[%d] = %d\n", qq, pivot[qq]); }
#endif
// BUG FIX: cublasSgetrfBatched sets info = k (1..n) when U(k,k) == 0, i.e.
// the matrix is singular at pivot k.  The previous test (INFOh == n) only
// caught a zero pivot in the *last* position and silently accepted every
// other singular matrix.  Any nonzero info is a failure.
if(INFOh != 0)
{
fprintf(stderr, "Factorization Failed: Matrix is singular (info = %d)\n", INFOh);
cudaDeviceReset();
exit(EXIT_FAILURE);
}
cudaFree(P); cudaFree(INFO); cudaFree(A_d); cublasDestroy(handle);
}
// Host wrapper around LU_device: uploads the n x n matrix src to the device,
// factorizes it in place, then unpacks the packed result into the separate
// host matrices L and U.  pivot receives the 1-based interchange sequence.
// L's diagonal retains the packed values (not forced to 1); the caller is
// expected to solve with a unit-diagonal option so those entries are unread.
void LU(float* src, float* L, float *U, int n, int *pivot)
{
const size_t bytes = (size_t)n * n * sizeof(float);
float *d_A;
cudacall(cudaMalloc<float>(&d_A, bytes));
cudacall(cudaMemcpy(d_A, src, bytes, cudaMemcpyHostToDevice));
LU_device(d_A, n, pivot);
// Both factors start as copies of the packed LU result...
cudacall(cudaMemcpy(L, d_A, bytes, cudaMemcpyDeviceToHost));
cudacall(cudaMemcpy(U, d_A, bytes, cudaMemcpyDeviceToHost));
// ...then each entry strictly on the wrong side of the diagonal is cleared.
for (int r = 0; r < n; r++) {
for (int c = 0; c < n; c++) {
if (c < r) L[r*n + c] = 0.0f;        // strict lower part belongs to L only
else if (c > r) U[r*n + c] = 0.0f;   // strict upper part belongs to U only
}
}
cudaFree(d_A);
}
// Applies the LAPACK/cuBLAS row-interchange sequence in pivot[] (1-based
// indices) to the length-n vector vec, in place.  dir == DIR_FORWARD replays
// the swaps in factorization order (permute b before the triangular solves);
// dir == DIR_REVERSE replays them backwards, undoing a forward pass.
void rearrange(float *vec, int *pivot, int n, int dir){
#define DIR_FORWARD 0
#define DIR_REVERSE 1
if (dir == DIR_FORWARD) {
for (int i = 0; i < n; i++) {
int j = pivot[i] - 1;  // convert 1-based pivot index to 0-based
float tmp = vec[i];
vec[i] = vec[j];
vec[j] = tmp;
}
} else {
for (int i = n - 1; i >= 0; i--) {
int j = pivot[i] - 1;
float tmp = vec[i];
vec[i] = vec[j];
vec[j] = tmp;
}
}
}
// Single-RHS triangular solve on the GPU: solves op(A) * x = b where A is the
// n x n triangular factor (host, column-major), b the length-n right-hand
// side (host), and the solution is written to x (host).  uplo selects which
// triangle of A is referenced; diagt tells cuBLAS whether the diagonal is
// implicitly unit (CUBLAS_DIAG_UNIT) or stored (CUBLAS_DIAG_NON_UNIT).
void TRSM(float *A, float *x, float *b, int n, cublasFillMode_t uplo, cublasDiagType_t diagt ){
cublasHandle_t h;
cublascall(cublasCreate_v2(&h));
float *d_mat, *d_rhs;
cudacall(cudaMalloc<float>(&d_mat, n*n * sizeof(float)));
cudacall(cudaMalloc<float>(&d_rhs, n * sizeof(float)));
cudacall(cudaMemcpy(d_mat, A, n*n*sizeof(float), cudaMemcpyHostToDevice));
cudacall(cudaMemcpy(d_rhs, b, n*sizeof(float), cudaMemcpyHostToDevice));
const float one = 1.0f;
// Left-sided solve, no transpose, exactly one RHS column.
cublascall(cublasStrsm(h, CUBLAS_SIDE_LEFT, uplo, CUBLAS_OP_N, diagt, n, 1, &one, d_mat, n, d_rhs, n));
// cublasStrsm overwrites the RHS buffer with the solution.
cudacall(cudaMemcpy(x, d_rhs, n*sizeof(float), cudaMemcpyDeviceToHost));
cudaFree(d_mat); cudaFree(d_rhs); cublasDestroy(h);
}
// End-to-end demonstration: solve A x = b for a fixed 3x3 system using a
// pivoted LU factorization on the GPU followed by two triangular solves.
// Expected solution is x = (1, 2, 3); see the matrix/vector comments below.
void test_solve()
{
// solve Ax=b
// 1. Perform LU on A
// 2. using pivot sequence, rearrange b -> b'
// 3. perform TRSM on Ly=b'
// 4. perform TRSM on Ux=y
// A = |0 1 4 |
// |3 3 9 |
// |4 10 16|
// x = |1|
// |2|
// |3|
// b = |14|
// |36|
// |72|
const int n = 3;
// has 3,2,3 pivot order
// A stored column by column (column-major), as cuBLAS expects.
float A_col_major[n*n] = { 0, 3, 4,
1, 3, 10,
4, 9, 16 };
float b1[n] = {14, 36, 72};
/* another example - has 3,3,3 pivot order
float A_transpose[n*n] = { 0, 1, 4,
3, 3, 9,
4, 10, 16 };
float b2[n] = {18, 37, 70};
*/
float result_x[n];   // final solution x
int pivot[n];        // 1-based row-interchange sequence from the LU step
float L[n*n];        // unit-lower factor (diagonal entries unused: DIAG_UNIT)
float U[n*n];        // upper factor
float y[n];          // intermediate solution of L y = b'
//Select matrix by setting "a"
float *a = A_col_major;
float *b = b1;
printf("Input:\n\n");
// Prints the column-major storage row by row, i.e. the transpose of A.
for(int i=0; i<n; i++)
{
for(int j=0; j<n; j++)
printf("%f\t",a[i*n+j]);
printf("\n");
}
printf("\n\n");
// 1. LU on A
LU(a,L,U,n,pivot);
#ifdef DEBUG_PRINT
printf("L:\n\n");
for(int i=0; i<n; i++)
{
for(int j=0; j<n; j++)
printf("%f\t",L[i*n+j]);
printf("\n");
}
printf("\n\n");
printf("U:\n\n");
for(int i=0; i<n; i++)
{
for(int j=0; j<n; j++)
printf("%f\t",U[i*n+j]);
printf("\n");
}
printf("\n\n");
#endif
// 2. Rearrange b
// Permutes b in place, replaying the row interchanges the LU performed.
rearrange(b,pivot,n,DIR_FORWARD);
#ifdef DEBUG_PRINT
for (int i = 0; i < n; i++) printf("b'[%d] = %f\n", i, b[i]);
#endif
// 3. TRSM on Ly=b
TRSM(L, y, b, n, CUBLAS_FILL_MODE_LOWER, CUBLAS_DIAG_UNIT);
// 4. TRSM on Ux=y
TRSM(U, result_x, y, n, CUBLAS_FILL_MODE_UPPER, CUBLAS_DIAG_NON_UNIT);
fprintf(stdout, "Solution:\n\n");
for(int i=0; i<n; i++)
{
printf("%f\n",result_x[i]);
}
}
// Program entry point: run the pivoted-LU / triangular-solve demonstration.
int main(void)
{
test_solve();
return EXIT_SUCCESS;
}
$ nvcc -o t853 t853.cu -lcublas
$ ./t853
Input:
0.000000 3.000000 4.000000
1.000000 3.000000 10.000000
4.000000 9.000000 16.000000
Solution:
1.000000
2.000000
3.000000
$
Note that for this simple test case I used cublas getrfBatched to do the matrix LU factorization, rather than LAPACK, but I think it should behave similarly to LAPACK.
Also note that I'm not intending to comment on the "best approaches for linear system solutions" but merely to explain how the approach you mapped out might be made to work.