I am stuck at a small problem. I've got to solve a linear System A * x = b
.
The matrix A
gets decomposed by an LU-factorization (LAPACK
). As result I get the factorized Matrix and the pivotarray. After that I want to solve the two linear Systems: U * x = y
and L * y = b
on the GPU with *cublasDtrsm*
. But because of the row interchanges from dgetrf
in LAPACK
I would have to pass the pivot array to cublas
. But the *cublasDtrsm*
-function doesn't offer anything for this. Without the pivot array I get wrong results.
I already searched for disabling pivoting in LAPACK
, but with regard to stability it's not possible. Is there any hint on how to solve a linear equation system with LU-factorization?
If you wanted to use this particular approach (cublas trsm after LAPACK getrf), I believe you should be able to use cublas trsm with the L,U output of LAPACK by rearranging your b vector (or matrix) to match the rearrangement order that LAPACK performed during pivoting. I believe this order is given in the formula for ipiv
in the LAPACK documentation:
IPIV
IPIV is INTEGER array, dimension (min(M,N)) The pivot indices; for 1 <= i <= min(M,N), row i of the matrix was interchanged with row IPIV(i).
Here's a sample code that demonstrates the idea for a simple 3x3 test case with a single RHS vector:
$ cat t853.cu
#include <cstdio>
#include <cstdlib>
#include <cuda_runtime.h>
#include <cublas_v2.h>
// Error-check wrapper for CUDA runtime API calls: evaluates `call` once and,
// on any result other than cudaSuccess, prints file/line/reason to stderr,
// resets the device, and terminates the process.  The do { } while(0) shell
// makes the macro behave as a single statement (safe inside if/else).
// NOTE: comments are kept outside the macro body because a // comment before
// a trailing backslash would swallow the line continuation.
#define cudacall(call) \
do \
{ \
cudaError_t err = (call); \
if(cudaSuccess != err) \
{ \
fprintf(stderr,"CUDA Error:\nFile = %s\nLine = %d\nReason = %s\n", __FILE__, __LINE__, cudaGetErrorString(err)); \
cudaDeviceReset(); \
exit(EXIT_FAILURE); \
} \
} \
while (0)
// Error-check wrapper for cuBLAS calls: evaluates `call` once and, on any
// status other than CUBLAS_STATUS_SUCCESS, prints file/line and the numeric
// status code to stderr, resets the device, and terminates the process.
// Parallel in shape to cudacall above, but for cublasStatus_t return codes
// (cuBLAS has no cudaGetErrorString equivalent, hence the raw code).
#define cublascall(call) \
do \
{ \
cublasStatus_t status = (call); \
if(CUBLAS_STATUS_SUCCESS != status) \
{ \
fprintf(stderr,"CUBLAS Error:\nFile = %s\nLine = %d\nCode = %d\n", __FILE__, __LINE__, status); \
cudaDeviceReset(); \
exit(EXIT_FAILURE); \
} \
\
} \
while(0)
// In-place LU factorization (with partial pivoting) of the n x n column-major
// matrix held in device memory at src_d, via cublasSgetrfBatched with a batch
// of one.  On return src_d holds the packed L/U factors and pivot (host array
// of length n) holds the 1-based row-interchange indices reported by cuBLAS.
// Aborts the process on any CUDA/cuBLAS error or on a singular matrix.
void LU_device(float *src_d, int n, int *pivot)
{
cublasHandle_t handle;
cublascall(cublasCreate_v2(&handle));
const int batchSize = 1;
int *P, *INFO;  // device-side pivot indices and per-matrix status
cudacall(cudaMalloc<int>(&P,n * batchSize * sizeof(int)));
cudacall(cudaMalloc<int>(&INFO,batchSize * sizeof(int)));
int lda = n;
// getrfBatched expects a device array of device pointers, one per matrix.
float *A[] = { src_d };
float **A_d;
cudacall(cudaMalloc<float*>(&A_d,sizeof(A)));
cudacall(cudaMemcpy(A_d,A,sizeof(A),cudaMemcpyHostToDevice));
cublascall(cublasSgetrfBatched(handle,n,A_d,lda,P,INFO,batchSize));
int INFOh = 0;
cudacall(cudaMemcpy(&INFOh,INFO,sizeof(int),cudaMemcpyDeviceToHost));
cudacall(cudaMemcpy(pivot,P,n*batchSize*sizeof(int),cudaMemcpyDeviceToHost));
#ifdef DEBUG_PRINT
for (int qq = 0; qq < n*batchSize; qq++) {printf("pivot[%d] = %d\n", qq, pivot[qq]); }
#endif
// BUG FIX: cublasSgetrfBatched sets info = k (1..n) when U(k,k) == 0, i.e.
// the matrix is singular at pivot k.  The previous test (INFOh == n) only
// caught a zero pivot in the *last* position and silently accepted every
// other singular matrix.  Any nonzero info is a failure.
if(INFOh != 0)
{
fprintf(stderr, "Factorization Failed: Matrix is singular (info = %d)\n", INFOh);
cudaDeviceReset();
exit(EXIT_FAILURE);
}
cudaFree(P); cudaFree(INFO); cudaFree(A_d); cublasDestroy(handle);
}
// Host wrapper around LU_device: uploads the n x n matrix src to the device,
// factorizes it in place, then unpacks the packed result into the separate
// host matrices L and U.  pivot receives the 1-based interchange sequence.
// L's diagonal retains the packed values (not forced to 1); the caller is
// expected to solve with a unit-diagonal option so those entries are unread.
void LU(float* src, float* L, float *U, int n, int *pivot)
{
const size_t bytes = (size_t)n * n * sizeof(float);
float *d_A;
cudacall(cudaMalloc<float>(&d_A, bytes));
cudacall(cudaMemcpy(d_A, src, bytes, cudaMemcpyHostToDevice));
LU_device(d_A, n, pivot);
// Both factors start as copies of the packed LU result...
cudacall(cudaMemcpy(L, d_A, bytes, cudaMemcpyDeviceToHost));
cudacall(cudaMemcpy(U, d_A, bytes, cudaMemcpyDeviceToHost));
// ...then each entry strictly on the wrong side of the diagonal is cleared.
for (int r = 0; r < n; r++) {
for (int c = 0; c < n; c++) {
if (c < r) L[r*n + c] = 0.0f;        // strict lower part belongs to L only
else if (c > r) U[r*n + c] = 0.0f;   // strict upper part belongs to U only
}
}
cudaFree(d_A);
}
// Applies the LAPACK/cuBLAS row-interchange sequence in pivot[] (1-based
// indices) to the length-n vector vec, in place.  dir == DIR_FORWARD replays
// the swaps in factorization order (permute b before the triangular solves);
// dir == DIR_REVERSE replays them backwards, undoing a forward pass.
void rearrange(float *vec, int *pivot, int n, int dir){
#define DIR_FORWARD 0
#define DIR_REVERSE 1
if (dir == DIR_FORWARD) {
for (int i = 0; i < n; i++) {
int j = pivot[i] - 1;  // convert 1-based pivot index to 0-based
float tmp = vec[i];
vec[i] = vec[j];
vec[j] = tmp;
}
} else {
for (int i = n - 1; i >= 0; i--) {
int j = pivot[i] - 1;
float tmp = vec[i];
vec[i] = vec[j];
vec[j] = tmp;
}
}
}
// Single-RHS triangular solve on the GPU: solves op(A) * x = b where A is the
// n x n triangular factor (host, column-major), b the length-n right-hand
// side (host), and the solution is written to x (host).  uplo selects which
// triangle of A is referenced; diagt tells cuBLAS whether the diagonal is
// implicitly unit (CUBLAS_DIAG_UNIT) or stored (CUBLAS_DIAG_NON_UNIT).
void TRSM(float *A, float *x, float *b, int n, cublasFillMode_t uplo, cublasDiagType_t diagt ){
cublasHandle_t h;
cublascall(cublasCreate_v2(&h));
float *d_mat, *d_rhs;
cudacall(cudaMalloc<float>(&d_mat, n*n * sizeof(float)));
cudacall(cudaMalloc<float>(&d_rhs, n * sizeof(float)));
cudacall(cudaMemcpy(d_mat, A, n*n*sizeof(float), cudaMemcpyHostToDevice));
cudacall(cudaMemcpy(d_rhs, b, n*sizeof(float), cudaMemcpyHostToDevice));
const float one = 1.0f;
// Left-sided solve, no transpose, exactly one RHS column.
cublascall(cublasStrsm(h, CUBLAS_SIDE_LEFT, uplo, CUBLAS_OP_N, diagt, n, 1, &one, d_mat, n, d_rhs, n));
// cublasStrsm overwrites the RHS buffer with the solution.
cudacall(cudaMemcpy(x, d_rhs, n*sizeof(float), cudaMemcpyDeviceToHost));
cudaFree(d_mat); cudaFree(d_rhs); cublasDestroy(h);
}
// End-to-end demonstration: solve A x = b for a fixed 3x3 system using a
// pivoted LU factorization on the GPU followed by two triangular solves.
// Expected solution is x = (1, 2, 3); see the matrix/vector comments below.
void test_solve()
{
// solve Ax=b
// 1. Perform LU on A
// 2. using pivot sequence, rearrange b -> b'
// 3. perform TRSM on Ly=b'
// 4. perform TRSM on Ux=y
// A = |0 1 4 |
// |3 3 9 |
// |4 10 16|
// x = |1|
// |2|
// |3|
// b = |14|
// |36|
// |72|
const int n = 3;
// has 3,2,3 pivot order
// A stored column by column (column-major), as cuBLAS expects.
float A_col_major[n*n] = { 0, 3, 4,
1, 3, 10,
4, 9, 16 };
float b1[n] = {14, 36, 72};
/* another example - has 3,3,3 pivot order
float A_transpose[n*n] = { 0, 1, 4,
3, 3, 9,
4, 10, 16 };
float b2[n] = {18, 37, 70};
*/
float result_x[n];   // final solution x
int pivot[n];        // 1-based row-interchange sequence from the LU step
float L[n*n];        // unit-lower factor (diagonal entries unused: DIAG_UNIT)
float U[n*n];        // upper factor
float y[n];          // intermediate solution of L y = b'
//Select matrix by setting "a"
float *a = A_col_major;
float *b = b1;
printf("Input:\n\n");
// Prints the column-major storage row by row, i.e. the transpose of A.
for(int i=0; i<n; i++)
{
for(int j=0; j<n; j++)
printf("%f\t",a[i*n+j]);
printf("\n");
}
printf("\n\n");
// 1. LU on A
LU(a,L,U,n,pivot);
#ifdef DEBUG_PRINT
printf("L:\n\n");
for(int i=0; i<n; i++)
{
for(int j=0; j<n; j++)
printf("%f\t",L[i*n+j]);
printf("\n");
}
printf("\n\n");
printf("U:\n\n");
for(int i=0; i<n; i++)
{
for(int j=0; j<n; j++)
printf("%f\t",U[i*n+j]);
printf("\n");
}
printf("\n\n");
#endif
// 2. Rearrange b
// Permutes b in place, replaying the row interchanges the LU performed.
rearrange(b,pivot,n,DIR_FORWARD);
#ifdef DEBUG_PRINT
for (int i = 0; i < n; i++) printf("b'[%d] = %f\n", i, b[i]);
#endif
// 3. TRSM on Ly=b
TRSM(L, y, b, n, CUBLAS_FILL_MODE_LOWER, CUBLAS_DIAG_UNIT);
// 4. TRSM on Ux=y
TRSM(U, result_x, y, n, CUBLAS_FILL_MODE_UPPER, CUBLAS_DIAG_NON_UNIT);
fprintf(stdout, "Solution:\n\n");
for(int i=0; i<n; i++)
{
printf("%f\n",result_x[i]);
}
}
// Program entry point: run the pivoted-LU / triangular-solve demonstration.
int main(void)
{
test_solve();
return EXIT_SUCCESS;
}
$ nvcc -o t853 t853.cu -lcublas
$ ./t853
Input:
0.000000 3.000000 4.000000
1.000000 3.000000 10.000000
4.000000 9.000000 16.000000
Solution:
1.000000
2.000000
3.000000
$
Note that for this simple test case I used cublas getrfBatched to do the matrix LU factorization, rather than LAPACK, but I think it should behave similarly to LAPACK.
Also note that I'm not intending to comment on the "best approaches for linear system solutions" but merely to explain how the approach you mapped out might be made to work.