Contradiction of cublasDgetrfBatched and cublasDtrsmBatched when to solve array of linear systems using cuBLAS

I have lots of dense linear systems which I want to solve in cuBLAS Batched format. So my plan is

use cublasDgetrfBatched for batched LU decomposition
Then use cublasDtrsmBatched for batched lower triangular and batched upper triangular part one by one.

The code is given as

 #include<stdio.h>
 #include<stdlib.h>
 #include<cuda_runtime.h> 
 #include<device_launch_parameters.h>
 #include<cublas_v2.h>

 const int N = 32;
 const int Nmatrices = N;

 __global__ void initiate(double *d_A, double *d_B)
    {
     int i = threadIdx.x;       int j = blockIdx.x;

     int id = j*N*N + i*N;      int idb = j*N + i;  

     for(int k = 0; k< N ; k++)
         {
        d_A[id + k] = 0.0;

        if(k == i-2)    d_A[id + k] = 1.0; 
        if(k == i-1)    d_A[id + k] = 2.0; 
        if(k == i)      d_A[id + k] = 8.0;
        if(k == i+1)    d_A[id + k] = 2.0;
        if(k == i+2)    d_A[id + k] = 1.0;  
        } 
     d_B[idb] = 8.0;   
    }


int main()
 {
    cublasHandle_t handle;      cublasSafeCall(cublasCreate(&handle));

// Allocate device space for the input matrices
  double *d_A_sys; cudaMalloc((void**)&d_A_sys, N*N*Nmatrices*sizeof(double));
  double *d_B_sys; cudaMalloc((void**)&d_B_sys, N*Nmatrices  *sizeof(double));

// Allocate host space for the solution
  double *h_B_sys = (double *)malloc(N*Nmatrices*sizeof(double));

// kernel for initiat d_A_sys and d_B_sys
  initiate<<<Nmatrices, N>>>(d_A_sys, d_B_sys);

//Creating the array of pointers needed as input/output to the batched getrf
  double **h_A_pointers = (double **)malloc(Nmatrices*sizeof(double *));
  for (int i = 0; i < Nmatrices; i++) h_A_pointers[i] = d_A_sys + i*N*N;

  double **h_b_pointers = (double **)malloc(Nmatrices*sizeof(double *));
  for (int i = 0; i < Nmatrices; i++) h_B_pointers[i] = d_B_sys + i*N;

  double **d_A_pointers;
  cudaMalloc((void**)&d_A_pointers, Nmatrices*sizeof(double *));
  cudaMemcpy(d_A_pointers, h_A_pointers, Nmatrices*sizeof(double *), cudaMemcpyHostToDevice);


  double **d_b_pointers;
  cudaMalloc((void**)&d_b_pointers, Nmatrices*sizeof(double *));
  cudaMemcpy(d_b_pointers, h_b_pointers, Nmatrices*sizeof(double *), cudaMemcpyHostToDevice);

  int *d_InfoArrays; cudaMalloc((void**)&d_InfoArrays,  Nmatrices*sizeof(int));
  int *h_InfoArrays = (int *)malloc(Nmatrices*sizeof(int));

//Batched LU decomposition
   cublasDgetrfBatched(handle, N, d_A_pointers, N, NULL, d_InfoArrays, Nmatrices));

  //Batched Lower triangular part
  cublasDtrsmBatched(handle, 
                   CUBLAS_SIDE_LEFT, 
                   CUBLAS_FILL_MODE_LOWER,
                   CUBLAS_OP_N,
                   CUBLAS_DIAG_UNIT,
                   N,
                   N,
                   &alpha,
                   d_A_pointers,
                   N,
                   d_b_pointers,
                   N,
                   Nmatrices);

  //Batched Upper triangular part
  cublasDtrsmBatched(handle,
                   CUBLAS_SIDE_LEFT,
                   CUBLAS_FILL_MODE_UPPER,
                   CUBLAS_OP_N,
                   CUBLAS_DIAG_NON_UNIT,
                   N,
                   N,
                   &alpha,
                   d_A_pointers,
                   N,
                   d_b_pointers,
                   N,
                   Nmatrices);

  cudaMemcpy(h_B_sys, d_B_sys, N*Nmatrices*sizeof(double), cudaMemcpyDeviceToHost);
  printf("Print out the solutions \n");

  cublasDestroy(handle);
  gpuErrchk(cudaDeviceReset());
  return 0;
 }

cublasDgetrfBatched and cublasDtrsmBatched demand d_A_pointers should be in double type but when I execute, the later one giving me compiling error like this see the pic:

How to overcome the problem, any help?

Solution

You can solve the const correctness issue by doing something like this:

const double **d_A_pointers;
cudaMalloc((void**)&d_A_pointers, Nmatrices*sizeof(double *));

....

//Batched LU decomposition
cublasDgetrfBatched(handle, N, const_cast<double**>(d_A_pointers), N, NULL, d_InfoArrays, Nmatrices);

i.e. cast away the constness in the cublasDgetrfBatched

The other thing you obviously have wrong in the posted code is the dimensions of the inputs in the cublasDtrsmBatched calls. I believe you want something like:

//Batched Lower triangular part
cublasDtrsmBatched(handle, 
        CUBLAS_SIDE_LEFT, 
        CUBLAS_FILL_MODE_LOWER,
        CUBLAS_OP_N,
        CUBLAS_DIAG_UNIT,
        N,
        1,
        &alpha,
        d_A_pointers,
        N,
        d_b_pointers,
        N,
        Nmatrices);

i.e. the input size of the RHS matrix for your example is Nx1, not NxN (you are not solving N RHS problems per factorised matrix, only one).

There are possibly other problems in your code (note that CUBLAS, like most reference BLAS implementations requires inputs in column major ordering by default), and the code you posted in your question doesn't actually compile for other reasons, so it is impossible to say more for sure.