
Issues with cuBLAS performance on a batched complex linear system solver


I am new to CUDA and cuBLAS, and recently I have been trying to use the batched cuBLAS API to solve multiple systems of linear equations. Here's my code:

The size of the matrix is N, and the number of matrices (batch size) is numOfMat.

#include <stdio.h>
#include <stdlib.h>
#include <cstdio>
#include <iostream>
#include <chrono>
#include <random>
#include <cuda.h>
#include <cusolverDn.h>
#include <cuda_runtime.h>
#include <cuComplex.h>      // deal with complex numbers
#include <cuda_profiler_api.h>
    
using namespace std::chrono;

#define N 6
#define numOfMat 500000
#define gpuErrchk(ans) { gpuAssert((ans), __FILE__, __LINE__); }
inline void gpuAssert(cudaError_t code, const char *file, int line, bool abort=true)
{
   if (code != cudaSuccess)
   {
      fprintf(stderr,"GPUassert: %s %s %d\n", cudaGetErrorString(code), file, line);
      if (abort) exit(code);
   }
}

int main() {
    std::random_device device;
    std::mt19937 generator(device());
    std::uniform_real_distribution<double> distribution(1., 5.);
    high_resolution_clock::time_point t1;
    high_resolution_clock::time_point t2;
    double duration = 0;

    // step 1: create the cuBLAS handle
    cublasHandle_t cublas_handle;
    cublasStatus_t stat = cublasCreate_v2(&cublas_handle);
    if (stat != CUBLAS_STATUS_SUCCESS) printf("cublasCreate failed\n");

    int* PivotArray;
    int* infoArray;
    cudaError_t cudaStatUnified1 = cudaSuccess;
    cudaError_t cudaStatUnified2 = cudaSuccess;
    const cuDoubleComplex alpha = make_cuDoubleComplex(1.0, 0.0);
    cudaStatUnified1 = cudaMallocManaged(&PivotArray, N * numOfMat * sizeof(int));
    cudaStatUnified2 = cudaMallocManaged(&infoArray, numOfMat * sizeof(int));
    if ((cudaSuccess != cudaStatUnified1) || (cudaSuccess != cudaStatUnified2))
        std::cout << "unified memory allocation failed!" << std::endl;

    //ALLOCATE MEMORY - using unified memory
    cuDoubleComplex** h_A;
    gpuErrchk( cudaMallocManaged(&h_A, sizeof(cuDoubleComplex*) * numOfMat) );
    gpuErrchk( cudaMallocManaged(&(h_A[0]), sizeof(cuDoubleComplex) * numOfMat * N * N) );
    for (int nm = 1; nm < numOfMat; nm++)
        h_A[nm] = h_A[nm-1] + N * N;

    cuDoubleComplex** h_b;
    gpuErrchk( cudaMallocManaged(&h_b, sizeof(cuDoubleComplex*) * numOfMat) );
    gpuErrchk( cudaMallocManaged(&(h_b[0]), sizeof(cuDoubleComplex) * numOfMat * N) );
    for (int nm = 1; nm < numOfMat; nm++)
        h_b[nm] = h_b[nm-1] + N;

    // FILL MATRICES (column-major element order, as cuBLAS expects)
    for (int nm = 0; nm < numOfMat; nm++)
      for (int i = 0; i < N; i++)
        for (int j = 0; j < N; j++)
          h_A[nm][j * N + i] = make_cuDoubleComplex(distribution(generator), distribution(generator));

    // FILL COEFFICIENTS
    for (int nm = 0; nm < numOfMat; nm++)
      for (int i = 0; i < N; i++)
        h_b[nm][i] = make_cuDoubleComplex(distribution(generator), distribution(generator));

    t1 = high_resolution_clock::now();

    // step 2: batched LU factorization (in place, with partial pivoting)
    stat = cublasZgetrfBatched(cublas_handle, N, h_A, N, PivotArray, infoArray, numOfMat);
    if (stat != CUBLAS_STATUS_SUCCESS) printf("cublasZgetrfBatched failed\n");
    gpuErrchk( cudaDeviceSynchronize() );
    // check if the input matrix is singular
    /*for (int i = 0; i < numOfMat; i++)
      if (infoArray[i] != 0) {
        fprintf(stderr, "Factorization of matrix %d Failed: Matrix may be singular\n", i);
      }*/

    // step 3: forward and back substitution with the triangular factors
    // --- each cublasZtrsmBatched call solves one triangular system per matrix (one RHS)
    // --- the result overwrites h_b
    // NOTE: a fully correct solve would first apply the row pivots in PivotArray
    // to h_b (getrf computes P*A = L*U); that step is omitted here since only
    // the timing is of interest
    // solve L*y = b (unit lower triangular factor)
    stat = cublasZtrsmBatched(cublas_handle, CUBLAS_SIDE_LEFT, CUBLAS_FILL_MODE_LOWER,
                              CUBLAS_OP_N, CUBLAS_DIAG_UNIT, N, 1, &alpha,
                              h_A, N, h_b, N, numOfMat);
    if (stat != CUBLAS_STATUS_SUCCESS) printf("cublasZtrsmBatched (lower) failed\n");
    gpuErrchk( cudaDeviceSynchronize() );

    // solve U*x = y (non-unit upper triangular factor)
    stat = cublasZtrsmBatched(cublas_handle, CUBLAS_SIDE_LEFT, CUBLAS_FILL_MODE_UPPER,
                              CUBLAS_OP_N, CUBLAS_DIAG_NON_UNIT, N, 1, &alpha,
                              h_A, N, h_b, N, numOfMat);
    if (stat != CUBLAS_STATUS_SUCCESS) printf("cublasZtrsmBatched (upper) failed\n");
    gpuErrchk( cudaDeviceSynchronize() );

    t2 = high_resolution_clock::now();
    duration = duration_cast<microseconds>(t2 - t1).count();
    std::cout << duration << std::endl;

    cublasDestroy_v2(cublas_handle);   // release the handle
    return 0;
}
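
As a sanity check on the measurement itself, the same interval can also be timed with CUDA events instead of std::chrono; the following is a minimal sketch bracketing the same three cuBLAS calls:

// Sketch: CUDA-event timing of the factorize/solve region above.
cudaEvent_t start, stop;
gpuErrchk( cudaEventCreate(&start) );
gpuErrchk( cudaEventCreate(&stop) );

gpuErrchk( cudaEventRecord(start) );
// ... the cublasZgetrfBatched / cublasZtrsmBatched calls go here ...
gpuErrchk( cudaEventRecord(stop) );
gpuErrchk( cudaEventSynchronize(stop) );  // wait until the recorded work is done

float ms = 0.0f;
gpuErrchk( cudaEventElapsedTime(&ms, start, stop) );  // elapsed time in milliseconds
std::cout << ms * 1000.0f << " us" << std::endl;

gpuErrchk( cudaEventDestroy(start) );
gpuErrchk( cudaEventDestroy(stop) );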

The code works fine, but when I plot the computational time versus the number of matrices, the curve looks like this:

[plot: computational time vs. number of matrices]

My question is: why does the computational time scale linearly with the number of matrices? Intuitively, I would expect the curve to flatten out once the batch size gets large enough; however, even when the batch size reaches 500,000, the time still grows linearly with the batch size.

How can that be? Is there an explanation for this behaviour?


Solution

  • I think you need to look more closely at your data. If I run a modification of your code on Google Colab (Tesla T4), I get this:

    [plot: runtime vs. number of matrices, linear scales]

    Which looks largely like your figure. But look more closely (log scales help):

    [plot: runtime vs. number of matrices, log-log scales]

    You can clearly see that up to a certain point (around 2^6 = 64 matrices), the runtime is largely independent of the number of matrices, but beyond that it scales linearly with the batch size. That is the transition from being able to run the whole workload in parallel to reaching the device's parallel capacity and having to schedule successive groups of operations to finish the workload. You might infer that this particular GPU runs out of parallel capacity somewhere between 64 and 128 concurrent operations (the T4 has 40 SMs, so it might well be 80 if each SM can accommodate two of these operations concurrently), after which runtime scales with multiples of that limiting size.
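
    To produce such a plot yourself, sweep the batch size and time only the factorize/solve calls, as in the sketch below (an assumed benchmark shape, not the exact code behind the figures; it reuses cublas_handle, h_A, h_b, PivotArray, infoArray, alpha and gpuErrchk from the question, allocated once at the maximum size):

    // Hypothetical batch-size sweep; all buffers are sized for numOfMat,
    // so any smaller batch count can safely be passed to the batched calls.
    for (int batch = 1; batch <= numOfMat; batch *= 2) {
        // For rigor, refill h_A here: cublasZgetrfBatched overwrites it.
        auto start = high_resolution_clock::now();
        cublasZgetrfBatched(cublas_handle, N, h_A, N, PivotArray, infoArray, batch);
        cublasZtrsmBatched(cublas_handle, CUBLAS_SIDE_LEFT, CUBLAS_FILL_MODE_LOWER,
                           CUBLAS_OP_N, CUBLAS_DIAG_UNIT, N, 1, &alpha,
                           h_A, N, h_b, N, batch);
        cublasZtrsmBatched(cublas_handle, CUBLAS_SIDE_LEFT, CUBLAS_FILL_MODE_UPPER,
                           CUBLAS_OP_N, CUBLAS_DIAG_NON_UNIT, N, 1, &alpha,
                           h_A, N, h_b, N, batch);
        gpuErrchk( cudaDeviceSynchronize() );
        auto stop = high_resolution_clock::now();
        std::cout << batch << "\t"
                  << duration_cast<microseconds>(stop - start).count() << "\n";
    }

    Plotting the two printed columns on log-log axes reproduces the knee visible above.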

    This is completely normal behaviour for any parallel computation architecture I am familiar with.
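
    A back-of-the-envelope model makes that shape concrete: if the device can execute roughly P of these small factorizations concurrently and one "wave" takes time t0, the runtime behaves like t0 * ceil(batch / P), which is flat below P and linear above it. The toy program below uses made-up values for P and t0 purely to illustrate the curve:

    // Toy capacity model; P and t0 are illustrative assumptions, not measurements.
    #include <cstdio>
    #include <cmath>

    int main() {
        const double P  = 80.0;   // assumed concurrent-operation capacity
        const double t0 = 50.0;   // assumed time per wave, in microseconds
        for (int batch = 1; batch <= 4096; batch *= 2)
            std::printf("%5d matrices -> ~%4.0f us\n",
                        batch, t0 * std::ceil(batch / P));
        return 0;
    }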