Search code examples
c++image-processingcudanvprof

CUDA concurrent kernel launch not working


I'm writing a CUDA program for image processing. Same kernel "processOneChannel" will be launched for RGB channels.

Below I try to specify streams for the three kernel launches so they can be processed concurrently. But nvprof says they are still launched one after another...

There are two other kernels before and after these three, and I don't want them to run concurrently.

Basically I want the following: seperateChannels --> processOneChannel(x3) --> recombineChannels

Please advice what I did wrong..

void kernelLauncher(const ushort4 * const h_inputImageRGBA, ushort4 * const d_inputImageRGBA,
                        ushort4* const d_outputImageRGBA, const size_t numRows, const size_t numCols,
                        unsigned short *d_redProcessed, 
                        unsigned short *d_greenProcessed, 
                        unsigned short *d_blueProcessed,
                        unsigned short *d_prand)
{
    int MAXTHREADSx = 512;
    int MAXTHREADSy = 1; 
    int nBlockX = numCols / MAXTHREADSx + 1;
    int nBlockY = numRows / MAXTHREADSy + 1;

  const dim3 blockSize(MAXTHREADSx,MAXTHREADSy,1);

  const dim3 gridSize(nBlockX,nBlockY,1);

  // cudaDeviceSynchronize(); checkCudaErrors(cudaGetLastError());

  int nstreams = 5;
  cudaStream_t *streams = (cudaStream_t *) malloc(nstreams * sizeof(cudaStream_t));

  for (int i = 0; i < nstreams; i++)
  {
      checkCudaErrors(cudaStreamCreateWithFlags(&(streams[i]),cudaStreamNonBlocking));
  }

  separateChannels<<<gridSize,blockSize>>>(d_inputImageRGBA, 
                                          (int)numRows, 
                                          (int)numCols, 
                                          d_red, 
                                          d_green, 
                                          d_blue);
  cudaDeviceSynchronize(); 

  checkCudaErrors(cudaGetLastError());

    processOneChannel<<<gridSize,blockSize,0,streams[0]>>>(d_red,
                                                          d_redProcessed,
                                                          (int)numRows,(int)numCols,
                                                          d_filter,d_prand);

    processOneChannel<<<gridSize,blockSize,0,streams[1]>>>(d_green,
                                                          d_greenProcessed,
                                                          (int)numRows,(int)numCols,
                                                          d_filter,d_prand);

    processOneChannel<<<gridSize,blockSize,0,streams[2]>>>(d_blue,
                                                          d_blueProcessed,
                                                          (int)numRows,(int)numCols,
                                                          d_filter,d_prand);
  cudaDeviceSynchronize(); 
    checkCudaErrors(cudaGetLastError());

  recombineChannels<<<gridSize, blockSize>>>(d_redProcessed,
                                             d_greenProcessed,
                                             d_blueProcessed,
                                             d_outputImageRGBA,
                                             numRows,
                                             numCols);
      for (int i = 0; i < nstreams; i++)
    {
        cudaStreamDestroy(streams[i]);
    }

    free(streams);
  cudaDeviceSynchronize(); checkCudaErrors(cudaGetLastError());
}

Here's nvprof gpu trace output. Note the memcpy before the kernel launches are to pass filter data for the processing, so they cannot run in concurrency with kernel launches.

==10001== Profiling result:
   Start  Duration            Grid Size      Block Size     Regs*    SSMem*    DSMem*      Size  Throughput           Device   Context    Stream  Name
1.02428s  2.2400us                    -               -         -         -         -  28.125MB   1e+04GB/s  GeForce GT 750M         1        13  [CUDA memset]
1.02855s  18.501ms                    -               -         -         -         -  28.125MB  1.4846GB/s  GeForce GT 750M         1        13  [CUDA memcpy HtoD]
1.21959s  1.1371ms                    -               -         -         -         -  1.7580MB  1.5098GB/s  GeForce GT 750M         1        13  [CUDA memcpy HtoD]
1.22083s  1.3440us                    -               -         -         -         -  7.0313MB   5e+03GB/s  GeForce GT 750M         1        13  [CUDA memset]
1.22164s  1.3440us                    -               -         -         -         -  7.0313MB   5e+03GB/s  GeForce GT 750M         1        13  [CUDA memset]
1.22243s  3.6480us                    -               -         -         -         -  7.0313MB   2e+03GB/s  GeForce GT 750M         1        13  [CUDA memset]
1.22349s  10.240us                    -               -         -         -         -  8.0000KB  762.94MB/s  GeForce GT 750M         1        13  [CUDA memcpy HtoD]
1.22351s  6.6021ms           (6 1441 1)       (512 1 1)        12        0B        0B         -           -  GeForce GT 750M         1        13  separateChannels(...) [123]
1.23019s  10.661ms           (6 1441 1)       (512 1 1)        36      192B        0B         -           -  GeForce GT 750M         1        14  processOneChannel(...) [133]
1.24085s  10.518ms           (6 1441 1)       (512 1 1)        36      192B        0B         -           -  GeForce GT 750M         1        15  processOneChannel(...) [141]
1.25137s  10.779ms           (6 1441 1)       (512 1 1)        36      192B        0B         -           -  GeForce GT 750M         1        16  processOneChannel(...) [149]
1.26372s  5.7810ms           (6 1441 1)       (512 1 1)        15        0B        0B         -           -  GeForce GT 750M         1        13  recombineChannels(...) [159]
1.26970s  19.859ms                    -               -         -         -         -  28.125MB  1.3831GB/s  GeForce GT 750M         1        13  [CUDA memcpy DtoH]

Here's CMakeList.txt where I passed -default-stream per-thread to nvcc

cmake_minimum_required(VERSION 2.6 FATAL_ERROR)

find_package(OpenCV REQUIRED)
find_package(CUDA REQUIRED)

set(
    CUDA_NVCC_FLAGS
    ${CUDA_NVCC_FLAGS};
     -default-stream per-thread
)

file( GLOB  hdr *.hpp *.h )
file( GLOB  cu  *.cu)

SET (My_files main.cpp)

# Project Executable
CUDA_ADD_EXECUTABLE(My ${My_files} ${hdr} ${cu})
target_link_libraries(My ${OpenCV_LIBS})

Solution

  • Each kernel is launching 6*1441 which is over 8000 blocks, of 512 threads each. That is filling the machine, preventing blocks from subsequent kernel launches from executing.

    The machine has a capacity. The maximum instantaneous capacity in blocks is equal to the number of SMs in your GPU multiplied by the maximum number of blocks per SM, both of which are specifications that you can retrieve with the deviceQuery app. When you fill it up, it cannot process more blocks until some of the already running blocks have retired. This process will continue for the first kernel launch until most of the blocks have retired. Then the second kernel will start executing.