
How can I check the progress of matrix multiplication?


I need to display the intermediate progress of a matrix multiplication.

// actMtxSize is the total number of output elements, i.e. mtxSize*mtxSize
for(unsigned int col=0; col<mtxSize; col++) {
    for(unsigned int row=0; row<mtxSize; row++) {
        unsigned int tmp = 0;
        for(unsigned int idx=0; idx<mtxSize; idx++) {
            tmp += h_A[col*mtxSize+idx] * h_B[idx*mtxSize+row];
        }
        h_Rs[col*mtxSize+row] = tmp;
        // percentage (scaled by 100) of output elements written so far
        int rate_tmp = (col*mtxSize + (row+1))*100;
        // Maybe like this...
        fprintf(stdout, "Progress : %d.%d %%\r", rate_tmp/actMtxSize, rate_tmp%actMtxSize);
        fflush(stdout);
    }
}

In the case of host code (using the CPU), this is easy because it runs sequentially, so progress can be checked at each step.

But in the case of the GPU, which processes in parallel, what should I do?

Once the kernel is running, it does not return until kernel execution finishes.

Thus, I can't check intermediate data while the kernel is executing.

I think I need to use an asynchronous kernel call, but I do not know how.

Even if an asynchronous kernel launch is used, in order to see all of the data scattered across several blocks on the multiprocessors, do I have to use atomicAdd() (in other words, global memory access), which carries some overhead?

I would like to know how this is done in CUDA.


Solution

  • Here is code that demonstrates how to check progress from a matrix multiply kernel:

    #include <stdio.h>
    #include <stdlib.h>
    #include <time.h>
    #define TIME_INC 100000000
    #define INCS 10
    #define USE_PROGRESS 1
    #define MAT_DIMX 4000
    #define MAT_DIMY MAT_DIMX
    
    #define cudaCheckErrors(msg) \
        do { \
            cudaError_t __err = cudaGetLastError(); \
            if (__err != cudaSuccess) { \
                fprintf(stderr, "Fatal error: %s (%s at %s:%d)\n", \
                    msg, cudaGetErrorString(__err), \
                    __FILE__, __LINE__); \
                fprintf(stderr, "*** FAILED - ABORTING\n"); \
                exit(1); \
            } \
        } while (0)
    
    __global__ void mykernel(volatile int *data){
    
      unsigned long long time;                  // clock64() returns a 64-bit counter
      for (int i = 0; i < INCS; i++){
        atomicAdd((int *)data,1);               // bump the host-visible progress counter
        __threadfence_system();                 // flush the write so the host can see it
        time = clock64();
        while((clock64() - time)<TIME_INC) {};  // spin to simulate work between updates
      }
      printf("progress check finished\n");
    }
    
    __global__ void matmult(float *a, float *b, float *c, unsigned int rowA, unsigned int colA, unsigned int colB, volatile int *progress){
      unsigned int row = threadIdx.x+blockDim.x*blockIdx.x;
      unsigned int col = threadIdx.y+blockDim.y*blockIdx.y;
      if ((row < rowA) && (col < colB)){
        float temp = 0.0f;
        for (unsigned int k = 0; k < colA; k++)
          temp += a[(row*colA)+k] * b[(k*colB) + col];
        c[(row*colB)+col] = temp;
    #if USE_PROGRESS
        if (!(threadIdx.x || threadIdx.y)){  // one thread per block reports completion
          atomicAdd((int *)progress, 1);
          __threadfence_system();
        }
    #endif
      }
    }
    
    int main(){
    // simple test to demonstrate reading progress data from kernel
      volatile int *d_data, *h_data;
      cudaSetDeviceFlags(cudaDeviceMapHost);
      cudaCheckErrors("cudaSetDeviceFlags error");
      cudaHostAlloc((void **)&h_data, sizeof(int), cudaHostAllocMapped);
      cudaCheckErrors("cudaHostAlloc error");
      cudaHostGetDevicePointer((int **)&d_data, (int *)h_data, 0);
      cudaCheckErrors("cudaHostGetDevicePointer error");
      *h_data = 0;
      printf("kernel starting\n");
      mykernel<<<1,1>>>(d_data);
      cudaCheckErrors("kernel fail");
      int value = 0;
      do {
        int value1 = *h_data;
        if (value1 > value){
          printf("h_data = %d\n", value1);
          value = value1;
        }
      } while (value < (INCS-1));
      cudaDeviceSynchronize();
      cudaCheckErrors("kernel fail 2");
    
    // now try matrix multiply with progress
    
      float *h_c, *d_a, *d_b, *d_c;
      h_c = (float *)malloc(MAT_DIMX*MAT_DIMY*sizeof(float));
      if (h_c == NULL) {printf("malloc fail\n"); return 1;}
      cudaMalloc((void **)&d_a, MAT_DIMX*MAT_DIMY*sizeof(float));
      cudaCheckErrors("cudaMalloc a fail");
      cudaMalloc((void **)&d_b, MAT_DIMX*MAT_DIMY*sizeof(float));
      cudaCheckErrors("cudaMalloc b fail");
      cudaMalloc((void **)&d_c, MAT_DIMX*MAT_DIMY*sizeof(float));
      cudaCheckErrors("cudaMalloc c fail");
    
      for (int i = 0; i < MAT_DIMX*MAT_DIMY; i++) h_c[i] = rand()/(float)RAND_MAX;
      cudaMemcpy(d_a, h_c, MAT_DIMX*MAT_DIMY*sizeof(float), cudaMemcpyHostToDevice);
      cudaCheckErrors("cudaMemcpy a fail");
      cudaMemcpy(d_b, h_c, MAT_DIMX*MAT_DIMY*sizeof(float), cudaMemcpyHostToDevice);
      cudaCheckErrors("cudaMemcpy b fail");
    
      cudaEvent_t start, stop;
      cudaEventCreate(&start); cudaEventCreate(&stop);
      *h_data=0;
      dim3 block(16,16);
      dim3 grid(((MAT_DIMX+block.x-1)/block.x), ((MAT_DIMY+block.y-1)/block.y));
      printf("matrix multiply kernel starting\n");
      cudaEventRecord(start);
      matmult<<<grid,block>>>(d_a, d_b, d_c, MAT_DIMY, MAT_DIMX, MAT_DIMX, d_data);
      cudaEventRecord(stop);
    #if USE_PROGRESS
      unsigned int num_blocks = grid.x*grid.y;
      float my_progress = 0.0f;
      value = 0;
      printf("Progress:\n");
      do {
        cudaEventQuery(stop);  // may help WDDM scenario
        int value1 = *h_data;
        float kern_progress = (float)value1/(float)num_blocks;
        if ((kern_progress - my_progress) > 0.1f){
          printf("percent complete = %2.1f\n", (kern_progress*100));
          my_progress = kern_progress;
        }
      } while (my_progress < 0.9f);
      printf("\n");
    #endif
      cudaEventSynchronize(stop);
      cudaCheckErrors("event sync fail");
      float et;
      cudaEventElapsedTime(&et, start, stop);
      cudaCheckErrors("event elapsed time fail");
      cudaDeviceSynchronize();
      cudaCheckErrors("mat mult kernel fail");
      printf("matrix multiply finished.  elapsed time = %f milliseconds\n", et);
    
    
      return 0;
    }
    

    The code associated with the first kernel call is just to demonstrate the basic idea of having a kernel report its progress back; a stripped-down, self-contained version of the same pattern is sketched below.
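
    Distilled from the listing above, the essential pattern is: enable mapped pinned ("zero-copy") memory, allocate a host-resident counter the device can see, have the kernel bump it with atomicAdd() plus __threadfence_system(), and poll it from the host while the kernel runs. The following minimal sketch is illustrative only; error checking is omitted, and the kernel and variable names are mine, not part of the original listing:

    #include <stdio.h>

    // Illustrative kernel: each of n dummy "work items" bumps the counter.
    __global__ void progress_demo(volatile int *progress, int n){
      for (int i = 0; i < n; i++){
        atomicAdd((int *)progress, 1);  // bump the host-visible counter
        __threadfence_system();         // flush the write so the host can see it
      }
    }

    int main(){
      volatile int *h_prog, *d_prog;
      cudaSetDeviceFlags(cudaDeviceMapHost);   // must precede context creation
      cudaHostAlloc((void **)&h_prog, sizeof(int), cudaHostAllocMapped);
      cudaHostGetDevicePointer((int **)&d_prog, (int *)h_prog, 0);
      *h_prog = 0;
      const int n = 1000;
      progress_demo<<<1,1>>>(d_prog, n);
      int last = 0;
      while (last < n){                        // poll until the kernel reports done
        int now = *h_prog;
        if (now > last){
          printf("progress: %d/%d\r", now, n);
          fflush(stdout);
          last = now;
        }
      }
      cudaDeviceSynchronize();
      printf("\ndone\n");
      return 0;
    }

    With a trivial kernel like this the updates arrive almost at once; in a real kernel there is work between updates, so the host sees the counter climb gradually.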

    The second part of the code shows a sample, naive matrix multiply on the GPU, with the GPU reporting its progress back. I have included the ability to remove the progress-check code via a preprocessor macro, as well as the ability to time the matrix multiply kernel. For the case here, there was no discernible difference in timing with or without the progress code. So while the progress-reporting code presumably adds some overhead, compared to the scope of a reasonably sized matrix multiply kernel, it adds no significant time that I can see.
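
    As a usage note: USE_PROGRESS is hard-coded to 1 in the listing, so comparing the two timings means editing the source. A possible tweak (my suggestion, not part of the original code) is to make the macro overridable from the nvcc command line:

    // Replace the "#define USE_PROGRESS 1" line with an overridable default.
    //
    // Build (assuming the file is saved as progress.cu):
    //   nvcc -o progress progress.cu                         # progress reporting on
    //   nvcc -DUSE_PROGRESS=0 -o progress_plain progress.cu  # progress code compiled out
    #ifndef USE_PROGRESS
    #define USE_PROGRESS 1
    #endif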

    Some other uses of signalling are discussed here.