Search code examples
cudamatrix-multiplication

Can you help me find the bug in the CUDA Kernel?


First I wrote the following listing which multiplies two matrices.

A =B= [[1 2 3 4],
       [5 6 7 8],
       [9 10 11 12],
       [13 14 15 16]]

A*B = [[90 100 110 120], 
       [202 228 254 280], 
       [314 356 398 440], 
       [426 484 542 600]] 
#include <cstdarg>
#include <fstream>
#include <iomanip>
#include <iostream>

using namespace std;

const int ROWS1 = 4;//1024;
const int COLS1 = 4;//1024;

const int ROWS2 = 4;//1024;
const int COLS2 = 4;//1024;

const int ROWS3 = ROWS1;
const int COLS3 = COLS2;

const int TILE_ROW_SIZE = 2;
const int TILE_COL_SIZE = 2;

#define IDX(tile_size, tile_i, relative_i) (tile_size * tile_i + relative_i)

void MultiplyAsSumOuterProductOfVectors(int *A, int *B, int *C,
int tile_row_size, int tile_col_size,
int cols1, int rows1, int cols2) {
  for (int tile_i = 0; tile_i < tile_col_size; tile_i++) {//x
      for (int tile_j = 0; tile_j < tile_row_size; tile_j++) {//y
          for (int tile_r = 0; tile_r < tile_col_size; tile_r++) {//x
              for (int cell_r = 0; cell_r < cols1; cell_r++) {//x
                  for (int cell_i = 0; cell_i < rows1; cell_i++) {//y
                      for (int cell_j = 0; cell_j < cols2; cell_j++) {//x
                          int r = IDX(TILE_COL_SIZE, tile_r, cell_r);
                          int i = IDX(TILE_ROW_SIZE, tile_i, cell_i);
                          int j = IDX(TILE_COL_SIZE, tile_j, cell_j);
                          C[i * COLS3 + j] += A[i * COLS1 + r] * B[r * COLS2 + j];
                      }
                  }
              }
          }
      }
  }
}

void printMatrix(int *mat, int rr, int cc) {
  for (int i = 0; i < rr; i++) {
    for (int j = 0; j < cc; j++) {
      printf("%d ", mat[i * cc + j]);
    }
    printf("\n");
  }
  printf("\n");
}

void allocateMatrix(int *&a, int rows, int cols) {
  a = new int[rows * cols];
}

void freeMatrix(int *a) {
  delete[] a;
}

void initMatrix(int *mat, int rr, int cc) {
  int init = 1;
  for (int i = 0; i < rr; i++) {
    for (int j = 0; j < cc; j++) {
      mat[i * cc + j] = init++;
    }
  }
}

void initMatrixZero(int *mat, int rr, int cc) {
  for (int i = 0; i < rr; i++) {
    for (int j = 0; j < cc; j++) {
      mat[i * cc + j] = 0;
    }
  }
}

int main() {
  int *A, *B, *C;

  allocateMatrix(A, ROWS1, COLS1);
  initMatrix(A, ROWS1, COLS1);

  allocateMatrix(B, ROWS2, COLS2);
  initMatrix(B, ROWS2, COLS2);

  allocateMatrix(C, ROWS3, COLS3);
  initMatrixZero(C, ROWS3, COLS3);

  MultiplyAsSumOuterProductOfVectors(A, B, C, TILE_ROW_SIZE, TILE_COL_SIZE, COLS1 / TILE_COL_SIZE, ROWS1 / TILE_ROW_SIZE, COLS2 / TILE_COL_SIZE);

  printMatrix(C, ROWS3, COLS3);

  freeMatrix(A);
  freeMatrix(B);
  freeMatrix(C);

  return 0;
}

Then I translated that into the following CUDA program.

However, the output seems to be incorrect:

    66    116     86    136
   146    276    198    328
   226    436    310    520
   306    596    422    712

Can you help me find the bug in the CUDA Kernel?

#include <iostream>
#include <iomanip>

using namespace std;

const int ROWS1 = 4;//1024;
const int COLS1 = 4;//1024;

const int ROWS2 = 4;//1024;
const int COLS2 = 4;//1024;

const int ROWS3 = ROWS1;
const int COLS3 = COLS2;

const int TILE_ROW_SIZE = 2;//32;
const int TILE_COL_SIZE = 2;//32;

#define IDX(tile_size, tile_i, relative_i) (tile_size * tile_i + relative_i)


__global__ void MultiplyAsSumOuterProductOfVectors(int *A, int *B, int *C,
  int tile_row_size, int tile_col_size,
  int cols1, int rows1, int cols2) 
{
  int tile_i = blockIdx.y;
  int tile_j = blockIdx.x;

  int cell_i = threadIdx.y;
  int cell_j = threadIdx.x;

  for (int tile_r = 0; tile_r < tile_col_size; tile_r++)
  {
    int r = IDX(TILE_COL_SIZE, tile_r, cell_j);

    __shared__ int subA[TILE_ROW_SIZE][TILE_COL_SIZE];
    __shared__ int subB[TILE_ROW_SIZE][TILE_COL_SIZE];

    subA[cell_i][cell_j] = A[IDX(cols1, (tile_i * TILE_ROW_SIZE + cell_i), r)];
    subB[cell_i][cell_j] = B[IDX(cols2, r, (tile_j * TILE_COL_SIZE + cell_j))];

    __syncthreads();

    for (int cell_r = 0; cell_r < TILE_ROW_SIZE; cell_r++) 
    {
      int c_i = tile_i * TILE_ROW_SIZE + cell_i;
      int c_j = tile_j * TILE_COL_SIZE + cell_j;
      
      if (c_i < rows1 && c_j < cols2)
      {
        C[IDX(COLS3, c_i, c_j)] += subA[cell_i][cell_r] * subB[cell_r][cell_j];
      }
    }

    __syncthreads();
  }
}

void printMatrix(int *mat, int rr, int cc) {
  for (int i = 0; i < rr; i++) {
    for (int j = 0; j < cc; j++) {
      cout << setw(6) << mat[i * cc + j] << " ";
    }
    cout << endl;
  }
  cout << endl;
}

void allocateMatrix(int *&a, int rows, int cols) {
  a = new int[rows * cols];
}

void freeMatrix(int *a) {
  delete[] a;
}

void initMatrix(int *mat, int rr, int cc) {
  int init = 1;
  for (int i = 0; i < rr; i++) {
    for (int j = 0; j < cc; j++) {
      mat[i * cc + j] = init++;
    }
  }
}

void initMatrixZero(int *mat, int rr, int cc) {
  for (int i = 0; i < rr; i++) {
    for (int j = 0; j < cc; j++) {
      mat[i * cc + j] = 0;
    }
  }
}

int main() {
  int *A, *B, *C;
  int *d_A, *d_B, *d_C;

  allocateMatrix(A, ROWS1, COLS1);
  initMatrix(A, ROWS1, COLS1);

  allocateMatrix(B, ROWS2, COLS2);
  initMatrix(B, ROWS2, COLS2);

  allocateMatrix(C, ROWS3, COLS3);
  initMatrixZero(C, ROWS3, COLS3);

  // Allocate device memory
  cudaMalloc((void **)&d_A, ROWS1 * COLS1 * sizeof(int));
  cudaMalloc((void **)&d_B, ROWS2 * COLS2 * sizeof(int));
  cudaMalloc((void **)&d_C, ROWS3 * COLS3 * sizeof(int));

  // Copy input matrices from host to device
  cudaMemcpy(d_A, A, ROWS1 * COLS1 * sizeof(int), cudaMemcpyHostToDevice);
  cudaMemcpy(d_B, B, ROWS2 * COLS2 * sizeof(int), cudaMemcpyHostToDevice);

  // Set grid and block dimensions
  dim3 gridSize(COLS3 / TILE_COL_SIZE, ROWS3 / TILE_ROW_SIZE);
  dim3 blockSize(TILE_COL_SIZE, TILE_ROW_SIZE);

  // Launch the kernel
  MultiplyAsSumOuterProductOfVectors<<<gridSize, blockSize>>>(d_A, d_B, d_C, TILE_ROW_SIZE, TILE_COL_SIZE, COLS1, ROWS1, COLS2);

  // Copy result matrix from device to host
  cudaMemcpy(C, d_C, ROWS3 * COLS3 * sizeof(int), cudaMemcpyDeviceToHost);

  // Print the result matrix
  cout << "Result Matrix:" << endl;
  printMatrix(C, ROWS3, COLS3);

  // Free device memory
  cudaFree(d_A);
  cudaFree(d_B);
  cudaFree(d_C);

  // Free host memory
  freeMatrix(A);
  freeMatrix(B);
  freeMatrix(C);

  return 0;
}

Solution

  • There are at least several issues. While considering these you may wish to refer to the shared memory tiled matrix multiply example in the programming guide:

    • You are not initializing the C-matrix device memory. This is necessary because cudaMalloc does not set an area to zero, and the only thing your kernel does is add to C.

    • You are not calculating the tile movement iterations correctly:

      for (int tile_r = 0; tile_r < tile_col_size; tile_r++)
      

      it happens to be OK for this case because 4/2 = 2. However in the general case, you want to divide the total number of rows by the size of the tile, as is the case in the programming guide example.

    • It's not a problem here, per se, but I believe you can make your index calculation macro more "robust" by using additional parenthesis:

      #define IDX(tile_size, tile_i, relative_i) ((tile_size) * (tile_i) + (relative_i))
      

      this allows you to use various expressions in your code without having to be as careful about parameter association.

    • Your tile indexing calculations are not correct. I'm not going to try to decipher what you were thinking. However the basic idea is that we want the A matrix tile to stride horizontally (along a row) and the B matrix tile to stride vertically (along a column).

    The following code has these issues addressed and seems to work for me:

    # cat t5.cu
    #include <iostream>
    #include <iomanip>
    
    using namespace std;
    
    const int ROWS1 = 4;//1024;
    const int COLS1 = 4;//1024;
    
    const int ROWS2 = 4;//1024;
    const int COLS2 = 4;//1024;
    
    const int ROWS3 = ROWS1;
    const int COLS3 = COLS2;
    
    const int TILE_ROW_SIZE = 2;//32;
    const int TILE_COL_SIZE = 2;//32;
    
    #define IDX(tile_size, tile_i, relative_i) ((tile_size) * (tile_i) + (relative_i))
    
    
    __global__ void MultiplyAsSumOuterProductOfVectors(int *A, int *B, int *C,
      int tile_row_size, int tile_col_size,
      int cols1, int rows1, int cols2)
    {
      int tile_i = blockIdx.y;
      int tile_j = blockIdx.x;
    
      int cell_i = threadIdx.y;
      int cell_j = threadIdx.x;
    
      for (int tile_r = 0; tile_r < cols2/tile_col_size; tile_r++)
      {
    
        __shared__ int subA[TILE_ROW_SIZE][TILE_COL_SIZE];
        __shared__ int subB[TILE_ROW_SIZE][TILE_COL_SIZE];
    
        // make A tile stride along the row so tile_r appears in column indexing
        subA[cell_i][cell_j] = A[IDX(cols1, tile_i*TILE_ROW_SIZE+cell_i, tile_r*TILE_COL_SIZE+cell_j)];
        // make B tile stride along the column so tile_r appears in row indexing
        subB[cell_i][cell_j] = B[IDX(cols2, tile_r*TILE_ROW_SIZE+cell_i, tile_j*TILE_COL_SIZE+cell_j)];
    
        __syncthreads();
    
        for (int cell_r = 0; cell_r < TILE_ROW_SIZE; cell_r++)
            {
          int c_i = tile_i * TILE_ROW_SIZE + cell_i;
          int c_j = tile_j * TILE_COL_SIZE + cell_j;
    
          if (c_i < rows1 && c_j < cols2)
              {
            C[IDX(COLS3, c_i, c_j)] += subA[cell_i][cell_r] * subB[cell_r][cell_j];
          }
        }
    
        __syncthreads();
      }
    }
    
    void printMatrix(int *mat, int rr, int cc) {
      for (int i = 0; i < rr; i++) {
        for (int j = 0; j < cc; j++) {
          cout << setw(6) << mat[i * cc + j] << " ";
        }
        cout << endl;
      }
      cout << endl;
    }
    
    void allocateMatrix(int *&a, int rows, int cols) {
      a = new int[rows * cols];
    }
    
    void freeMatrix(int *a) {
      delete[] a;
    }
    
    void initMatrix(int *mat, int rr, int cc) {
      int init = 1;
      for (int i = 0; i < rr; i++) {
        for (int j = 0; j < cc; j++) {
          mat[i * cc + j] = init++;
        }
      }
    }
    
    void initMatrixZero(int *mat, int rr, int cc) {
      for (int i = 0; i < rr; i++) {
        for (int j = 0; j < cc; j++) {
          mat[i * cc + j] = 0;
        }
      }
    }
    
    int main() {
      int *A, *B, *C;
      int *d_A, *d_B, *d_C;
    
      allocateMatrix(A, ROWS1, COLS1);
      initMatrix(A, ROWS1, COLS1);
    
      allocateMatrix(B, ROWS2, COLS2);
      initMatrix(B, ROWS2, COLS2);
    
      allocateMatrix(C, ROWS3, COLS3);
      initMatrixZero(C, ROWS3, COLS3);
    
      // Allocate device memory
      cudaMalloc((void **)&d_A, ROWS1 * COLS1 * sizeof(int));
      cudaMalloc((void **)&d_B, ROWS2 * COLS2 * sizeof(int));
      cudaMalloc((void **)&d_C, ROWS3 * COLS3 * sizeof(int));
    
      // Copy input matrices from host to device
      cudaMemcpy(d_A, A, ROWS1 * COLS1 * sizeof(int), cudaMemcpyHostToDevice);
      cudaMemcpy(d_B, B, ROWS2 * COLS2 * sizeof(int), cudaMemcpyHostToDevice);
      cudaMemcpy(d_C, C, ROWS3 * COLS3 * sizeof(int), cudaMemcpyHostToDevice);
    
      // Set grid and block dimensions
      dim3 gridSize(COLS3 / TILE_COL_SIZE, ROWS3 / TILE_ROW_SIZE);
      dim3 blockSize(TILE_COL_SIZE, TILE_ROW_SIZE);
    
      // Launch the kernel
      MultiplyAsSumOuterProductOfVectors<<<gridSize, blockSize>>>(d_A, d_B, d_C, TILE_ROW_SIZE, TILE_COL_SIZE, COLS1, ROWS1, COLS2);
    
      // Copy result matrix from device to host
      cudaMemcpy(C, d_C, ROWS3 * COLS3 * sizeof(int), cudaMemcpyDeviceToHost);
    
      // Print the result matrix
      cout << "Result Matrix:" << endl;
      printMatrix(C, ROWS3, COLS3);
    
      // Free device memory
      cudaFree(d_A);
      cudaFree(d_B);
      cudaFree(d_C);
    
      // Free host memory
      freeMatrix(A);
      freeMatrix(B);
      freeMatrix(C);
    
      return 0;
    }
    
    # nvcc -o t5 t5.cu
    # compute-sanitizer ./t5
    ========= COMPUTE-SANITIZER
    Result Matrix:
        90    100    110    120
       202    228    254    280
       314    356    398    440
       426    484    542    600
    
    ========= ERROR SUMMARY: 0 errors
    #
    

    There are numerous limitations to this approach, such as generally needing square tiles, square matrices, and matrix dimensions that are whole-number-divisible by the tile dimension.