cuda matrix-multiplication

Can you help me find the bug in the CUDA Kernel?

First I wrote the following listing which multiplies two matrices.

A =B= [[1 2 3 4],
       [5 6 7 8],
       [9 10 11 12],
       [13 14 15 16]]

A*B = [[90 100 110 120], 
       [202 228 254 280], 
       [314 356 398 440], 
       [426 484 542 600]]

#include <cstdarg>
#include <fstream>
#include <iomanip>
#include <iostream>

using namespace std;

const int ROWS1 = 4;//1024;
const int COLS1 = 4;//1024;

const int ROWS2 = 4;//1024;
const int COLS2 = 4;//1024;

const int ROWS3 = ROWS1;
const int COLS3 = COLS2;

const int TILE_ROW_SIZE = 2;
const int TILE_COL_SIZE = 2;

#define IDX(tile_size, tile_i, relative_i) (tile_size * tile_i + relative_i)

void MultiplyAsSumOuterProductOfVectors(int *A, int *B, int *C,
int tile_row_size, int tile_col_size,
int cols1, int rows1, int cols2) {
  for (int tile_i = 0; tile_i < tile_col_size; tile_i++) {//x
      for (int tile_j = 0; tile_j < tile_row_size; tile_j++) {//y
          for (int tile_r = 0; tile_r < tile_col_size; tile_r++) {//x
              for (int cell_r = 0; cell_r < cols1; cell_r++) {//x
                  for (int cell_i = 0; cell_i < rows1; cell_i++) {//y
                      for (int cell_j = 0; cell_j < cols2; cell_j++) {//x
                          int r = IDX(TILE_COL_SIZE, tile_r, cell_r);
                          int i = IDX(TILE_ROW_SIZE, tile_i, cell_i);
                          int j = IDX(TILE_COL_SIZE, tile_j, cell_j);
                          C[i * COLS3 + j] += A[i * COLS1 + r] * B[r * COLS2 + j];
                      }
                  }
              }
          }
      }
  }
}

void printMatrix(int *mat, int rr, int cc) {
  for (int i = 0; i < rr; i++) {
    for (int j = 0; j < cc; j++) {
      printf("%d ", mat[i * cc + j]);
    }
    printf("\n");
  }
  printf("\n");
}

void allocateMatrix(int *&a, int rows, int cols) {
  a = new int[rows * cols];
}

void freeMatrix(int *a) {
  delete[] a;
}

void initMatrix(int *mat, int rr, int cc) {
  int init = 1;
  for (int i = 0; i < rr; i++) {
    for (int j = 0; j < cc; j++) {
      mat[i * cc + j] = init++;
    }
  }
}

void initMatrixZero(int *mat, int rr, int cc) {
  for (int i = 0; i < rr; i++) {
    for (int j = 0; j < cc; j++) {
      mat[i * cc + j] = 0;
    }
  }
}

int main() {
  int *A, *B, *C;

  allocateMatrix(A, ROWS1, COLS1);
  initMatrix(A, ROWS1, COLS1);

  allocateMatrix(B, ROWS2, COLS2);
  initMatrix(B, ROWS2, COLS2);

  allocateMatrix(C, ROWS3, COLS3);
  initMatrixZero(C, ROWS3, COLS3);

  MultiplyAsSumOuterProductOfVectors(A, B, C, TILE_ROW_SIZE, TILE_COL_SIZE, COLS1 / TILE_COL_SIZE, ROWS1 / TILE_ROW_SIZE, COLS2 / TILE_COL_SIZE);

  printMatrix(C, ROWS3, COLS3);

  freeMatrix(A);
  freeMatrix(B);
  freeMatrix(C);

  return 0;
}

Then I translated that into the following CUDA program.

However, the output seems to be incorrect:

    66    116     86    136
   146    276    198    328
   226    436    310    520
   306    596    422    712

Can you help me find the bug in the CUDA Kernel?

#include <iostream>
#include <iomanip>

using namespace std;

const int ROWS1 = 4;//1024;
const int COLS1 = 4;//1024;

const int ROWS2 = 4;//1024;
const int COLS2 = 4;//1024;

const int ROWS3 = ROWS1;
const int COLS3 = COLS2;

const int TILE_ROW_SIZE = 2;//32;
const int TILE_COL_SIZE = 2;//32;

#define IDX(tile_size, tile_i, relative_i) (tile_size * tile_i + relative_i)


__global__ void MultiplyAsSumOuterProductOfVectors(int *A, int *B, int *C,
  int tile_row_size, int tile_col_size,
  int cols1, int rows1, int cols2) 
{
  int tile_i = blockIdx.y;
  int tile_j = blockIdx.x;

  int cell_i = threadIdx.y;
  int cell_j = threadIdx.x;

  for (int tile_r = 0; tile_r < tile_col_size; tile_r++)
  {
    int r = IDX(TILE_COL_SIZE, tile_r, cell_j);

    __shared__ int subA[TILE_ROW_SIZE][TILE_COL_SIZE];
    __shared__ int subB[TILE_ROW_SIZE][TILE_COL_SIZE];

    subA[cell_i][cell_j] = A[IDX(cols1, (tile_i * TILE_ROW_SIZE + cell_i), r)];
    subB[cell_i][cell_j] = B[IDX(cols2, r, (tile_j * TILE_COL_SIZE + cell_j))];

    __syncthreads();

    for (int cell_r = 0; cell_r < TILE_ROW_SIZE; cell_r++) 
    {
      int c_i = tile_i * TILE_ROW_SIZE + cell_i;
      int c_j = tile_j * TILE_COL_SIZE + cell_j;
      
      if (c_i < rows1 && c_j < cols2)
      {
        C[IDX(COLS3, c_i, c_j)] += subA[cell_i][cell_r] * subB[cell_r][cell_j];
      }
    }

    __syncthreads();
  }
}

void printMatrix(int *mat, int rr, int cc) {
  for (int i = 0; i < rr; i++) {
    for (int j = 0; j < cc; j++) {
      cout << setw(6) << mat[i * cc + j] << " ";
    }
    cout << endl;
  }
  cout << endl;
}

void allocateMatrix(int *&a, int rows, int cols) {
  a = new int[rows * cols];
}

void freeMatrix(int *a) {
  delete[] a;
}

void initMatrix(int *mat, int rr, int cc) {
  int init = 1;
  for (int i = 0; i < rr; i++) {
    for (int j = 0; j < cc; j++) {
      mat[i * cc + j] = init++;
    }
  }
}

void initMatrixZero(int *mat, int rr, int cc) {
  for (int i = 0; i < rr; i++) {
    for (int j = 0; j < cc; j++) {
      mat[i * cc + j] = 0;
    }
  }
}

int main() {
  int *A, *B, *C;
  int *d_A, *d_B, *d_C;

  allocateMatrix(A, ROWS1, COLS1);
  initMatrix(A, ROWS1, COLS1);

  allocateMatrix(B, ROWS2, COLS2);
  initMatrix(B, ROWS2, COLS2);

  allocateMatrix(C, ROWS3, COLS3);
  initMatrixZero(C, ROWS3, COLS3);

  // Allocate device memory
  cudaMalloc((void **)&d_A, ROWS1 * COLS1 * sizeof(int));
  cudaMalloc((void **)&d_B, ROWS2 * COLS2 * sizeof(int));
  cudaMalloc((void **)&d_C, ROWS3 * COLS3 * sizeof(int));

  // Copy input matrices from host to device
  cudaMemcpy(d_A, A, ROWS1 * COLS1 * sizeof(int), cudaMemcpyHostToDevice);
  cudaMemcpy(d_B, B, ROWS2 * COLS2 * sizeof(int), cudaMemcpyHostToDevice);

  // Set grid and block dimensions
  dim3 gridSize(COLS3 / TILE_COL_SIZE, ROWS3 / TILE_ROW_SIZE);
  dim3 blockSize(TILE_COL_SIZE, TILE_ROW_SIZE);

  // Launch the kernel
  MultiplyAsSumOuterProductOfVectors<<<gridSize, blockSize>>>(d_A, d_B, d_C, TILE_ROW_SIZE, TILE_COL_SIZE, COLS1, ROWS1, COLS2);

  // Copy result matrix from device to host
  cudaMemcpy(C, d_C, ROWS3 * COLS3 * sizeof(int), cudaMemcpyDeviceToHost);

  // Print the result matrix
  cout << "Result Matrix:" << endl;
  printMatrix(C, ROWS3, COLS3);

  // Free device memory
  cudaFree(d_A);
  cudaFree(d_B);
  cudaFree(d_C);

  // Free host memory
  freeMatrix(A);
  freeMatrix(B);
  freeMatrix(C);

  return 0;
}

Solution

There are at least several issues. While considering these you may wish to refer to the shared memory tiled matrix multiply example in the programming guide:

You are not initializing the C-matrix device memory. This is necessary because cudaMalloc does not set an area to zero, and the only thing your kernel does is add to C.
You are not calculating the tile movement iterations correctly:
```
for (int tile_r = 0; tile_r < tile_col_size; tile_r++)
```
it happens to be OK for this case because 4/2 = 2. However in the general case, you want to divide the total number of rows by the size of the tile, as is the case in the programming guide example.
It's not a problem here, per se, but I believe you can make your index calculation macro more "robust" by using additional parenthesis:
```
#define IDX(tile_size, tile_i, relative_i) ((tile_size) * (tile_i) + (relative_i))
```
this allows you to use various expressions in your code without having to be as careful about parameter association.
Your tile indexing calculations are not correct. I'm not going to try to decipher what you were thinking. However the basic idea is that we want the A matrix tile to stride horizontally (along a row) and the B matrix tile to stride vertically (along a column).

The following code has these issues addressed and seems to work for me:

# cat t5.cu
#include <iostream>
#include <iomanip>

using namespace std;

const int ROWS1 = 4;//1024;
const int COLS1 = 4;//1024;

const int ROWS2 = 4;//1024;
const int COLS2 = 4;//1024;

const int ROWS3 = ROWS1;
const int COLS3 = COLS2;

const int TILE_ROW_SIZE = 2;//32;
const int TILE_COL_SIZE = 2;//32;

#define IDX(tile_size, tile_i, relative_i) ((tile_size) * (tile_i) + (relative_i))


__global__ void MultiplyAsSumOuterProductOfVectors(int *A, int *B, int *C,
  int tile_row_size, int tile_col_size,
  int cols1, int rows1, int cols2)
{
  int tile_i = blockIdx.y;
  int tile_j = blockIdx.x;

  int cell_i = threadIdx.y;
  int cell_j = threadIdx.x;

  for (int tile_r = 0; tile_r < cols2/tile_col_size; tile_r++)
  {

    __shared__ int subA[TILE_ROW_SIZE][TILE_COL_SIZE];
    __shared__ int subB[TILE_ROW_SIZE][TILE_COL_SIZE];

    // make A tile stride along the row so tile_r appears in column indexing
    subA[cell_i][cell_j] = A[IDX(cols1, tile_i*TILE_ROW_SIZE+cell_i, tile_r*TILE_COL_SIZE+cell_j)];
    // make B tile stride along the column so tile_r appears in row indexing
    subB[cell_i][cell_j] = B[IDX(cols2, tile_r*TILE_ROW_SIZE+cell_i, tile_j*TILE_COL_SIZE+cell_j)];

    __syncthreads();

    for (int cell_r = 0; cell_r < TILE_ROW_SIZE; cell_r++)
        {
      int c_i = tile_i * TILE_ROW_SIZE + cell_i;
      int c_j = tile_j * TILE_COL_SIZE + cell_j;

      if (c_i < rows1 && c_j < cols2)
          {
        C[IDX(COLS3, c_i, c_j)] += subA[cell_i][cell_r] * subB[cell_r][cell_j];
      }
    }

    __syncthreads();
  }
}

void printMatrix(int *mat, int rr, int cc) {
  for (int i = 0; i < rr; i++) {
    for (int j = 0; j < cc; j++) {
      cout << setw(6) << mat[i * cc + j] << " ";
    }
    cout << endl;
  }
  cout << endl;
}

void allocateMatrix(int *&a, int rows, int cols) {
  a = new int[rows * cols];
}

void freeMatrix(int *a) {
  delete[] a;
}

void initMatrix(int *mat, int rr, int cc) {
  int init = 1;
  for (int i = 0; i < rr; i++) {
    for (int j = 0; j < cc; j++) {
      mat[i * cc + j] = init++;
    }
  }
}

void initMatrixZero(int *mat, int rr, int cc) {
  for (int i = 0; i < rr; i++) {
    for (int j = 0; j < cc; j++) {
      mat[i * cc + j] = 0;
    }
  }
}

int main() {
  int *A, *B, *C;
  int *d_A, *d_B, *d_C;

  allocateMatrix(A, ROWS1, COLS1);
  initMatrix(A, ROWS1, COLS1);

  allocateMatrix(B, ROWS2, COLS2);
  initMatrix(B, ROWS2, COLS2);

  allocateMatrix(C, ROWS3, COLS3);
  initMatrixZero(C, ROWS3, COLS3);

  // Allocate device memory
  cudaMalloc((void **)&d_A, ROWS1 * COLS1 * sizeof(int));
  cudaMalloc((void **)&d_B, ROWS2 * COLS2 * sizeof(int));
  cudaMalloc((void **)&d_C, ROWS3 * COLS3 * sizeof(int));

  // Copy input matrices from host to device
  cudaMemcpy(d_A, A, ROWS1 * COLS1 * sizeof(int), cudaMemcpyHostToDevice);
  cudaMemcpy(d_B, B, ROWS2 * COLS2 * sizeof(int), cudaMemcpyHostToDevice);
  cudaMemcpy(d_C, C, ROWS3 * COLS3 * sizeof(int), cudaMemcpyHostToDevice);

  // Set grid and block dimensions
  dim3 gridSize(COLS3 / TILE_COL_SIZE, ROWS3 / TILE_ROW_SIZE);
  dim3 blockSize(TILE_COL_SIZE, TILE_ROW_SIZE);

  // Launch the kernel
  MultiplyAsSumOuterProductOfVectors<<<gridSize, blockSize>>>(d_A, d_B, d_C, TILE_ROW_SIZE, TILE_COL_SIZE, COLS1, ROWS1, COLS2);

  // Copy result matrix from device to host
  cudaMemcpy(C, d_C, ROWS3 * COLS3 * sizeof(int), cudaMemcpyDeviceToHost);

  // Print the result matrix
  cout << "Result Matrix:" << endl;
  printMatrix(C, ROWS3, COLS3);

  // Free device memory
  cudaFree(d_A);
  cudaFree(d_B);
  cudaFree(d_C);

  // Free host memory
  freeMatrix(A);
  freeMatrix(B);
  freeMatrix(C);

  return 0;
}

# nvcc -o t5 t5.cu
# compute-sanitizer ./t5
========= COMPUTE-SANITIZER
Result Matrix:
    90    100    110    120
   202    228    254    280
   314    356    398    440
   426    484    542    600

========= ERROR SUMMARY: 0 errors
#

There are numerous limitations to this approach, such as generally needing square tiles, square matrices, and matrix dimensions that are whole-number-divisible by the tile dimension.