First I wrote the following listing which multiplies two matrices.
A = B = [[ 1  2  3  4],
         [ 5  6  7  8],
         [ 9 10 11 12],
         [13 14 15 16]]

A*B = [[ 90 100 110 120],
       [202 228 254 280],
       [314 356 398 440],
       [426 484 542 600]]
#include <cstdio>
#include <fstream>
#include <iomanip>
#include <iostream>
using namespace std;
const int ROWS1 = 4;//1024;
const int COLS1 = 4;//1024;
const int ROWS2 = 4;//1024;
const int COLS2 = 4;//1024;
const int ROWS3 = ROWS1;
const int COLS3 = COLS2;
const int TILE_ROW_SIZE = 2;
const int TILE_COL_SIZE = 2;
#define IDX(tile_size, tile_i, relative_i) (tile_size * tile_i + relative_i)
void MultiplyAsSumOuterProductOfVectors(int *A, int *B, int *C,
int tile_row_size, int tile_col_size,
int cols1, int rows1, int cols2) {
for (int tile_i = 0; tile_i < tile_col_size; tile_i++) {//x
for (int tile_j = 0; tile_j < tile_row_size; tile_j++) {//y
for (int tile_r = 0; tile_r < tile_col_size; tile_r++) {//x
for (int cell_r = 0; cell_r < cols1; cell_r++) {//x
for (int cell_i = 0; cell_i < rows1; cell_i++) {//y
for (int cell_j = 0; cell_j < cols2; cell_j++) {//x
int r = IDX(TILE_COL_SIZE, tile_r, cell_r);
int i = IDX(TILE_ROW_SIZE, tile_i, cell_i);
int j = IDX(TILE_COL_SIZE, tile_j, cell_j);
C[i * COLS3 + j] += A[i * COLS1 + r] * B[r * COLS2 + j];
}
}
}
}
}
}
}
void printMatrix(int *mat, int rr, int cc) {
for (int i = 0; i < rr; i++) {
for (int j = 0; j < cc; j++) {
printf("%d ", mat[i * cc + j]);
}
printf("\n");
}
printf("\n");
}
void allocateMatrix(int *&a, int rows, int cols) {
a = new int[rows * cols];
}
void freeMatrix(int *a) {
delete[] a;
}
void initMatrix(int *mat, int rr, int cc) {
int init = 1;
for (int i = 0; i < rr; i++) {
for (int j = 0; j < cc; j++) {
mat[i * cc + j] = init++;
}
}
}
void initMatrixZero(int *mat, int rr, int cc) {
for (int i = 0; i < rr; i++) {
for (int j = 0; j < cc; j++) {
mat[i * cc + j] = 0;
}
}
}
int main() {
int *A, *B, *C;
allocateMatrix(A, ROWS1, COLS1);
initMatrix(A, ROWS1, COLS1);
allocateMatrix(B, ROWS2, COLS2);
initMatrix(B, ROWS2, COLS2);
allocateMatrix(C, ROWS3, COLS3);
initMatrixZero(C, ROWS3, COLS3);
MultiplyAsSumOuterProductOfVectors(A, B, C, TILE_ROW_SIZE, TILE_COL_SIZE, COLS1 / TILE_COL_SIZE, ROWS1 / TILE_ROW_SIZE, COLS2 / TILE_COL_SIZE);
printMatrix(C, ROWS3, COLS3);
freeMatrix(A);
freeMatrix(B);
freeMatrix(C);
return 0;
}
Then I translated that into the following CUDA program.
However, the output seems to be incorrect:
66 116 86 136
146 276 198 328
226 436 310 520
306 596 422 712
Can you help me find the bug in the CUDA kernel?
#include <iostream>
#include <iomanip>
using namespace std;
const int ROWS1 = 4;//1024;
const int COLS1 = 4;//1024;
const int ROWS2 = 4;//1024;
const int COLS2 = 4;//1024;
const int ROWS3 = ROWS1;
const int COLS3 = COLS2;
const int TILE_ROW_SIZE = 2;//32;
const int TILE_COL_SIZE = 2;//32;
#define IDX(tile_size, tile_i, relative_i) (tile_size * tile_i + relative_i)
__global__ void MultiplyAsSumOuterProductOfVectors(int *A, int *B, int *C,
int tile_row_size, int tile_col_size,
int cols1, int rows1, int cols2)
{
int tile_i = blockIdx.y;
int tile_j = blockIdx.x;
int cell_i = threadIdx.y;
int cell_j = threadIdx.x;
for (int tile_r = 0; tile_r < tile_col_size; tile_r++)
{
int r = IDX(TILE_COL_SIZE, tile_r, cell_j);
__shared__ int subA[TILE_ROW_SIZE][TILE_COL_SIZE];
__shared__ int subB[TILE_ROW_SIZE][TILE_COL_SIZE];
subA[cell_i][cell_j] = A[IDX(cols1, (tile_i * TILE_ROW_SIZE + cell_i), r)];
subB[cell_i][cell_j] = B[IDX(cols2, r, (tile_j * TILE_COL_SIZE + cell_j))];
__syncthreads();
for (int cell_r = 0; cell_r < TILE_ROW_SIZE; cell_r++)
{
int c_i = tile_i * TILE_ROW_SIZE + cell_i;
int c_j = tile_j * TILE_COL_SIZE + cell_j;
if (c_i < rows1 && c_j < cols2)
{
C[IDX(COLS3, c_i, c_j)] += subA[cell_i][cell_r] * subB[cell_r][cell_j];
}
}
__syncthreads();
}
}
void printMatrix(int *mat, int rr, int cc) {
for (int i = 0; i < rr; i++) {
for (int j = 0; j < cc; j++) {
cout << setw(6) << mat[i * cc + j] << " ";
}
cout << endl;
}
cout << endl;
}
void allocateMatrix(int *&a, int rows, int cols) {
a = new int[rows * cols];
}
void freeMatrix(int *a) {
delete[] a;
}
void initMatrix(int *mat, int rr, int cc) {
int init = 1;
for (int i = 0; i < rr; i++) {
for (int j = 0; j < cc; j++) {
mat[i * cc + j] = init++;
}
}
}
void initMatrixZero(int *mat, int rr, int cc) {
for (int i = 0; i < rr; i++) {
for (int j = 0; j < cc; j++) {
mat[i * cc + j] = 0;
}
}
}
int main() {
int *A, *B, *C;
int *d_A, *d_B, *d_C;
allocateMatrix(A, ROWS1, COLS1);
initMatrix(A, ROWS1, COLS1);
allocateMatrix(B, ROWS2, COLS2);
initMatrix(B, ROWS2, COLS2);
allocateMatrix(C, ROWS3, COLS3);
initMatrixZero(C, ROWS3, COLS3);
// Allocate device memory
cudaMalloc((void **)&d_A, ROWS1 * COLS1 * sizeof(int));
cudaMalloc((void **)&d_B, ROWS2 * COLS2 * sizeof(int));
cudaMalloc((void **)&d_C, ROWS3 * COLS3 * sizeof(int));
// Copy input matrices from host to device
cudaMemcpy(d_A, A, ROWS1 * COLS1 * sizeof(int), cudaMemcpyHostToDevice);
cudaMemcpy(d_B, B, ROWS2 * COLS2 * sizeof(int), cudaMemcpyHostToDevice);
// Set grid and block dimensions
dim3 gridSize(COLS3 / TILE_COL_SIZE, ROWS3 / TILE_ROW_SIZE);
dim3 blockSize(TILE_COL_SIZE, TILE_ROW_SIZE);
// Launch the kernel
MultiplyAsSumOuterProductOfVectors<<<gridSize, blockSize>>>(d_A, d_B, d_C, TILE_ROW_SIZE, TILE_COL_SIZE, COLS1, ROWS1, COLS2);
// Copy result matrix from device to host
cudaMemcpy(C, d_C, ROWS3 * COLS3 * sizeof(int), cudaMemcpyDeviceToHost);
// Print the result matrix
cout << "Result Matrix:" << endl;
printMatrix(C, ROWS3, COLS3);
// Free device memory
cudaFree(d_A);
cudaFree(d_B);
cudaFree(d_C);
// Free host memory
freeMatrix(A);
freeMatrix(B);
freeMatrix(C);
return 0;
}
There are several issues. While considering them, you may wish to refer to the shared-memory tiled matrix multiply example in the CUDA programming guide.
You are not initializing the C matrix in device memory. This is necessary because cudaMalloc
does not zero the memory it allocates, and the only thing your kernel does is add to C.
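For example, either of the following (a minimal sketch reusing the d_C, C, ROWS3, and COLS3 names from your listing) would zero the device copy of C before the kernel launch; the corrected code below uses the second form:
// option 1: zero the device buffer directly
cudaMemset(d_C, 0, ROWS3 * COLS3 * sizeof(int));
// option 2: copy the already-zeroed host C to the device
cudaMemcpy(d_C, C, ROWS3 * COLS3 * sizeof(int), cudaMemcpyHostToDevice);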
You are not calculating the tile movement iterations correctly:
for (int tile_r = 0; tile_r < tile_col_size; tile_r++)
It happens to be OK in this case because 4/2 = 2. However, in the general case you want to divide the total number of rows of B (the shared dimension) by the tile size, as is done in the programming guide example.
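In other words, something like this (as in the corrected listing below; for these square matrices cols2 equals the shared dimension):
for (int tile_r = 0; tile_r < cols2 / tile_col_size; tile_r++)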
It's not a problem here, per se, but I believe you can make your index calculation macro more "robust" by using additional parentheses:
#define IDX(tile_size, tile_i, relative_i) ((tile_size) * (tile_i) + (relative_i))
This allows you to pass expressions as macro arguments without having to be as careful about operator precedence.
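For example (a hypothetical call, not one taken from your code):
// without the extra parentheses:
//   IDX(cols1, tile_i + 1, cell_j)  ->  cols1 * tile_i + 1 + cell_j          (wrong)
// with the parenthesized version:
//   IDX(cols1, tile_i + 1, cell_j)  ->  (cols1) * (tile_i + 1) + (cell_j)    (intended)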
Your tile indexing calculations are not correct. I'm not going to try to decipher what you were thinking. However, the basic idea is that we want the A matrix tile to stride horizontally (along a row) and the B matrix tile to stride vertically (along a column).
The following code has these issues addressed and seems to work for me:
# cat t5.cu
#include <iostream>
#include <iomanip>
using namespace std;
const int ROWS1 = 4;//1024;
const int COLS1 = 4;//1024;
const int ROWS2 = 4;//1024;
const int COLS2 = 4;//1024;
const int ROWS3 = ROWS1;
const int COLS3 = COLS2;
const int TILE_ROW_SIZE = 2;//32;
const int TILE_COL_SIZE = 2;//32;
#define IDX(tile_size, tile_i, relative_i) ((tile_size) * (tile_i) + (relative_i))
__global__ void MultiplyAsSumOuterProductOfVectors(int *A, int *B, int *C,
int tile_row_size, int tile_col_size,
int cols1, int rows1, int cols2)
{
int tile_i = blockIdx.y;
int tile_j = blockIdx.x;
int cell_i = threadIdx.y;
int cell_j = threadIdx.x;
for (int tile_r = 0; tile_r < cols2/tile_col_size; tile_r++)
{
__shared__ int subA[TILE_ROW_SIZE][TILE_COL_SIZE];
__shared__ int subB[TILE_ROW_SIZE][TILE_COL_SIZE];
// make A tile stride along the row so tile_r appears in column indexing
subA[cell_i][cell_j] = A[IDX(cols1, tile_i*TILE_ROW_SIZE+cell_i, tile_r*TILE_COL_SIZE+cell_j)];
// make B tile stride along the column so tile_r appears in row indexing
subB[cell_i][cell_j] = B[IDX(cols2, tile_r*TILE_ROW_SIZE+cell_i, tile_j*TILE_COL_SIZE+cell_j)];
__syncthreads();
for (int cell_r = 0; cell_r < TILE_ROW_SIZE; cell_r++)
{
int c_i = tile_i * TILE_ROW_SIZE + cell_i;
int c_j = tile_j * TILE_COL_SIZE + cell_j;
if (c_i < rows1 && c_j < cols2)
{
C[IDX(COLS3, c_i, c_j)] += subA[cell_i][cell_r] * subB[cell_r][cell_j];
}
}
__syncthreads();
}
}
void printMatrix(int *mat, int rr, int cc) {
for (int i = 0; i < rr; i++) {
for (int j = 0; j < cc; j++) {
cout << setw(6) << mat[i * cc + j] << " ";
}
cout << endl;
}
cout << endl;
}
void allocateMatrix(int *&a, int rows, int cols) {
a = new int[rows * cols];
}
void freeMatrix(int *a) {
delete[] a;
}
void initMatrix(int *mat, int rr, int cc) {
int init = 1;
for (int i = 0; i < rr; i++) {
for (int j = 0; j < cc; j++) {
mat[i * cc + j] = init++;
}
}
}
void initMatrixZero(int *mat, int rr, int cc) {
for (int i = 0; i < rr; i++) {
for (int j = 0; j < cc; j++) {
mat[i * cc + j] = 0;
}
}
}
int main() {
int *A, *B, *C;
int *d_A, *d_B, *d_C;
allocateMatrix(A, ROWS1, COLS1);
initMatrix(A, ROWS1, COLS1);
allocateMatrix(B, ROWS2, COLS2);
initMatrix(B, ROWS2, COLS2);
allocateMatrix(C, ROWS3, COLS3);
initMatrixZero(C, ROWS3, COLS3);
// Allocate device memory
cudaMalloc((void **)&d_A, ROWS1 * COLS1 * sizeof(int));
cudaMalloc((void **)&d_B, ROWS2 * COLS2 * sizeof(int));
cudaMalloc((void **)&d_C, ROWS3 * COLS3 * sizeof(int));
// Copy input matrices from host to device
cudaMemcpy(d_A, A, ROWS1 * COLS1 * sizeof(int), cudaMemcpyHostToDevice);
cudaMemcpy(d_B, B, ROWS2 * COLS2 * sizeof(int), cudaMemcpyHostToDevice);
cudaMemcpy(d_C, C, ROWS3 * COLS3 * sizeof(int), cudaMemcpyHostToDevice);
// Set grid and block dimensions
dim3 gridSize(COLS3 / TILE_COL_SIZE, ROWS3 / TILE_ROW_SIZE);
dim3 blockSize(TILE_COL_SIZE, TILE_ROW_SIZE);
// Launch the kernel
MultiplyAsSumOuterProductOfVectors<<<gridSize, blockSize>>>(d_A, d_B, d_C, TILE_ROW_SIZE, TILE_COL_SIZE, COLS1, ROWS1, COLS2);
// Copy result matrix from device to host
cudaMemcpy(C, d_C, ROWS3 * COLS3 * sizeof(int), cudaMemcpyDeviceToHost);
// Print the result matrix
cout << "Result Matrix:" << endl;
printMatrix(C, ROWS3, COLS3);
// Free device memory
cudaFree(d_A);
cudaFree(d_B);
cudaFree(d_C);
// Free host memory
freeMatrix(A);
freeMatrix(B);
freeMatrix(C);
return 0;
}
# nvcc -o t5 t5.cu
# compute-sanitizer ./t5
========= COMPUTE-SANITIZER
Result Matrix:
90 100 110 120
202 228 254 280
314 356 398 440
426 484 542 600
========= ERROR SUMMARY: 0 errors
#
There are numerous limitations to this approach, such as generally needing square tiles, square matrices, and matrix dimensions that are whole-number-divisible by the tile dimension.
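If you needed to relax the divisibility requirement, the usual pattern (a rough sketch, not tested here) is to round the grid up and guard the global loads and stores, along these lines:
// round the grid up so partial tiles are covered
dim3 gridSize((COLS3 + TILE_COL_SIZE - 1) / TILE_COL_SIZE,
              (ROWS3 + TILE_ROW_SIZE - 1) / TILE_ROW_SIZE);
// ...and in the kernel, guard each shared-memory load against the matrix bounds,
// storing 0 into the tile for out-of-range elements, in addition to the existing
// c_i < rows1 && c_j < cols2 check before writing C.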