My lab instructor supplied us with the following source code and told us to convert it into a CUDA program.
We have to implement matrix multiplication on the GPU and run some tests measuring elapsed time.
void Multiply(float* A, float* B, float* C, int N)
{
    // B is already transposed
    int loc_a, loc_b, loc_c;
    printf("In Multiply\n");
    for (int i = 0; i < N; i++) {
        for (int j = 0; j < N; j++) {
            loc_c = i * N + j;        // element (i, j) of C
            loc_a = i * N;            // start of row i of A
            loc_b = j * N;            // start of row j of B (column j of the original matrix)
            C[loc_c] = 0.0f;
            for (int k = 0; k < N; k++) {
                C[loc_c] += A[loc_a] * B[loc_b];
                loc_a++;
                loc_b++;
            }
        }
    }
}
I am a little confused about this source code.
Why does B have to be already transposed to multiply two matrices?
Full Source Code
#ifndef CUDA_LAB_MATRIX_H
#define CUDA_LAB_MATRIX_H
#include <stdio.h>
#include <stdlib.h>
#include <chrono>
#include <iostream>
#include <cmath>
#include <cstdarg> // for va_list, va_start, va_arg, va_end
void CreateMatrix(float* &A, int count, ...) {
    A = (float*)malloc(count * sizeof(float));
    va_list args;
    va_start(args, count);            // start the variable argument list
    for (int i = 0; i < count; i++) {
        // Each value is read as an int and converted to float, so callers
        // must pass integer literals (float varargs are promoted to double).
        A[i] = va_arg(args, int);
    }
    va_end(args);                     // end the variable argument list
}
void Transpose(float* A, float* At, int N) {
    for (int i = 0; i < N; i++) {
        int loc = i * N;              // walk along row i of A
        int loc_t;
        for (int j = 0; j < N; j++) {
            loc_t = j * N + i;        // write down column i of At
            At[loc_t] = A[loc];
            loc++;
        }
    }
}
void Multiply(float* A, float* B, float* C, int N)
{
    // B is transposed; C must be allocated by the caller (N * N floats)
    int loc_a, loc_b, loc_c;
    printf("In Multiply\n");
    for (int i = 0; i < N; i++) {
        for (int j = 0; j < N; j++) {
            loc_c = i * N + j;
            loc_a = i * N;            // start of row i of A
            loc_b = j * N;            // start of row j of B (column j of the original matrix)
            C[loc_c] = 0.0f;
            for (int k = 0; k < N; k++) {
                float temp = A[loc_a] * B[loc_b];
                C[loc_c] += temp;
                loc_a++;
                loc_b++;
            }
        }
    }
}
void PrintMat(float* A, int row, int ext_row, int col, int ext_col, int N)
{
    int cur_row = row;
    int loc;
    for (int i = 0; i < ext_row; i++)
    {
        loc = cur_row * N + col;
        for (int j = 0; j < ext_col; j++)
        {
            printf("%f ", A[loc + j]);
        }
        printf("\n");
        cur_row++;
    }
}
void CompareMatrices(float* A, float* B, int N)
{
    int count = 0;
    float Sum = 0.0f;
    int loc = 0;
    for (int i = 0; i < N; i++)
    {
        for (int j = 0; j < N; j++)
        {
            if (A[loc] != B[loc])
            {
                Sum += fabs(A[loc] - B[loc]);
                count++;
            }
            loc++;
        }
    }
    printf("Difference: %f\n", Sum);
    printf("Count: %d\n", count);
}
#endif //CUDA_LAB_MATRIX_H
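For context, a minimal driver that exercises these functions might look like this (assuming the header is saved as matrix.h; the 2 x 2 values are just placeholders). Note that Transpose runs before Multiply:

#include "matrix.h"

int main()
{
    const int N = 2;
    float *A, *B;
    CreateMatrix(A, N * N, 1, 2, 3, 4);   // values are read as ints
    CreateMatrix(B, N * N, 5, 6, 7, 8);

    float* Bt = (float*)malloc(N * N * sizeof(float));
    float* C  = (float*)malloc(N * N * sizeof(float));
    Transpose(B, Bt, N);                  // transpose B up front...
    Multiply(A, Bt, C, N);                // ...so Multiply can scan rows of Bt
    PrintMat(C, 0, N, 0, N, N);           // print the whole 2 x 2 result

    free(A); free(B); free(Bt); free(C);
    return 0;
}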
It's a classic demonstration of the power of cache locality: results can usually be computed much faster by calculating M x N^T (with the transpose done up front) instead of M x N. When calculating M x N directly, the elements of N read by the inner loop are a full row apart in memory, because the loop walks down a column; in N^T those same elements are contiguous, so many more of them fit into each cache line.
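To make the access patterns concrete, here is a minimal sketch of both variants (the function names are mine, not from the assignment); the only difference is how B is indexed:

// Naive version: B is NOT transposed; the inner loop reads B[k*N + j],
// so consecutive values of k touch addresses N floats apart (stride N).
void MultiplyNaive(float* A, float* B, float* C, int N)
{
    for (int i = 0; i < N; i++)
        for (int j = 0; j < N; j++) {
            float sum = 0.0f;
            for (int k = 0; k < N; k++)
                sum += A[i * N + k] * B[k * N + j];   // stride-N reads of B
            C[i * N + j] = sum;
        }
}

// Transposed version (what the instructor's code does): Bt holds B^T,
// so the inner loop reads Bt[j*N + k], which is contiguous and cache-friendly.
void MultiplyTransposed(float* A, float* Bt, float* C, int N)
{
    for (int i = 0; i < N; i++)
        for (int j = 0; j < N; j++) {
            float sum = 0.0f;
            for (int k = 0; k < N; k++)
                sum += A[i * N + k] * Bt[j * N + k];  // stride-1 reads of Bt
            C[i * N + j] = sum;
        }
}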
It's an interesting question, though, whether this logic holds up on a GPU running thousands of concurrent threads. GPUs tend to have much less cache memory to start with, and when that cache is shared across thousands of threads the chances of a cache miss are even greater. Perhaps this is the question you have been tasked to investigate.
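If it helps as a starting point, below is a minimal, untested sketch of what a CUDA port of Multiply might look like. The kernel name, the 16 x 16 block size, the decision to keep B pre-transposed, and the event-based timing are all my assumptions, not the assignment's required solution:

#include <cstdio>
#include <cuda_runtime.h>

// Sketch of a CUDA port: each thread computes one element of C.
// Bt is assumed to be pre-transposed, as in the CPU code.
__global__ void MultiplyKernel(const float* A, const float* Bt, float* C, int N)
{
    int i = blockIdx.y * blockDim.y + threadIdx.y;   // row of C
    int j = blockIdx.x * blockDim.x + threadIdx.x;   // column of C
    if (i < N && j < N) {
        float sum = 0.0f;
        for (int k = 0; k < N; k++)
            sum += A[i * N + k] * Bt[j * N + k];     // same indexing as the CPU version
        C[i * N + j] = sum;
    }
}

// Hypothetical host-side wrapper that launches the kernel and times it with CUDA events.
void MultiplyOnGpu(const float* hA, const float* hBt, float* hC, int N)
{
    size_t bytes = (size_t)N * N * sizeof(float);
    float *dA, *dBt, *dC;
    cudaMalloc(&dA, bytes);
    cudaMalloc(&dBt, bytes);
    cudaMalloc(&dC, bytes);
    cudaMemcpy(dA, hA, bytes, cudaMemcpyHostToDevice);
    cudaMemcpy(dBt, hBt, bytes, cudaMemcpyHostToDevice);

    dim3 block(16, 16);                              // arbitrary choice; worth tuning
    dim3 grid((N + block.x - 1) / block.x, (N + block.y - 1) / block.y);

    cudaEvent_t start, stop;
    cudaEventCreate(&start);
    cudaEventCreate(&stop);
    cudaEventRecord(start);
    MultiplyKernel<<<grid, block>>>(dA, dBt, dC, N);
    cudaEventRecord(stop);
    cudaEventSynchronize(stop);
    float ms = 0.0f;
    cudaEventElapsedTime(&ms, start, stop);
    printf("Kernel time: %f ms\n", ms);

    cudaMemcpy(hC, dC, bytes, cudaMemcpyDeviceToHost);
    cudaFree(dA); cudaFree(dBt); cudaFree(dC);
    cudaEventDestroy(start); cudaEventDestroy(stop);
}

One thing worth noting for your experiment: with this thread mapping, adjacent threads in a warp (consecutive j) read Bt at addresses N floats apart, whereas with an untransposed B[k * N + j] they would read consecutive, coalesced addresses. So the transpose trick that wins on the CPU may well lose on the GPU, which makes timing both layouts a natural test.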