Tags: c++, matrix-multiplication

Why does B have to be transposed?


My lab instructor supplied us with the following source code and told us to convert it into a CUDA program.

We have to implement matrix multiplication on the GPU and run some tests measuring elapsed time.

void Multiply(float* A, float* B, float* C, int N)
{
    // B is already transposed
    int loc_a, loc_b, loc_c;
    printf("In Multiply\n");
    for (int i = 0; i < N; i++) {
        for (int j = 0; j < N; j++) {
            loc_c = i*N + j;
            loc_a = i*N;
            loc_b = j*N;
            C[loc_c] = 0.0f;
            for (int k = 0; k < N; k++) {
                C[loc_c] += A[loc_a]*B[loc_b];
                loc_a++;
                loc_b++;
            }
        }
    }
}

I am a little confused about this source code.

Why does B have to be already transposed to multiply two matrices?

Full Source Code

#ifndef CUDA_LAB_MATRIX_H
#define CUDA_LAB_MATRIX_H

#include <stdio.h>
#include <stdlib.h>
#include <chrono>
#include <iostream>
#include <cmath>

#include <cstdarg> // for va_list, va_start, va_arg, va_end

void CreateMatrix(float* &A, int count, ...) {
    A = (float*)malloc(count * sizeof(float));
    va_list args;
    va_start(args, count); // start the variable argument list

    for (int i = 0; i < count; i++) {
        // Note: values are read as ints and converted to float,
        // so the caller is expected to pass integer arguments here.
        A[i] = va_arg(args, int); // get the next argument from the list
    }

    va_end(args); // end the variable argument list
}

// At receives the transpose of A (both N x N, row-major)
void Transpose(float* A, float* At, int N) {
    for (int i = 0; i < N; i++) {
        int loc = i*N;
        int loc_t;
        for (int j = 0; j < N; j++) {
            loc_t = j*N + i;
            At[loc_t] = A[loc];
            loc++;
        }
    }
}

void Multiply(float* A, float* B, float* C, int N)
{
    // B is transposed; C must be allocated by the caller (N*N floats)
    int loc_a, loc_b, loc_c;
    printf("In Multiply\n");
    for (int i = 0; i < N; i++) {
        for (int j = 0; j < N; j++) {
            loc_c = i*N + j;
            loc_a = i*N;
            loc_b = j*N;
            C[loc_c] = 0.0f;
            for (int k = 0; k < N; k++) {
                float temp = A[loc_a]*B[loc_b];
                C[loc_c] += temp;
                loc_a++;
                loc_b++;
            }
        }
    }
}

// Print an ext_row x ext_col sub-block of A starting at (row, col); A is N columns wide
void PrintMat(float* A, int row, int ext_row, int col, int ext_col, int N)
{
    int cur_row = row;
    int loc;
    for (int i = 0; i < ext_row; i++)
    {
        loc = cur_row*N + col;
        for (int j = 0; j < ext_col; j++)
        {
            printf("%f  ", A[loc+j]);
        }
        printf("\n");
        cur_row++;
    }
}

// Report the number of differing elements and the sum of their absolute differences
void CompareMatrices(float* A, float* B, int N)
{
    int count = 0;
    float Sum = 0.0f;
    int loc = 0;

    for (int i = 0; i < N; i++)
    {
        for (int j = 0; j < N; j++)
        {
            if (A[loc] != B[loc])
            {
                Sum += fabs(A[loc] - B[loc]);
                count++;
            }
            loc++;
        }
    }
    printf("Difference: %f\n", Sum);
    printf("Count: %d\n", count);
}

#endif //CUDA_LAB_MATRIX_H

Solution

  • It's a classic demonstration of the power of cache locality. In the naive product M x N, the elements of N read by the inner loop are a full row apart in memory (a stride of N floats), so successive accesses tend to land on different cache lines. If N is stored transposed, those same elements are contiguous, so far more of them fit into cache and the product can usually be computed much faster; see the two sketches after this answer.

    It's an interesting question, though, whether this logic holds up on a GPU running thousands of concurrent threads. GPUs tend to have much less cache memory to begin with, and when it is shared across thousands of threads the chance of cache misses is even greater. Perhaps this is exactly the question you have been tasked to investigate.
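
For contrast, here is a minimal sketch of the same multiplication with B left untransposed (the name MultiplyNaive is illustrative, not part of the lab code). The only change is in the inner loop: reads of B now jump a whole row, N floats, per iteration of k, which is exactly the strided access pattern the transposed version avoids.

// Hypothetical reference version with B in its original, untransposed layout.
// The inner loop reads B[k*N + j] with a stride of N floats, so successive
// accesses usually fall on different cache lines.
void MultiplyNaive(float* A, float* B, float* C, int N)
{
    for (int i = 0; i < N; i++) {
        for (int j = 0; j < N; j++) {
            float sum = 0.0f;
            for (int k = 0; k < N; k++) {
                sum += A[i*N + k] * B[k*N + j]; // stride-N access into B
            }
            C[i*N + j] = sum;
        }
    }
}

In the lab's Multiply, by contrast, both loc_a and loc_b advance by one float per iteration of k, so the inner loop streams through two contiguous rows and gets much more use out of every cache line it loads.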
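
On the GPU side, a straightforward conversion might look like the sketch below (the kernel name, block size, and the d_A/d_B/d_C device pointers are assumptions, not part of the assignment). Each thread computes one element of C, and neighbouring threads in a warp differ only in col, so their reads of B[k*N + col] are adjacent and can be coalesced by the hardware even without transposing B; whether a transposed B still helps on the GPU is exactly the kind of thing your timing tests can answer.

// Hypothetical CUDA kernel: one thread per element of C, B untransposed.
__global__ void MultiplyKernel(const float* A, const float* B, float* C, int N)
{
    int row = blockIdx.y * blockDim.y + threadIdx.y;
    int col = blockIdx.x * blockDim.x + threadIdx.x;
    if (row < N && col < N) {
        float sum = 0.0f;
        for (int k = 0; k < N; k++) {
            // col varies by 1 between neighbouring threads in a warp, so the
            // loads of B[k*N + col] are coalesced across the warp.
            sum += A[row*N + k] * B[k*N + col];
        }
        C[row*N + col] = sum;
    }
}

// Possible launch, assuming 16x16 thread blocks:
//   dim3 block(16, 16);
//   dim3 grid((N + block.x - 1) / block.x, (N + block.y - 1) / block.y);
//   MultiplyKernel<<<grid, block>>>(d_A, d_B, d_C, N);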