I'm currently experimenting with CUDA and I came across this kernel for matrix multiplication from an answer: https://stackoverflow.com/a/18856054/7867026
Instead of computing A*B, I want to compute A_Transpose*A, but without ever storing A_Transpose (only matrix A is passed as an input to the kernel). I know I have to set the indexes properly, but I'm confused by this matrix representation. Any help would be appreciated.
Most of what you need is here and here.
In the first link it is shown that AxAT involves taking inner products of rows of matrix A, and similarly ATxA involves taking inner products of columns of matrix A. Also note the symmetry statement there. In the second link (scroll down a bit from that point in the programming guide) you will find a complete tiled matrix multiply. You just need to index into both tiles by column.
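To make the indexing concrete: element (i,j) of ATxA is the inner product of column i and column j of A. For an A of size rows x cols stored in row-major order, a host-side reference calculation (my own sketch, names are illustrative only) would be:

    // C = AT * A:  C[i][j] = sum over k of A[k][i] * A[k][j]
    // A is rows x cols (row-major), C is cols x cols
    for (int i = 0; i < cols; i++)
      for (int j = 0; j < cols; j++){
        float sum = 0;
        for (int k = 0; k < rows; k++)
          sum += A[k*cols+i] * A[k*cols+j];  // both factors walk down a column of A
        C[i*cols+j] = sum;}

Note that no transposed copy of A is ever formed; both operands are read column-wise from the same storage. The tiled kernel below does the same thing, one tile at a time.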
Here is a worked example, using the code from the SO answer you linked:
$ cat t1654.cu
#include <iostream>
#include <cstdio>
#include <cstdlib>
const int TILE_DIM = 32;
template <typename T>
__global__ void ATA(const T * __restrict__ A, T * __restrict__ C, int ARows, int ACols)
{
T CValue = 0;
int Row = blockIdx.y*TILE_DIM + threadIdx.y;
int Col = blockIdx.x*TILE_DIM + threadIdx.x;
__shared__ T As[TILE_DIM][TILE_DIM];
__shared__ T Bs[TILE_DIM][TILE_DIM];
for (int k = 0; k < (TILE_DIM + ARows - 1)/TILE_DIM; k++) {
if (k*TILE_DIM + threadIdx.y < ARows && blockIdx.y*blockDim.y+threadIdx.x < ACols)
As[threadIdx.y][threadIdx.x] = A[(k*TILE_DIM + threadIdx.y)*ACols + blockIdx.y*blockDim.y+threadIdx.x];
else
As[threadIdx.y][threadIdx.x] = 0.0;
if (k*TILE_DIM + threadIdx.y < ARows && Col < ACols)
Bs[threadIdx.y][threadIdx.x] = A[(k*TILE_DIM + threadIdx.y)*ACols + Col];
else
Bs[threadIdx.y][threadIdx.x] = 0.0;
__syncthreads();
for (int n = 0; n < TILE_DIM; ++n)
CValue += As[n][threadIdx.y] * Bs[n][threadIdx.x];
__syncthreads();
}
if (Row < ACols && Col < ACols)
C[((blockIdx.y * blockDim.y + threadIdx.y)*ACols) +
(blockIdx.x * blockDim.x)+ threadIdx.x] = CValue;
}
// naive transpose of a square dim x dim matrix, used only for verification
template <typename T>
__global__ void transpose_naive(const T * __restrict__ in, T * __restrict__ out, const int dim){
  int col = threadIdx.x+blockDim.x*blockIdx.x;
  int row = threadIdx.y+blockDim.y*blockIdx.y;
  if ((col < dim) && (row < dim)) out[col*dim+row] = in[row*dim+col];
}

// naive matrix multiply, used only for verification
template <typename T>
__global__ void mm_naive(const T * __restrict__ A, const T * __restrict__ B, T * __restrict__ C, const int rowA, const int colA, const int colB){
  int col = threadIdx.x+blockDim.x*blockIdx.x;
  int row = threadIdx.y+blockDim.y*blockIdx.y;
  if ((row < rowA) && (col < colB)){
    T Cval = 0;
    for (int i = 0; i < colA; i++) Cval += A[row*colA+i]*B[i*colB+col];
    C[row*colB+col] = Cval;}
}
typedef float mt;
int main(){

  mt *d_A, *d_B, *d_C, *h_A, *h_C, *h_C1;
  int m = 64;
  int n = 64;
  h_A  = new mt[m*n];
  h_C  = new mt[n*n];
  h_C1 = new mt[n*n];
  cudaMalloc(&d_A, m*n*sizeof(d_A[0]));
  cudaMalloc(&d_B, m*n*sizeof(d_A[0]));
  cudaMalloc(&d_C, n*n*sizeof(d_C[0]));

  // test 1: A = identity, so ATxA should reproduce the identity
  for (int i = 0; i < m; i++)
    for (int j = 0; j < n; j++)
      h_A[i*n+j] = (i==j)?1.0f:0.0f;
  cudaMemcpy(d_A, h_A, m*n*sizeof(d_A[0]), cudaMemcpyHostToDevice);
  dim3 block(TILE_DIM, TILE_DIM);
  dim3 grid((n+block.x-1)/block.x, (n+block.y-1)/block.y);
  ATA<<<grid,block>>>(d_A, d_C, m, n);
  cudaMemcpy(h_C, d_C, n*n*sizeof(d_C[0]), cudaMemcpyDeviceToHost);
#ifdef DEBUG
  for (int i = 0; i < n; i++){
    for (int j = 0; j < n; j++)
      std::cout << h_C[i*n+j] << " ";
    std::cout << std::endl;}
  std::cout << std::endl;
#endif

  // test 2: random A, checked against an explicit transpose followed by a naive multiply
  for (int i = 0; i < m; i++)
    for (int j = 0; j < n; j++)
      h_A[i*n+j] = rand()%10;
  cudaMemcpy(d_A, h_A, m*n*sizeof(d_A[0]), cudaMemcpyHostToDevice);
  ATA<<<grid,block>>>(d_A, d_C, m, n);
  cudaMemcpy(h_C, d_C, n*n*sizeof(d_C[0]), cudaMemcpyDeviceToHost);
#ifdef DEBUG
  for (int i = 0; i < n; i++){
    for (int j = 0; j < n; j++)
      std::cout << h_C[i*n+j] << " ";
    std::cout << std::endl;}
  std::cout << std::endl;
#endif
  transpose_naive<<<grid,block>>>(d_A, d_B, n);
  mm_naive<<<grid,block>>>(d_B, d_A, d_C, n, n, n);
  cudaMemcpy(h_C1, d_C, n*n*sizeof(d_C[0]), cudaMemcpyDeviceToHost);
#ifdef DEBUG
  for (int i = 0; i < n; i++){
    for (int j = 0; j < n; j++)
      std::cout << h_C1[i*n+j] << " ";
    std::cout << std::endl;}
  std::cout << std::endl;
#endif
  for (int i = 0; i < n*n; i++)
    if (h_C[i] != h_C1[i]) {
      std::cout << "mismatch at: " << i << " was: " << h_C[i] << " should be: " << h_C1[i] << std::endl;
      return 0;}
}
$ nvcc -o t1654 t1654.cu
$ cuda-memcheck ./t1654
========= CUDA-MEMCHECK
========= ERROR SUMMARY: 0 errors
$
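(If you also want the #ifdef DEBUG blocks to print the intermediate matrices, compile with nvcc -DDEBUG -o t1654 t1654.cu.)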
Note that loading the Bs tile is identical in both cases. The main changes are in loading the As tile; also note the indexing change when computing CValue. These changes are necessary to index by column in both cases.
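For comparison, the corresponding accumulation line in the A*B kernel from your linked answer walks across a row of As:

    CValue += As[threadIdx.y][n] * Bs[n][threadIdx.x];  // A*B: row of As

whereas the ATxA version above walks down a column of As:

    CValue += As[n][threadIdx.y] * Bs[n][threadIdx.x];  // ATxA: column of As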
There may still be bugs. I have not tested the non-square case, nor the case where the matrix size is not a multiple of the block size. Furthermore, I've taken no advantage of the symmetry in the output. However, this should help with the indexing.
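If you did want to use that symmetry, one possible approach (an untested sketch of my own, not part of the tested code above) is to skip the blocks strictly below the block diagonal and mirror each result on the way out:

    // at the top of the ATA kernel body:
    if (blockIdx.x < blockIdx.y) return;  // the whole block exits, so no __syncthreads() hazard

    // ... and replace the final store with a mirrored pair:
    if (Row < ACols && Col < ACols){
      C[Row*ACols + Col] = CValue;   // (Row, Col), on or above the diagonal
      C[Col*ACols + Row] = CValue;}  // mirror into (Col, Row), since ATxA is symmetric

This roughly halves the arithmetic, at the cost of a second (transposed, uncoalesced) store per element.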