
CUDA MAGMA matrix-matrix addition kernel


I tried using a similar format to the magmablas_sgeadd_q kernel; however, I am not getting the correct output, and every time I run it I get a different result. The code that I used is given below:

#include <stdio.h>
#include <stdlib.h>
#include <math.h>
#include <cuda_runtime.h>

#define BLK_X 2
#define BLK_Y 1

__global__ void matrixAdd2( const float *dA, const float *dB, float *dC, int m, int n)
{
int ldda = m;
int lddb = m;

int ind = blockIdx.x*BLK_X + threadIdx.x;
int iby = blockIdx.y*BLK_Y;
/* check if full block-column */
bool full = (iby + BLK_Y <= n);
/* do only rows inside matrix */
if ( ind < m ) {
    dA += ind + iby*ldda;
    dB += ind + iby*lddb;
    if ( full ) 
    {
        // full block-column
        #pragma unroll
        for( int j=0; j < BLK_Y; ++j )
        {
            dC[j*lddb] = dA[j*ldda] + dB[j*lddb];
            printf("A is %f, B is %f, C is %f  \n",dA[j*ldda],dB[j*lddb],dC[j*lddb]);
        }
    }
    else 
    {
        // partial block-column
        for( int j=0; j < BLK_Y && iby+j < n; ++j )
        {
            dC[j*lddb] = dA[j*ldda] + dB[j*lddb];
            printf("parital: A is %f, B is %f, C is %f  \n",dA[j*ldda],dB[j*lddb],dC[j*lddb]);
        }
    }
}
}



int main ( void )
{

int m = 4; // a - mxn matrix
int n = 2; // b - mxn matrix

size_t size =  m * n * sizeof(float);


printf("Matrix addition of %d rows and %d columns \n", m, n);

// allocate matrices on the host

float *h_A = (float *)malloc(size); // a- mxn matrix on the host
float *h_B = (float *)malloc(size); // b- mxn matrix on the host
float *h_C = (float *)malloc(size); // b- mxn matrix on the host


// Initialize the host input matrixs
for (int i = 0; i < m; ++i)
{
    for (int j = 0; j < n ; j ++)
    {
        h_A[i*m+j] = rand()/(float)RAND_MAX;
        h_B[i*m+j] = rand()/(float)RAND_MAX;

    }
}

// Allocate the device input matrix A   
float *d_A = NULL;
err = cudaMalloc((void **)&d_A, size);; // d_a - mxn matrix a on the device

// Allocate the device input matrix B
float *d_B = NULL;
err = cudaMalloc((void **)&d_B, size);

// Allocate the device output matrix C
float *d_C = NULL;
err = cudaMalloc((void **)&d_C, size);

// Copy the host input matrixs A and B in host memory to the device input    matrixs in device memory
printf("Copy input data from the host memory to the CUDA device\n");
err = cudaMemcpy(d_A, h_A, size, cudaMemcpyHostToDevice);

err = cudaMemcpy(d_B, h_B, size, cudaMemcpyHostToDevice);

// defining number of threads and blocks    
dim3 threads( BLK_X, 1 );
dim3 grid((int)ceil(m/BLK_X),(int)ceil(n/BLK_Y) );


// Launching kernel     
matrixAdd2<<<grid, threads, 0>>>(d_A, d_B, d_C, m, n);

// Copy the device result matrix in device memory to the host result matrix in host memory.
printf("Copy output data from the CUDA device to the host memory\n");
err = cudaMemcpy(h_C, d_C, size, cudaMemcpyDeviceToHost);

//print A matrix 
printf("Matrix A");
for (int i = 0; i < m; i++)
{
    for (int j = 0; j < n; j++)
   {
        printf(" %f", h_A[i*m+j]);

    }
    printf("\n");
}

// print B matrix if required
printf("Matrix B");
for (int i = 0; i < m; i++)
{
    for (int j = 0; j < n; j++)
    {

        printf(" %f", h_B[i*m+j]);

    }
    printf("\n");
}

//Error checkng
printf("Matrix C ");
for (int i = 0; i < m; i++)
{
    for (int j = 0; j < n; j++)
   {    
        printf("%f", h_C[i*m+j]);
        if(h_C[i*m+j] == h_A[i*m+j] + h_B[i*m+j] )
        { 
            flag = flag + 1;
        }
    }
    printf("\n");
}

if(flag==m*n)
{
printf("Test PASSED\n");
}


// Free device global memory
err = cudaFree(d_A);

err = cudaFree(d_B);

err = cudaFree(d_C);

// Free host memory
free(h_A);
free(h_B);
free(h_C);


err = cudaDeviceReset();
printf("Done\n");
return 0;

}

Output I got:

Matrix addition of 4 rows and 2 columns
Copy input data from the host memory to the CUDA device
CUDA kernel launch with 4 blocks of 2 threads
Copy output data from the CUDA device to the host memory
A is 0.000000, B is 0.364784, C is 0.364784
A is 0.000000, B is 0.952230, C is 0.952230
A is 0.000000, B is 0.000000, C is 0.000000
A is 0.000000, B is 0.000000, C is 0.000000
A is 0.840188, B is 0.394383, C is 1.234571
A is 0.783099, B is 0.798440, C is 1.581539
A is 0.911647, B is 0.197551, C is 1.109199
A is 0.335223, B is 0.768230, C is 1.103452

Matrix A 0.840188 0.783099
 0.911647 0.335223
 0.277775 0.477397
 0.364784 0.952230

Matrix B 0.394383 0.798440
 0.197551 0.768230
 0.553970 0.628871
 0.000000 0.000000

Matrix C 0.0000000.000000
0.0000000.000000
0.0000000.000000
0.0000000.000000

Let me know if you find something wrong with the code.

Thank you


Solution

  • There were two coding errors that I found:

    1. When you use this method to "bump" the base pointers for matrices dA and dB in your kernel, you must also do the same for the base pointer for matrix dC; otherwise every thread writes its sum to an unshifted location at the very start of dC, the rest of dC is never written, and what you copy back is largely whatever happened to be in that device allocation (which is why the result changes from run to run):

      if ( ind < m ) {
          dA += ind + iby*ldda;
          dB += ind + iby*lddb;
          dC += ind + iby*lddb;  // add this line
      
    2. Your host-code nested for loops are not indexing correctly. In the expression h_A[i*m+j] it is the outer index i that gets multiplied by the leading dimension m, so the outer loop must only cover the n columns of the matrix, but you are letting it run over m values (and the inner loop over only n):

      for (int i = 0; i < m; ++i)
      {
          for (int j = 0; j < n ; j ++)
      

      so then when you do the actual indexing calculation here:

          h_A[i*m+j] = rand()/(float)RAND_MAX;
      

      you are indexing out-of-bounds (for i >= n, the offset i*m+j runs past the end of the m*n-element allocation). This problem is repeated in all of your nested for-loops in host code. The fix is to reverse the m, n bounds on your i, j loops, as sketched below.
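      As a minimal sketch, assuming the same column-major layout the kernel uses (element (row, col) of an m x n matrix with leading dimension m sits at offset row + col*m), the corrected initialization loop looks like this (loop-variable names here are illustrative, not from the original code):

          /* column-major, leading dimension m: element (row, col) lives at row + col*m */
          for (int col = 0; col < n; ++col)          /* outer loop over the n columns  */
          {
              for (int row = 0; row < m; ++row)      /* inner loop over the m rows     */
              {
                  h_A[col*m + row] = rand()/(float)RAND_MAX;  /* max offset is m*n-1   */
                  h_B[col*m + row] = rand()/(float)RAND_MAX;
              }
          }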

    The following code has those errors fixed, plus definitions for the variables you left out (err and flag are undefined in the code you have posted, which causes compile errors). It seems to run correctly and produce the correct result:

    $ cat t1213.cu
    #include <stdio.h>
    #include <stdlib.h>
    #include <math.h>
    #include <cuda_runtime.h>
    
    #define BLK_X 2
    #define BLK_Y 1
    
    __global__ void matrixAdd2( const float *dA, const float *dB, float *dC, int m, int n)
    {
    int ldda = m;
    int lddb = m;
    
    int ind = blockIdx.x*BLK_X + threadIdx.x;
    int iby = blockIdx.y*BLK_Y;
    /* check if full block-column */
    bool full = (iby + BLK_Y <= n);
    /* do only rows inside matrix */
    if ( ind < m ) {
        dA += ind + iby*ldda;
        dB += ind + iby*lddb;
        dC += ind + iby*lddb;
        if ( full )
        {
            // full block-column
            #pragma unroll
            for( int j=0; j < BLK_Y; ++j )
            {
                dC[j*lddb] = dA[j*ldda] + dB[j*lddb];
                printf("A is %f, B is %f, C is %f  \n",dA[j*ldda],dB[j*lddb],dC[j*lddb]);
            }
        }
        else
        {
            // partial block-column
            for( int j=0; j < BLK_Y && iby+j < n; ++j )
            {
                dC[j*lddb] = dA[j*ldda] + dB[j*lddb];
                printf("parital: A is %f, B is %f, C is %f  \n",dA[j*ldda],dB[j*lddb],dC[j*lddb]);
            }
        }
    }
    }
    
    
    
    int main ( void )
    {
    
    int m = 4; // a - mxn matrix
    int n = 2; // b - mxn matrix
    
    size_t size =  m * n * sizeof(float);
    
    
    printf("Matrix addition of %d rows and %d columns \n", m, n);
    
    // allocate matrices on the host
    
    float *h_A = (float *)malloc(size); // a- mxn matrix on the host
    float *h_B = (float *)malloc(size); // b- mxn matrix on the host
    float *h_C = (float *)malloc(size); // b- mxn matrix on the host
    
    
    // Initialize the host input matrixs
    for (int i = 0; i < n; ++i)
    {
        for (int j = 0; j < m ; j ++)
        {
            h_A[i*m+j] = rand()/(float)RAND_MAX;
            h_B[i*m+j] = rand()/(float)RAND_MAX;
    
        }
    }
    
    // Allocate the device input matrix A
    float *d_A = NULL;
    cudaError_t err = cudaMalloc((void **)&d_A, size);; // d_a - mxn matrix a on the device
    
    // Allocate the device input matrix B
    float *d_B = NULL;
    err = cudaMalloc((void **)&d_B, size);
    
    // Allocate the device output matrix C
    float *d_C = NULL;
    err = cudaMalloc((void **)&d_C, size);
    
    // Copy the host input matrixs A and B in host memory to the device input    matrixs in device memory
    printf("Copy input data from the host memory to the CUDA device\n");
    err = cudaMemcpy(d_A, h_A, size, cudaMemcpyHostToDevice);
    
    err = cudaMemcpy(d_B, h_B, size, cudaMemcpyHostToDevice);
    
    // defining number of threads and blocks
    dim3 threads( BLK_X, BLK_Y );
    dim3 grid((int)ceil(m/BLK_X),(int)ceil(n/BLK_Y) );
    
    
    // Launching kernel
    matrixAdd2<<<grid, threads, 0>>>(d_A, d_B, d_C, m, n);
    
    // Copy the device result matrix in device memory to the host result matrix in host memory.
    printf("Copy output data from the CUDA device to the host memory\n");
    err = cudaMemcpy(h_C, d_C, size, cudaMemcpyDeviceToHost);
    
    //print A matrix
    printf("Matrix A");
    for (int i = 0; i < n; i++)
    {
        for (int j = 0; j < m; j++)
       {
            printf(" %f", h_A[i*m+j]);
    
        }
        printf("\n");
    }
    
    // print B matrix if required
    printf("Matrix B");
    for (int i = 0; i < n; i++)
    {
        for (int j = 0; j < m; j++)
        {
    
            printf(" %f", h_B[i*m+j]);
    
        }
        printf("\n");
    }
    int flag = 0;
    //Error checkng
    printf("Matrix C ");
    for (int i = 0; i < n; i++)
    {
        for (int j = 0; j < m; j++)
       {
            printf("%f", h_C[i*m+j]);
            if(h_C[i*m+j] == h_A[i*m+j] + h_B[i*m+j] )
            {
                flag = flag + 1;
            }
        }
        printf("\n");
    }
    
    if(flag==m*n)
    {
    printf("Test PASSED\n");
    }
    
    
    // Free device global memory
    err = cudaFree(d_A);
    
    err = cudaFree(d_B);
    
    err = cudaFree(d_C);
    
    // Free host memory
    free(h_A);
    free(h_B);
    free(h_C);
    
    
    err = cudaDeviceReset();
    printf("Done\n");
    return 0;
    
    }
    $ nvcc -o t1213 t1213.cu
    $ cuda-memcheck ./t1213
    ========= CUDA-MEMCHECK
    Matrix addition of 4 rows and 2 columns
    Copy input data from the host memory to the CUDA device
    Copy output data from the CUDA device to the host memory
    A is 0.277775, B is 0.553970, C is 0.831745
    A is 0.477397, B is 0.628871, C is 1.106268
    A is 0.364784, B is 0.513401, C is 0.878185
    A is 0.952230, B is 0.916195, C is 1.868425
    A is 0.911647, B is 0.197551, C is 1.109199
    A is 0.335223, B is 0.768230, C is 1.103452
    A is 0.840188, B is 0.394383, C is 1.234571
    A is 0.783099, B is 0.798440, C is 1.581539
    Matrix A 0.840188 0.783099 0.911647 0.335223
     0.277775 0.477397 0.364784 0.952230
    Matrix B 0.394383 0.798440 0.197551 0.768230
     0.553970 0.628871 0.513401 0.916195
    Matrix C 1.2345711.5815391.1091991.103452
    0.8317451.1062680.8781851.868425
    Test PASSED
    Done
    ========= ERROR SUMMARY: 0 errors
    $
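    One further refinement worth sketching: since err is now a proper cudaError_t, you can also check the return value of each CUDA call and of the kernel launch, rather than relying only on cuda-memcheck. A minimal sketch, assuming a helper macro of your own naming (CHECK_CUDA here is illustrative, not a CUDA API):

        #define CHECK_CUDA(call)                                            \
            do {                                                            \
                cudaError_t e = (call);                                     \
                if (e != cudaSuccess) {                                     \
                    fprintf(stderr, "CUDA error: %s at %s:%d\n",            \
                            cudaGetErrorString(e), __FILE__, __LINE__);     \
                    exit(EXIT_FAILURE);                                     \
                }                                                           \
            } while (0)

        /* usage, e.g.:
           CHECK_CUDA(cudaMalloc((void **)&d_A, size));
           matrixAdd2<<<grid, threads, 0>>>(d_A, d_B, d_C, m, n);
           CHECK_CUDA(cudaGetLastError());       // reports kernel launch errors
           CHECK_CUDA(cudaDeviceSynchronize());  // reports errors during kernel execution
        */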