Tags: c#, cuda

ILGPU kernel giving incorrect output


I took the kernel code from the ILGPU samples for multiplying two matrices in tiled form and wrote a program to multiply the following matrices:

 a = |1  2  3  4|
     |5  6  7  8|

 b = | 9  10|
     |11  12|
     |13  14|
     |15  16|

According to WolframAlpha

{{1, 2, 3, 4}, {5, 6, 7, 8}} . {{9, 10}, {11, 12}, {13, 14}, {15, 16}}

gives the following output:

    130   140
    322   348
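
For reference, the same numbers come out of a plain triple-loop multiplication on the CPU (a minimal sketch without ILGPU; MultiplyOnCpu is just an illustrative helper name):

    // C[i, j] = sum over k of A[i, k] * B[k, j]
    static int[,] MultiplyOnCpu(int[,] a, int[,] b)
    {
        int m = a.GetLength(0), inner = a.GetLength(1), n = b.GetLength(1);
        var c = new int[m, n];
        for (int i = 0; i < m; i++)
            for (int j = 0; j < n; j++)
                for (int k = 0; k < inner; k++)
                    c[i, j] += a[i, k] * b[k, j];
        return c;
    }
    // MultiplyOnCpu(a, b) => { { 130, 140 }, { 322, 348 } }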

However, my source code gives the following output:

Result of matrix multiplication:
  31   34
 111  122

Where does my mistake lie given that I am using exactly the same kernel and tile-size?

Source Code

using ILGPU;
using ILGPU.Runtime;
using System;

public static class Program
{
    // Define tile size for matrix multiplication (2x2 tiles for simplicity)
    // This determines how many elements will be loaded into shared memory at a time.
    const int TILE_SIZE = 2;

    // Kernel function to perform tiled matrix multiplication on the GPU
    static void MatrixMultiplyTiledKernel(
            ArrayView2D<int, Stride2D.DenseX> aView, // Input matrix A in GPU memory
            ArrayView2D<int, Stride2D.DenseX> bView, // Input matrix B in GPU memory
            ArrayView2D<int, Stride2D.DenseX> cView) // Output matrix C in GPU memory
    {
        // Get the global thread index in the 2D grid
        Index2D global = Grid.GlobalIndex.XY;

        // Get the thread indices within the thread group (local indices)
        int x = Group.IdxX; // Local row index within the tile
        int y = Group.IdxY; // Local column index within the tile

        // Allocate shared memory for storing tiles of matrix A and matrix B
        var aTile = SharedMemory.Allocate2D<int, Stride2D.DenseX>(
            new Index2D(TILE_SIZE, TILE_SIZE),
            new Stride2D.DenseX(TILE_SIZE)
        );
        var bTile = SharedMemory.Allocate2D<int, Stride2D.DenseX>(
            new Index2D(TILE_SIZE, TILE_SIZE),
            new Stride2D.DenseX(TILE_SIZE)
        );

        // Initialize the accumulation variable for the result of C[global.X, global.Y]
        var sum = 0;

        // Loop over the tiles of A and B matrices
        // The loop increments by TILE_SIZE to process one tile at a time
        for (var i = 0; i < aView.IntExtent.X; i += TILE_SIZE)
        {
            // Load the corresponding tile of A into shared memory
            if (global.X < aView.IntExtent.X && y + i < aView.IntExtent.Y)
                aTile[x, y] = aView[global.X, y + i];
            else
                aTile[x, y] = 0; // Pad with zeros if out of bounds

            // Load the corresponding tile of B into shared memory
            if (x + i < bView.IntExtent.X && global.Y < bView.IntExtent.Y)
                bTile[x, y] = bView[x + i, global.Y];
            else
                bTile[x, y] = 0; // Pad with zeros if out of bounds

            // Ensure all threads in the thread group have finished loading tiles
            Group.Barrier();

            // Perform computation on the tiles
            for (var k = 0; k < TILE_SIZE; k++)
            {
                // Multiply elements of A and B tiles and accumulate the result
                sum += aTile[new Index2D(x, k)] * bTile[new Index2D(k, y)];
            }

            // Synchronize threads before loading the next tile
            Group.Barrier();
        }

        // Write the computed result to the output matrix C
        if (global.X < cView.IntExtent.X && global.Y < cView.IntExtent.Y)
        {
            cView[global] = sum; // Store the result in the appropriate position
        }
    }

    static void Main()
    {
        // Create an ILGPU context (manages devices and resources)
        using (var context = Context.CreateDefault())
        {
            // Select the preferred accelerator (GPU or CPU)
            using (var accelerator = context.GetPreferredDevice(preferCPU: true).CreateAccelerator(context))
            {
                try
                {
                    // Initialize sample input matrices (2x4 and 4x2)
                    int[,] a = {
                        { 1, 2, 3, 4},
                        { 5, 6, 7, 8}
                    };
                    int[,] b = {
                        { 9, 10},
                        { 11, 12},
                        { 13, 14},
                        { 15, 16}
                    };

                    int m = a.GetLength(0);
                    int ka = a.GetLength(1);
                    int kb = b.GetLength(0);
                    int n = b.GetLength(1);

                    int[,] hostResult = new int[m, n];

                    // Define the group size (number of threads per tile) and number of groups
                    Index2D groupSize = new Index2D(TILE_SIZE, TILE_SIZE); // Threads per group (block)
                    Index2D numGroups = new Index2D((m + TILE_SIZE - 1) / TILE_SIZE, (n + TILE_SIZE - 1) / TILE_SIZE); // Total number of thread groups

                    // Allocate device memory for input matrices A, B, and output matrix C
                    MemoryBuffer2D<int, Stride2D.DenseX> aBuffer = accelerator.Allocate2DDenseX<int>(new Index2D(m, ka));
                    MemoryBuffer2D<int, Stride2D.DenseX> bBuffer = accelerator.Allocate2DDenseX<int>(new Index2D(kb, n));
                    MemoryBuffer2D<int, Stride2D.DenseX> cBuffer = accelerator.Allocate2DDenseX<int>(new Index2D(m, n));

                    try
                    {
                        // Copy input matrices from host (CPU) to device (GPU) memory
                        aBuffer.CopyFromCPU(a);
                        bBuffer.CopyFromCPU(b);

                        // Load and precompile the kernel function
                        var loadedKernel = accelerator.LoadStreamKernel<
                            ArrayView2D<int, Stride2D.DenseX>,
                            ArrayView2D<int, Stride2D.DenseX>,
                            ArrayView2D<int, Stride2D.DenseX>>(MatrixMultiplyTiledKernel);

                        // Launch the kernel function on the GPU
                        // Specify the number of thread groups and threads per group
                        loadedKernel((numGroups, groupSize), aBuffer, bBuffer, cBuffer);

                        // Wait for the GPU to complete execution
                        accelerator.Synchronize();

                        // Retrieve the result matrix from GPU memory back to host memory
                        cBuffer.CopyToCPU(hostResult);

                        // Print the result matrix
                        Console.WriteLine("Result of matrix multiplication:");
                        for (int i = 0; i < m; i++)
                        {
                            for (int j = 0; j < n; j++)
                            {
                                Console.Write($"{hostResult[i, j],4} ");
                            }
                            Console.WriteLine();
                        }
                    }
                    finally
                    {
                        // Dispose of GPU resources to free up memory
                        aBuffer.Dispose();
                        bBuffer.Dispose();
                        cBuffer.Dispose();
                    }
                }
                finally
                {
                    // Dispose of the accelerator and context
                    accelerator.Dispose();
                }
            }
        }

        // Wait for user input before closing the console
        Console.ReadLine();
    }
}


Solution

  • I have never used ILGPU before and have no way to try debugging the code you posted, but from first principles alone, this:

        // Loop over the tiles of A and B matrices
        // The loop increments by TILE_SIZE to process one tile at a time
        for (var i = 0; i < aView.IntExtent.X; i += TILE_SIZE)
        {                   ^^^^^^^^^^^^^^^^^   
    

    looks wrong. The tiled loop has to run over the inner (shared) dimension of the product, i.e. the length of the dot products that form the final result (which it must, if it is to produce the correct answer), so surely that loop should be

        // Loop over the tiles of A and B matrices
        // The loop increments by TILE_SIZE to process one tile at a time
        for (var i = 0; i < aView.IntExtent.Y; i += TILE_SIZE)
        { 
             …
    

    ?

    This mistake would cause each dot product to be computed over only two rather than four elements of the inner dimension, which produces exactly the result you are seeing, as the quick check below confirms.
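
    As a quick sanity check (a minimal sketch in plain C#, not ILGPU code), accumulating only the first two of the four inner-dimension terms reproduces your output exactly:

        int[,] a = { { 1, 2, 3, 4 }, { 5, 6, 7, 8 } };
        int[,] b = { { 9, 10 }, { 11, 12 }, { 13, 14 }, { 15, 16 } };

        for (int i = 0; i < 2; i++)           // rows of the result
        {
            for (int j = 0; j < 2; j++)       // columns of the result
            {
                int sum = 0;
                // only 2 of the 4 inner terms, mirroring the i < aView.IntExtent.X bound
                for (int k = 0; k < 2; k++)
                    sum += a[i, k] * b[k, j];
                Console.Write($"{sum,4} ");   // prints 31 34 / 111 122
            }
            Console.WriteLine();
        }

    Changing the inner bound to 4 (the shared dimension) makes the same check print 130 140 / 322 348, matching the WolframAlpha result.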