Based on my reading of the PyCUDA documentation, the bundled samples, and the CUDA book by Kirk and Hwu, I have successfully implemented a CUDA C complex matrix multiplication program and have also written a version in PyCUDA. The C code produces correct results, but the Python code doesn't.
To be clear, the Python code is taken almost directly from the samples (MatrixMulTiled) and has been modified to handle complex numbers using cuFloatComplex from "cuComplex.h". Before this modification, it was correctly multiplying real-valued matrices.
So I can't figure out the error. The Python code is:
# attempt to do matrix multiplication for complex numbers
import pycuda.autoinit
from pycuda import driver, compiler, gpuarray, tools
import numpy as np
from time import time
kernel_code_template = """
#include <cuComplex.h>
__global__ void MatrixMulKernel(cuFloatComplex *A, cuFloatComplex *B, cuFloatComplex *C)
{
    const uint wA = %(MATRIX_SIZE)s;
    const uint wB = %(MATRIX_SIZE)s;

    // Block index
    const uint bx = blockIdx.x;
    const uint by = blockIdx.y;
    // Thread index
    const uint tx = threadIdx.x;
    const uint ty = threadIdx.y;

    // Index of the first sub-matrix of A processed by the block
    const uint aBegin = wA * %(BLOCK_SIZE)s * by;
    // Index of the last sub-matrix of A processed by the block
    const uint aEnd = aBegin + wA - 1;
    // Step size used to iterate through the sub-matrices of A
    const uint aStep = %(BLOCK_SIZE)s;

    // Index of the first sub-matrix of B processed by the block
    const uint bBegin = %(BLOCK_SIZE)s * bx;
    // Step size used to iterate through the sub-matrices of B
    const uint bStep = %(BLOCK_SIZE)s * wB;

    // The element of the block sub-matrix that is computed by the thread
    cuFloatComplex Csub = make_cuFloatComplex(0, 0);

    // Loop over all the sub-matrices of A and B required to compute the block sub-matrix
    for (int a = aBegin, b = bBegin; a <= aEnd; a += aStep, b += bStep)
    {
        // Shared memory for the sub-matrix of A
        __shared__ cuFloatComplex As[%(BLOCK_SIZE)s][%(BLOCK_SIZE)s];
        // Shared memory for the sub-matrix of B
        __shared__ cuFloatComplex Bs[%(BLOCK_SIZE)s][%(BLOCK_SIZE)s];

        // Load the matrices from global memory to shared memory;
        // each thread loads one element of each matrix
        As[ty][tx] = make_cuFloatComplex(cuCrealf(A[a + wA*ty + tx]), cuCimagf(A[a + wA*ty + tx]));
        Bs[ty][tx] = make_cuFloatComplex(cuCrealf(B[b + wB*ty + tx]), cuCimagf(B[b + wB*ty + tx]));

        // Synchronize to make sure the matrices are loaded
        __syncthreads();

        // Multiply the two matrices together;
        // each thread computes one element of the block sub-matrix
        for (int k = 0; k < %(BLOCK_SIZE)s; ++k)
        {
            Csub = cuCaddf(Csub, cuCmulf(As[ty][k], Bs[k][tx]));
        }

        // Synchronize to make sure that the preceding computation
        // is done before loading two new sub-matrices of A and B in the next iteration
        __syncthreads();
    }

    // Write the block sub-matrix to global memory;
    // each thread writes one element
    const uint c = wB * %(BLOCK_SIZE)s * by + %(BLOCK_SIZE)s * bx;
    C[c + wB*ty + tx] = make_cuFloatComplex(cuCrealf(Csub), cuCimagf(Csub));
}
"""
MATRIX_SIZE = 4
TILE_SIZE = 2
BLOCK_SIZE = TILE_SIZE
a_cpu = np.zeros(shape=(MATRIX_SIZE,MATRIX_SIZE)).astype(np.complex)
b_cpu = np.zeros(shape=(MATRIX_SIZE,MATRIX_SIZE)).astype(np.complex)
a_cpu[:,:] = 1 + 1j*0
b_cpu[:,:] = 1 + 1j*2
# compute reference on the CPU to verify GPU computation
t1 = time()
c_cpu = np.dot(a_cpu, b_cpu)
t2 = time()
t_cpu = t2-t1
# transfer host (CPU) memory to device (GPU) memory
a_gpu = gpuarray.to_gpu(a_cpu)
b_gpu = gpuarray.to_gpu(b_cpu)
# create an empty gpuarray for the result (C = A * B)
c_gpu = gpuarray.empty((MATRIX_SIZE, MATRIX_SIZE), np.complex)
# get the kernel code from the template
# by specifying the constant MATRIX_SIZE
kernel_code = kernel_code_template % {
'MATRIX_SIZE': MATRIX_SIZE,
'BLOCK_SIZE': BLOCK_SIZE,
}
# compile the kernel code
mod = compiler.SourceModule(kernel_code)
# get the kernel function from the compiled module
matrixmul = mod.get_function("MatrixMulKernel")
# call the kernel on the card
t1 = time()
matrixmul(
    # inputs
    a_gpu, b_gpu,
    # output
    c_gpu,
    # grid of multiple blocks
    grid=(MATRIX_SIZE // TILE_SIZE, MATRIX_SIZE // TILE_SIZE),
    # block of multiple threads
    block=(TILE_SIZE, TILE_SIZE, 1),
)
t2 = time()
t_gpu = t2-t1
# print the results
print("-" * 80)
print("Matrix A (GPU): ")
print(a_gpu.get())
print("-" * 80)
print("Matrix B (GPU): ")
print(b_gpu.get())
print("-" * 80)
print("Matrix C (GPU): ")
print(c_gpu.get())
print("-" * 80)
print("Matrix C (CPU): ")
print(c_cpu)
print("-" * 80)
print("CPU-GPU Difference: ")
print(c_cpu-c_gpu.get())
print("CPU Time ", t_cpu)
print("GPU Time ", t_gpu)
print(np.allclose(c_cpu, c_gpu.get()))
The C code is:
#include <stdio.h>
#include <stdlib.h>
#include <cuComplex.h>
/* __global__ void MatrixMulKernel(...)
* This is the main Matrix Multiplication Kernel
* It is executed on the device (GPU).
*/
__global__ void MatrixMulKernel(cuFloatComplex *Md, cuFloatComplex *Nd, cuFloatComplex *Pd, int Width, int TILE_WIDTH)
{
    // Calculate the row index of the Pd element and M
    int Row = blockIdx.y*TILE_WIDTH + threadIdx.y;
    // Calculate the column index of the Pd element and N
    int Col = blockIdx.x*TILE_WIDTH + threadIdx.x;

    cuFloatComplex Pvalue = make_cuFloatComplex(0, 0);
    // each thread computes one element of the block sub-matrix
    for (int k = 0; k < Width; k++)
    {
        Pvalue = cuCaddf(Pvalue, cuCmulf(Md[Row*Width + k], Nd[k*Width + Col]));
    }
    Pd[Row*Width + Col] = make_cuFloatComplex(cuCrealf(Pvalue), cuCimagf(Pvalue));
}
/* void MatrixMultiplication(...)
* This is the stub function for the matrix multiplication kernel
* It is executed on the host. It takes inputs from the main() function
* and declares memory, copies data to the device, invokes the kernel
* and copies the result from the device back to the host.
*/
void MatrixMultiplication(cuFloatComplex *M, cuFloatComplex *N, cuFloatComplex *P, int Width, int TILE_WIDTH)
{
    int size = Width*Width*sizeof(cuFloatComplex);
    cuFloatComplex *Md, *Nd, *Pd;

    // Transfer M and N to device memory
    cudaMalloc((void**) &Md, size);
    cudaMemcpy(Md, M, size, cudaMemcpyHostToDevice);
    cudaMalloc((void**) &Nd, size);
    cudaMemcpy(Nd, N, size, cudaMemcpyHostToDevice);

    // Allocate P on the device
    cudaMalloc((void**) &Pd, size);

    // Set up the execution configuration
    dim3 dimGrid(Width/TILE_WIDTH, Width/TILE_WIDTH);
    dim3 dimBlock(TILE_WIDTH, TILE_WIDTH);

    // Launch the device computation kernel
    MatrixMulKernel<<<dimGrid,dimBlock>>>(Md, Nd, Pd, Width, TILE_WIDTH);

    // Transfer P from device to host
    cudaMemcpy(P, Pd, size, cudaMemcpyDeviceToHost);

    // Free device matrices
    cudaFree(Md);
    cudaFree(Nd);
    cudaFree(Pd);
}
/* void printMatrix(..)
* This is a function used to print the matrix
*/
void printMatrix(cuFloatComplex *M, int Width)
{
    for (int i = 0; i < Width; i++)
    {
        for (int j = 0; j < Width; j++)
        {
            printf(" %f+i%f", cuCrealf(M[i*Width + j]), cuCimagf(M[i*Width + j]));
        }
        printf("\n");
    }
}
int main()
{
    /* Define dimension of matrix (Width x Width) */
    int Width = 4;
    /* Define dimension of tile;
     * TILE_WIDTH*TILE_WIDTH must not exceed the device's
     * threads-per-block limit (512 on older GPUs)
     */
    int TILE_WIDTH = 2;

    /* Define pointers for row-major matrices */
    cuFloatComplex *M, *N, *P;
    M = (cuFloatComplex *)malloc(Width*Width*sizeof(cuFloatComplex));
    N = (cuFloatComplex *)malloc(Width*Width*sizeof(cuFloatComplex));
    P = (cuFloatComplex *)malloc(Width*Width*sizeof(cuFloatComplex));

    /* Fill matrices arbitrarily */
    for (int i = 0; i < Width; i++)
    {
        for (int j = 0; j < Width; j++)
        {
            M[i*Width + j] = make_cuFloatComplex(1, 0);
            N[i*Width + j] = make_cuFloatComplex(1, 2);
            P[i*Width + j] = make_cuFloatComplex(0, 0);
        }
    }

    /* Print the input matrices using the helper function */
    printf("Matrix M is \n\n");
    printMatrix(M, Width);
    printf("\nMatrix N is \n\n");
    printMatrix(N, Width);

    /* Call the stub function for matrix multiplication */
    MatrixMultiplication(M, N, P, Width, TILE_WIDTH);
    printf("\nMatrix P is \n\n");
    printMatrix(P, Width);

    free(M);
    free(N);
    free(P);
    return 0;
}
The output of the Python code is:
Matrix C (GPU):
[[ 1.59878214e-314 +1.59926782e-314j 1.59878214e-314 +1.59926782e-314j
1.59878214e-314 +1.59926782e-314j 1.59878214e-314 +1.59926782e-314j]
[ 1.59878214e-314 +1.59926782e-314j 1.59878214e-314 +1.59926782e-314j
1.59878214e-314 +1.59926782e-314j 1.59878214e-314 +1.59926782e-314j]
[ -9.01080877e+306 -5.19870527e+306j -1.45379609e+307 -8.65694841e+306j
-4.14125486e+306 -2.15325816e+306j -5.83708063e+306 -3.25935506e+306j]
[ -1.44828853e+306 -1.44828853e+306j -2.32949855e+306 -2.32949855e+306j
-3.78945180e+306 -3.78945180e+306j -6.54203686e+306 -6.54203686e+306j]]
--------------------------------------------------------------------------------
Matrix C (CPU):
[[ 4.+8.j 4.+8.j 4.+8.j 4.+8.j]
[ 4.+8.j 4.+8.j 4.+8.j 4.+8.j]
[ 4.+8.j 4.+8.j 4.+8.j 4.+8.j]
[ 4.+8.j 4.+8.j 4.+8.j 4.+8.j]]
The C output is:
Matrix P is
4.000000+i8.000000 4.000000+i8.000000 4.000000+i8.000000 4.000000+i8.000000
4.000000+i8.000000 4.000000+i8.000000 4.000000+i8.000000 4.000000+i8.000000
4.000000+i8.000000 4.000000+i8.000000 4.000000+i8.000000 4.000000+i8.000000
4.000000+i8.000000 4.000000+i8.000000 4.000000+i8.000000 4.000000+i8.000000
I'd appreciate it if someone could point out the error in my Python code. I am trying to meet deadlines for my thesis, and all the rest of my code is in Python, so I have no time to port it to C.
Thanks!
======================
This is a precision problem, or more precisely a type mismatch: the kernel operates on cuFloatComplex (a pair of 32-bit floats, i.e. NumPy's complex64), while np.complex is an alias for Python's complex, i.e. complex128, so the kernel was misreading the 16-byte array elements as 8-byte ones. I fixed it by replacing the host transfer and empty matrix creation code with the following...
# transfer host (CPU) memory to device (GPU) memory
a_gpu = gpuarray.to_gpu(a_cpu.astype(np.complex64))
b_gpu = gpuarray.to_gpu(b_cpu.astype(np.complex64))
# create an empty gpuarray for the result (C = A * B)
c_gpu = gpuarray.empty((MATRIX_SIZE, MATRIX_SIZE), np.complex64)
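As a quick sanity check before launching the kernel (a minimal sketch using the arrays defined above; the itemsizes are standard NumPy facts), you can verify that everything handed to the kernel has the 8-byte element type that cuFloatComplex implies:
import numpy as np
# cuFloatComplex is a pair of 32-bit floats -> 8 bytes per element;
# np.complex is an alias for complex128 -> 16 bytes per element
print(np.dtype(np.complex64).itemsize)    # 8  (matches cuFloatComplex)
print(np.dtype(np.complex128).itemsize)   # 16 (what the original code was sending)
assert a_gpu.dtype == b_gpu.dtype == c_gpu.dtype == np.complex64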
Alternatively, if you actually need double precision, cuComplex.h also provides cuDoubleComplex with the cuCadd/cuCmul family, which matches complex128; switching the kernel to those types should let you keep the original NumPy arrays. Hope this is helpful.
But I do find a problem when using PyCUDA to multiply large complex-valued matrices. For instance, taking MATRIX_SIZE = 5040 and TILE_SIZE = 16 (which may not be a good choice from a hardware standpoint), CUDA C does manage to multiply the matrices, but PyCUDA crashes. Why should this happen?
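In case it helps narrow this down, here is a minimal debugging sketch (my assumption, not verified: the crash surfaces as an asynchronous CUDA launch/execution error). Forcing a synchronization right after the launch makes PyCUDA raise the underlying CUDA error at a known point instead of crashing later:
# launch as before, but with an explicit integer grid
matrixmul(a_gpu, b_gpu, c_gpu,
          grid=(MATRIX_SIZE // TILE_SIZE, MATRIX_SIZE // TILE_SIZE),
          block=(TILE_SIZE, TILE_SIZE, 1))
try:
    # kernel launches are asynchronous; this blocks until the kernel
    # finishes and re-raises any pending CUDA error as a Python exception
    driver.Context.synchronize()
except driver.Error as e:
    print("CUDA reported:", e)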