I am trying to learn PyOpenCl. I am following various tutorials/examples that I found online, and I have been trying to put together a simple matrix multiplication. I fail to see why I cannot get the right results: it seems to me that the for cycle in my kernel is not being executed (the output C_flat is always zeros) or maybe I am managing in the wrong way some memory. Can anyone give me any suggestions? Thanks a lot!
Here is the code:
import numpy as np
import pyopencl as cl
import time
def create_input_memory(context, input_arrays):
return [(array, cl.Buffer(context, flags=cl.mem_flags.READ_ONLY, size=array.nbytes))
for array in input_arrays]
def create_output_memory(context, output_arrays):
return [(array, cl.Buffer(context, flags=cl.mem_flags.WRITE_ONLY, size=array.nbytes))
for array in output_arrays]
def matrix_multiply_gpu(A, B):
A_height, A_width = A.shape[0], A.shape[1]
B_height, B_width = B.shape[0], B.shape[1]
C = np.zeros((A_height, B_width))
A_flat = A.flatten()
B_flat = B.flatten()
C_flat = C.flatten()
print(C_flat)
kernel_source = """
kernel void mul(int Wa, int Ha, int Wb, int Hb,
global float *input_a,
global float *input_b,
global float *result){
/* ROW MAJOR notation (I imagine the "GPU matrix") --> no, just model*/
int row = get_global_id(0);
int col = get_global_id(1);
float sum = 0.0f;
for (int i = 0; i < Wa; i++){
sum += input_a[row * Wa + i] * input_b[i * Wb + col];
}
result[row * Wb + col] = sum;
}
"""
platforms = cl.get_platforms()
context = cl.Context(dev_type=cl.device_type.GPU,
properties=[(cl.context_properties.PLATFORM, platforms[0])])
gpu_program_source = cl.Program(context, kernel_source)
gpu_program = gpu_program_source.build()
input_tuples = create_input_memory(context,
(A_flat, B_flat))
output_tuples = create_output_memory(context, (C_flat,))
gpu_queue = cl.CommandQueue(context)
kernel_arguments = [buffer for (_,buffer) in input_tuples]
kernel_arguments += [buffer for (_,buffer) in output_tuples]
gpu_program.mul(gpu_queue, (1024,), (32,),
np.int32(A_height), np.int32(A_width), np.int32(B_height),
np.int32(B_width), *kernel_arguments)
for (array, buffer) in output_tuples:
cl.enqueue_copy(gpu_queue, src=buffer, dest=array)
#wait for everyone to finish
gpu_queue.finish()
return C_flat
if __name__=='__main__':
A, B = np.ones((100, 100)), np.ones((100, 100))
C = matrix_multiply_gpu(A, B)
print("\n", C, "\n")
Here are some of the issues with your code:
create_...
functions.dtype
of the data you are sending, and make sure it matches with what your kernel expects. I have spent countless hours debugging just because of this.Here is the code:
import numpy as np
import pyopencl as cl
import pyopencl.array
def matrix_multiply_gpu(A, B):
# ----------------------------this is your code, with minor changes
A_height, A_width = A.shape
B_height, B_width = B.shape
C = np.empty((A_height, B_width), dtype=np.float32) # some changes
A_flat = A.flatten()
B_flat = B.flatten()
C_flat = C.flatten()
platforms = cl.get_platforms()
context = cl.Context(dev_type=cl.device_type.GPU,
properties=[(cl.context_properties.PLATFORM, platforms[0])])
gpu_queue = cl.CommandQueue(context)
# ------------------------------------------------------------------
# --------------------------------------This is new or modified code
size = A_height * B_width
kernel_source = """
kernel void mul(int Ha, int Wa, int Hb, int Wb,
global const float *input_a,
global const float *input_b,
global float *result ){
int gid = get_global_id(0);
int Arow = gid / Wb;
int Bcol = gid % Wb;
float sum = 0.0f;
for (int i = 0; i < Wa; i++)
sum += input_a[Arow*Wa+i] * input_b[i*Wb+Bcol];
result[gid] = sum;
}
"""
Ad = cl.array.to_device(gpu_queue, A_flat)
Bd = cl.array.to_device(gpu_queue, B_flat)
Cd = cl.array.to_device(gpu_queue, C_flat)
# --------------------------------- and the return is different too
# -------------------------------your code again with minor changes
gpu_program_source = cl.Program(context, kernel_source)
gpu_program = gpu_program_source.build()
gpu_program.mul(gpu_queue, (size,), None, # some changes here
np.int32(A_height), np.int32(A_width),
np.int32(B_height), np.int32(B_width),
Ad.data, Bd.data, Cd.data)
# -----------------------------------------------------------------
return Cd.get().reshape((A_height, B_width))
# or send it flattened if you wish, without the reshape
if __name__=='__main__':
from pprint import pprint
A = 2.0*np.ones((5, 3), dtype=np.float32)
B = 3.0*np.ones((3, 4), dtype=np.float32)
C = matrix_multiply_gpu(A, B)
pprint(C)