In experimenting with PyOpenCL, I noticed my code was running slower than expected. It turned out that it ran faster on CPU than on GPU (running on PyOpenCL in both cases, achieving just 1 GFLOP).
To debug this, I then tried naive matrix multiplication as a comparison, and only see a 2x speedup on GPU vs CPU (~20 GFLOPs vs ~10 GFLOPs). My system is i7 8750H + GTX 1070 Max-Q.
Does anyone have any thoughts they could share about what I might be doing wrong? I know that the code below is not optimal, but I would have expected that with the much increased floating point capability and memory bandwidth of my GPU there would be a bigger difference.
import pyopencl as cl
import pyopencl.array as pycl_array
import numpy as np
import numpy.linalg as la
import time
size = 4000
m1 = np.random.normal(size = [size,size]).astype(np.float32)
m2 = np.random.normal(size = [size,size]).astype(np.float32)
ctx = cl.create_some_context(interactive=True)
queue = cl.CommandQueue(ctx)
a = pycl_array.to_device(queue, m1)
b = pycl_array.to_device(queue, m2)
res = pycl_array.empty_like(a)
prg = cl.Program(ctx, """
__kernel void multiplymatrices(const unsigned int size, __global const float * a,
__global const float * b, __global float * res) {
int i = get_global_id(0);
int j = get_global_id(1);
res[size * i + j] = 0;
for (int k = 0; k < size; k++)
{
res[size * i + j] += a[k + size * j] * b[i + size * k];
}
}
""").build()
t = time.time()
task = prg.multiplymatrices(queue, m1.shape, None, np.int32(size), a.data, b.data, res.data)
task.wait()
tot_time = time.time()-t
print("gflops", 2*size**3/(tot_time*1000**3))
Following the suggestion to use a local register to accumulate the results, I modified my code as follows, getting about 90 gflops at about 360 GB/s of memory bandwidth (which is the maximum bandwidth my GPU is capable of). Improving the gflops would require a more sophisticated matrix multiplication algorithm which reuses the same data stored in cache multiple times, but is outside the scope of this question.
__kernel void multiplymatrices(const unsigned int size, __global const float * a,
__global const float * b, __global float * res) {
int i = get_global_id(0);
int j = get_global_id(1);
float temp = 0;
for (int k = 0; k < size; k++)
{
temp += a[k + size * j] * b[i + size * k];
}
res[size * i + j] = temp;
}
EDIT: For those looking for an example of fast matrix multiplication, which showcases using local memory with workgroups as well as 2D register tiling, I have created the below based on the tutorial here. It gets 1.4 TFLOPs on my GPU.
prg4 = cl.Program(ctx, """
__kernel void multiplymatrices(const unsigned int size, __global const float * A,
__global const float * B, __global float * res) {
int ig = get_group_id(0);
int jg = get_group_id(1);
int il = get_local_id(0);
int jl = get_local_id(1);
const int memtile = 64;
const int regtile = 4;
volatile int il2;
volatile int jl2;
int iglob = memtile*ig + regtile*il;
int jglob = memtile*jg + regtile*jl;
__local float Asub[64][64];
__local float Bsub[64][64];
float acc[4][4];
float Areg;
float Breg[4];
for (int k = 0; k < regtile; k++) {
for (int m = 0; m < regtile; m++) {
acc[k][m] = 0;
}
}
for (int l = 0; l < size/memtile; l++) {
for (int k = 0; k < regtile; k++) {
for (int m = 0; m < regtile; m++) {
il2 = il*regtile + k;
jl2 = jl*regtile + m;
Asub[il2][jl2] = A[size*(iglob + k) + memtile*l + jl2];
Bsub[il2][jl2] = B[size*(memtile*l + il2) + jglob + m];
}
}
barrier(CLK_LOCAL_MEM_FENCE);
for (int k = 0; k < regtile; k++) {
for (int r = 0; r < regtile; r++) {
Breg[r] = Bsub[il*regtile+k][jl*regtile+r];
}
for (int m = 0; m < regtile; m++) {
Areg = Asub[il*regtile+m][jl*regtile+k];
for (int r = 0; r < regtile; r++) {
acc[k][m] += Areg*Breg[r];
}
}
}
}
for (int k = 0; k < regtile; k++) {
for (int m = 0; m < regtile; m++) {
res[size*(iglob+k)+jglob+m] = acc[k][m];
}
}
}
""").build()
t = time.time()
memtile = 64
regtile = 4
wgsize = int(memtile/regtile)
global_size = int(size/regtile)
task = prg4.multiplymatrices(queue, (global_size,global_size), (wgsize,wgsize), np.int32(size), a.data, b.data, res.data)
queue.finish()
tot_time = time.time()-t
print("gflops", 2*size**3/(tot_time*1000**3))
print("GB/s total", 2*4*size**3/(tot_time*1000**3))
print("GB/s global", 2*4*size**3/(memtile*tot_time*1000**3))