PyOpenCL - not seeing expected speedup

In experimenting with PyOpenCL, I noticed my code was running slower than expected. It turned out that it ran faster on CPU than on GPU (running on PyOpenCL in both cases, achieving just 1 GFLOP).

To debug this, I then tried naive matrix multiplication as a comparison, and only see a 2x speedup on GPU vs CPU (~20 GFLOPs vs ~10 GFLOPs). My system is i7 8750H + GTX 1070 Max-Q.

Does anyone have any thoughts they could share about what I might be doing wrong? I know that the code below is not optimal, but I would have expected that with the much increased floating point capability and memory bandwidth of my GPU there would be a bigger difference.

import pyopencl as cl
import pyopencl.array as pycl_array
import numpy as np
import numpy.linalg as la
import time

size = 4000

m1 = np.random.normal(size = [size,size]).astype(np.float32)
m2 = np.random.normal(size = [size,size]).astype(np.float32)

ctx = cl.create_some_context(interactive=True)
queue = cl.CommandQueue(ctx)

a = pycl_array.to_device(queue, m1)
b = pycl_array.to_device(queue, m2)
res = pycl_array.empty_like(a)

prg = cl.Program(ctx, """
    __kernel void multiplymatrices(const unsigned int size, __global const float * a, 
    __global const float * b, __global float * res) {

    int i = get_global_id(0); 
    int j = get_global_id(1);

    res[size * i + j] = 0;

    for (int k = 0; k < size; k++)
    {
        res[size * i + j] += a[k + size * j] * b[i + size * k];
    }

    }
    """).build()

t = time.time()
task = prg.multiplymatrices(queue, m1.shape, None, np.int32(size), a.data, b.data, res.data)

task.wait()
tot_time = time.time()-t
print("gflops", 2*size**3/(tot_time*1000**3))

Solution

Following the suggestion to use a local register to accumulate the results, I modified my code as follows, getting about 90 gflops at about 360 GB/s of memory bandwidth (which is the maximum bandwidth my GPU is capable of). Improving the gflops would require a more sophisticated matrix multiplication algorithm which reuses the same data stored in cache multiple times, but is outside the scope of this question.

__kernel void multiplymatrices(const unsigned int size, __global const float * a, 
__global const float * b, __global float * res) {

int i = get_global_id(0); 
int j = get_global_id(1);
float temp = 0;

for (int k = 0; k < size; k++)
{
    temp += a[k + size * j] * b[i + size * k];
}
res[size * i + j] = temp;
}

EDIT: For those looking for an example of fast matrix multiplication, which showcases using local memory with workgroups as well as 2D register tiling, I have created the below based on the tutorial here. It gets 1.4 TFLOPs on my GPU.

prg4 = cl.Program(ctx, """
__kernel void multiplymatrices(const unsigned int size, __global const float * A, 
__global const float * B, __global float * res) {

int ig = get_group_id(0);
int jg = get_group_id(1);
int il = get_local_id(0);
int jl = get_local_id(1);

const int memtile = 64;
const int regtile = 4;
volatile int il2;
volatile int jl2;
int iglob = memtile*ig + regtile*il;
int jglob = memtile*jg + regtile*jl;

__local float Asub[64][64];
__local float Bsub[64][64];
float acc[4][4];
float Areg;
float Breg[4];

for (int k = 0; k < regtile; k++) {
    for (int m = 0; m < regtile; m++) {
        acc[k][m] = 0;
    }
}

for (int l = 0; l < size/memtile; l++) {
    for (int k = 0; k < regtile; k++) {
        for (int m = 0; m < regtile; m++) {
            il2 = il*regtile + k;
            jl2 = jl*regtile + m;
            Asub[il2][jl2] = A[size*(iglob + k) + memtile*l + jl2];
            Bsub[il2][jl2] = B[size*(memtile*l + il2) + jglob + m];
        }
    }
    barrier(CLK_LOCAL_MEM_FENCE);

    for (int k = 0; k < regtile; k++) {
        for (int r = 0; r < regtile; r++) {
            Breg[r] = Bsub[il*regtile+k][jl*regtile+r];
        }
        for (int m = 0; m < regtile; m++) {
            Areg = Asub[il*regtile+m][jl*regtile+k];
            for (int r = 0; r < regtile; r++) {
                acc[k][m] += Areg*Breg[r];
            }
        }
    }
}
for (int k = 0; k < regtile; k++) {
    for (int m = 0; m < regtile; m++) {
        res[size*(iglob+k)+jglob+m] = acc[k][m];
    }
}
}
""").build()

t = time.time()
memtile = 64
regtile = 4
wgsize = int(memtile/regtile)
global_size = int(size/regtile)
task = prg4.multiplymatrices(queue, (global_size,global_size), (wgsize,wgsize), np.int32(size), a.data, b.data, res.data)

queue.finish()
tot_time = time.time()-t
print("gflops", 2*size**3/(tot_time*1000**3))
print("GB/s total", 2*4*size**3/(tot_time*1000**3))
print("GB/s global", 2*4*size**3/(memtile*tot_time*1000**3))