Search code examples
Tags: opencl, pyopencl

PyOpenCL - not seeing expected speedup


In experimenting with PyOpenCL, I noticed my code was running slower than expected. It turned out that it ran faster on CPU than on GPU (using PyOpenCL in both cases, and achieving only about 1 GFLOPS).

To debug this, I then tried naive matrix multiplication as a comparison, and only see a 2x speedup on GPU vs CPU (~20 GFLOPs vs ~10 GFLOPs). My system is i7 8750H + GTX 1070 Max-Q.

Does anyone have any thoughts they could share about what I might be doing wrong? I know that the code below is not optimal, but I would have expected that with the much increased floating point capability and memory bandwidth of my GPU there would be a bigger difference.

# Reproduction case: naive O(size^3) matrix multiply via PyOpenCL,
# timed end to end to measure achieved GFLOPS on the selected device.
import pyopencl as cl
import pyopencl.array as pycl_array
import numpy as np
import numpy.linalg as la
import time

# Square matrix dimension; total work is 2*size**3 floating-point ops.
size = 4000

# Random float32 operands (single precision is where consumer GPUs are fastest).
m1 = np.random.normal(size = [size,size]).astype(np.float32)
m2 = np.random.normal(size = [size,size]).astype(np.float32)

# interactive=True prompts for a platform/device, so CPU vs GPU can be compared.
ctx = cl.create_some_context(interactive=True)
queue = cl.CommandQueue(ctx)

# Blocking host->device copies: operands are resident before timing starts.
a = pycl_array.to_device(queue, m1)
b = pycl_array.to_device(queue, m2)
res = pycl_array.empty_like(a)

# One work-item per output element; each iteration read-modify-writes res[]
# in __global memory, which is the main reason this kernel is slow (the
# accepted solution below accumulates in a private register instead).
# NOTE(review): with row-major storage, a[k + size*j] * b[i + size*k] sums
# A[j][k]*B[k][i], so res holds the TRANSPOSE of A.B — harmless for a
# random-data benchmark, but confirm before ever checking results.
prg = cl.Program(ctx, """
    __kernel void multiplymatrices(const unsigned int size, __global const float * a, 
    __global const float * b, __global float * res) {

    int i = get_global_id(0); 
    int j = get_global_id(1);

    res[size * i + j] = 0;

    for (int k = 0; k < size; k++)
    {
        res[size * i + j] += a[k + size * j] * b[i + size * k];
    }

    }
    """).build()

t = time.time()
# Global size = (size, size); local (work-group) size left to the implementation.
task = prg.multiplymatrices(queue, m1.shape, None, np.int32(size), a.data, b.data, res.data)

# Block on the kernel event so the wall-clock interval covers actual execution,
# not just the (asynchronous) enqueue.
task.wait()
tot_time = time.time()-t
# 2*size**3 FLOPs (one multiply + one add per inner iteration) / seconds / 1e9.
print("gflops", 2*size**3/(tot_time*1000**3))

Solution

  • Following the suggestion to use a local register to accumulate the results, I modified my code as follows, getting about 90 GFLOPS at about 360 GB/s of memory bandwidth (which is the maximum bandwidth my GPU is capable of). Improving the GFLOPS further would require a more sophisticated matrix multiplication algorithm that reuses the same data stored in cache multiple times, but that is outside the scope of this question.

    /* Naive matrix multiply, one work-item per output element.
     * The dot product is accumulated in a private register and written to
     * __global memory exactly once, instead of read-modify-writing res[]
     * on every iteration of the inner loop. */
    __kernel void multiplymatrices(const unsigned int size, __global const float * a,
    __global const float * b, __global float * res) {

        const int gid0 = get_global_id(0);   /* output index along dimension 0 */
        const int gid1 = get_global_id(1);   /* output index along dimension 1 */

        float acc = 0.0f;                    /* private (register) accumulator */
        for (int t = 0; t < size; t++) {
            acc += a[t + size * gid1] * b[gid0 + size * t];
        }

        res[size * gid0 + gid1] = acc;       /* single global store per work-item */
    }
    

    EDIT: For those looking for an example of fast matrix multiplication, which showcases using local memory with workgroups as well as 2D register tiling, I have created the code below based on the tutorial linked in the original post (the hyperlink was lost when this page was extracted). It gets 1.4 TFLOPS on my GPU.

    # Tiled matrix multiply: each 16x16 work-group stages a 64x64 tile of A and B
    # in __local memory, and each work-item computes a 4x4 block of the result
    # from registers (2D register tiling).
    #
    # NOTE(review): the compute loops as transcribed look garbled relative to a
    # standard tiled multiply: `acc[k][m] += Areg*Breg[r]` accumulates over r
    # into an element whose index does not depend on r; the reduction index
    # `jl*regtile+k` depends on the output-column local id jl; and there is no
    # loop over the full 64-wide reduction dimension of the tile. Verify against
    # the original answer / tutorial before reusing — likely a copy error.
    # NOTE(review): only one barrier(CLK_LOCAL_MEM_FENCE) per tile pass; a second
    # barrier after the compute loops is normally required before the next
    # iteration overwrites Asub/Bsub — confirm against the source tutorial.
    prg4 = cl.Program(ctx, """
    __kernel void multiplymatrices(const unsigned int size, __global const float * A, 
    __global const float * B, __global float * res) {
    
    int ig = get_group_id(0);
    int jg = get_group_id(1);
    int il = get_local_id(0);
    int jl = get_local_id(1);
    
    const int memtile = 64;
    const int regtile = 4;
    volatile int il2;
    volatile int jl2;
    int iglob = memtile*ig + regtile*il;
    int jglob = memtile*jg + regtile*jl;
    
    __local float Asub[64][64];
    __local float Bsub[64][64];
    float acc[4][4];
    float Areg;
    float Breg[4];
    
    for (int k = 0; k < regtile; k++) {
        for (int m = 0; m < regtile; m++) {
            acc[k][m] = 0;
        }
    }
    
    for (int l = 0; l < size/memtile; l++) {
        for (int k = 0; k < regtile; k++) {
            for (int m = 0; m < regtile; m++) {
                il2 = il*regtile + k;
                jl2 = jl*regtile + m;
                Asub[il2][jl2] = A[size*(iglob + k) + memtile*l + jl2];
                Bsub[il2][jl2] = B[size*(memtile*l + il2) + jglob + m];
            }
        }
        barrier(CLK_LOCAL_MEM_FENCE);
    
        for (int k = 0; k < regtile; k++) {
            for (int r = 0; r < regtile; r++) {
                Breg[r] = Bsub[il*regtile+k][jl*regtile+r];
            }
            for (int m = 0; m < regtile; m++) {
                Areg = Asub[il*regtile+m][jl*regtile+k];
                for (int r = 0; r < regtile; r++) {
                    acc[k][m] += Areg*Breg[r];
                }
            }
        }
    }
    for (int k = 0; k < regtile; k++) {
        for (int m = 0; m < regtile; m++) {
            res[size*(iglob+k)+jglob+m] = acc[k][m];
        }
    }
    }
    """).build()
    
    t = time.time()
    # Tiling parameters must match the constants hard-coded in the kernel above.
    memtile = 64                      # local-memory tile edge (64x64 floats)
    regtile = 4                       # per-work-item register tile edge (4x4)
    wgsize = int(memtile/regtile)     # 16x16 work-items per work-group
    global_size = int(size/regtile)   # each work-item covers a 4x4 output block
    # NOTE(review): assumes size is divisible by memtile (4000/64 truncates in
    # the kernel's size/memtile loop) — confirm intended for this benchmark.
    task = prg4.multiplymatrices(queue, (global_size,global_size), (wgsize,wgsize), np.int32(size), a.data, b.data, res.data)
    
    # Drain the queue so the timed interval includes kernel completion.
    queue.finish()
    tot_time = time.time()-t
    print("gflops", 2*size**3/(tot_time*1000**3))
    # 4 bytes per float, two operand reads per multiply-add -> naive traffic.
    print("GB/s total", 2*4*size**3/(tot_time*1000**3))
    # Global traffic is reduced by the tile reuse factor (memtile).
    print("GB/s global", 2*4*size**3/(memtile*tot_time*1000**3))