Tags: python, performance, opencl, cython, pyopencl

OpenCL running slower than Cython (on CPU)


I have a program to calculate the potential/force of a given data point (pos) against the rest of the data. It was originally written in Cython, and I tried using PyOpenCL (device set to the Intel(R) Core(TM) i7-4750HQ CPU @ 2.00GHz on my 2013 MacBook Pro) hoping to bump up the speed, but the result is actually much slower than Cython. Note that the Cython version is double precision while the CL version is only single-precision float; the results are confirmed to be equal.
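
For reference (reading from the kernel below), each work item computes the per-point contribution res[i] = (X[i] - pos) * exp(-||pos - X[i]||^2 / sigma^2) / sigma^2, while the Cython version accumulates these contributions into a single dim-length force vector.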

The IPython notebook is as follows. For a 2,000,000 x 2 data set, PyOpenCL took 176 ms while Cython only used 82 ms. Is there a way to optimise and reduce the overhead? Many thanks.


from __future__ import division
import numpy as np
import pyopencl as cl
import pyopencl.array
import math
import time
%load_ext pyopencl.ipython_ext
%load_ext Cython
%pylab inline

# prepare data
datad = np.random.rand(2000000,2)-[0.5,  0.5] # Double
data = datad.astype(np.float32)
N, dim = data.shape[0], data.shape[1]
sigma = 0.04
i = 2
pos = np.array(data[i,:]) # float
posd = np.array(datad[i,:]) #double 
dt = 0.005
resistant = 0.9995

kernelsource = """
__kernel void forceFinder(
    const int N,
    const int dim,
    const float sigma,
    __global float* datacl,
    __global float* poscl,
    __global float* res)
{
    int i = get_global_id(0); // Global id;
    float f_sum ;
    int k;
    float sigma2 = sigma * sigma;
    if (i < N) {
        f_sum = 0.0;
        for (k = 0; k < dim; k++)
        {
            f_sum += (poscl[k] - datacl[i * dim + k]) * (poscl[k] - datacl[i * dim + k]);
        }
        for (k = 0; k < dim; k++)
        { 
            res[i * dim + k] =  (datacl[i * dim + k] - poscl[k]) * exp(-f_sum/sigma2)/sigma2;
        }
    }
}
"""

# Setup PyOpenCl
platform = cl.get_platforms()[0]
device = platform.get_devices()[0] # Get the first device (the CPU in this setup)
ctx = cl.Context([device])   # Create a context on that device
queue = cl.CommandQueue(ctx) # Create a command queue for the target device.
program = cl.Program(ctx, kernelsource).build()
size = N * dim
datacl = data.reshape((size,))
rescl = np.empty(size).astype(np.float32)
rescl.fill(0.0)
datacl_buf = cl.Buffer(ctx, cl.mem_flags.READ_ONLY | cl.mem_flags.COPY_HOST_PTR, hostbuf = datacl)
pos_buf = cl.Buffer(ctx, cl.mem_flags.READ_ONLY | cl.mem_flags.COPY_HOST_PTR, hostbuf = pos)
rescl_buf = cl.Buffer(ctx, cl.mem_flags.WRITE_ONLY | cl.mem_flags.COPY_HOST_PTR, hostbuf = rescl)
forceFinder = program.forceFinder
forceFinder.set_scalar_arg_dtypes([np.uint32, np.uint32, np.float32, None, None, None])
globalrange = (N, dim)
localrange = None

# Run CL
t0 = time.time()
forceFinder(queue, globalrange, localrange, N, dim, sigma, datacl_buf, \
            pos_buf, rescl_buf)
queue.finish()
cl.enqueue_copy(queue, rescl, rescl_buf)  
result = rescl.reshape((N,dim))
t1 = time.time()
print (t1-t0)

# Now Cython

%%cython
cimport numpy as np
import numpy as np
from libc.stdlib cimport rand, malloc, free
from libc.math cimport exp
def PTSM(np.ndarray[np.float64_t, ndim = 1] position, np.ndarray[np.float64_t, ndim = 2] X,\
          double sigma=0.25,\
         double dt=0.01, double r=0.99, int Nsamp=1000):
    cdef int N, dim
    N, dim = X.shape[0], X.shape[1]
    cdef int i,j, steps  # These 3 are for iterations. 
    cdef double sigma2
    cdef double force1p_sum = 0  # initialise before accumulating
    cdef double *force = <double *>malloc(dim * sizeof(double))
    sigma2 = sigma * sigma
    #--------------------
       # Force
#     for steps in range(Nsamp): 
    for i in range(dim):
        force[i] = 0
    for j in range (N):
        for i in range (dim):
            force1p_sum  +=   (position[i] - X[j,i]) * (position[i] - X[j,i])       
        for i in range (dim):          
            force[i] += ( X[j,i] - position[i]) * exp(- force1p_sum /sigma2) / sigma2   
        force1p_sum = 0
    resultForce = np.zeros(dim)
    for i in range(dim): 
        resultForce[i] = force[i]  
    free(force)
    return resultForce

t0 = time.time()
f = PTSM(posd, datad, sigma, dt, resistant)
t1 = time.time()
print (t1 - t0)
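
The equality check mentioned above can be done by summing the per-point OpenCL contributions before comparing, for example (the tolerance here is an assumption, given float32 vs float64 precision):

# sum the per-point contributions from the kernel and compare with Cython's total force
np.allclose(result.sum(axis=0), f, atol=1e-4)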

Solution

  • Your global range is globalrange = (N, dim), while internally you only use get_global_id(0) and loop over dim in a for loop.

    So you are effectively doing N*dim*dim operations instead of N*dim: an extra factor of dim that does not change the output (the dim work items for a given i do the same work and write the same values). That matches the timings: 176 ms vs 82 ms, almost exactly double, since you have the same hardware and the same device utilisation with both methods.


    Also, some optimizations:

    • I wouldn't call queue.finish() before the copy, since that causes an implicit block on the CL device.

    • This:

      f_sum += (poscl[k] - datacl[i * dim + k]) * (poscl[k] - datacl[i * dim + k]);

      • Should be (to avoid extra reads of the same global values):

      f_sum += pown(poscl[k] - datacl[i * dim + k], 2);

    • Change the data layout to get coalesced access. At the moment each work item accesses an i*k matrix in small strides, while a matrix stored in dim-major order provides coalesced access. Change the indexing from i*k to k*i (a sketch of this change is given after the modified code below).

    • poscl should be __constant (read-only).

    • You should calculate poscl - datacl first and store it in a private array, then use it in the two loops, avoiding extra global reads.


    Modified code (not tested). NOTE: I didn't add the matrix ordering change.

    # prepare data
    datad = np.random.rand(2000000,2)-[0.5,  0.5] # Double
    data = datad.astype(np.float32)
    N, dim = data.shape[0], data.shape[1]
    sigma = 0.04
    i = 2
    pos = np.array(data[i,:]) # float
    posd = np.array(datad[i,:]) #double 
    dt = 0.005
    resistant = 0.9995
    
    kernelsource = """
    __kernel void forceFinder(
        const int N,
        const int dim,
        const float sigma,
        __global float* datacl,
        __constant float* poscl,
        __global float* res)
    {
        int i = get_global_id(0); // Global id;
        float f_sum ;
        int k;
        float sigma2 = sigma * sigma;
        if (i < N) {
            f_sum = 0.0;
            float t[2]; //INCREASE TO THE MAX "DIM" POSSIBLE
            for (k = 0; k < dim; k++){
                t[k] = poscl[k] - datacl[i * dim + k];
            }
            for (k = 0; k < dim; k++){
                f_sum += pown(t[k], 2);
            }
            for (k = 0; k < dim; k++){
                res[i * dim + k] = (-t[k]) * exp(-f_sum/sigma2)/sigma2;
            }
        }
    }
    """
    
    # Setup PyOpenCl
    platform = cl.get_platforms()[0]
    device = platform.get_devices()[0] # Get the first device (the CPU in this setup)
    ctx = cl.Context([device])   # Create a context on that device
    queue = cl.CommandQueue(ctx) # Create a command queue for the target device.
    program = cl.Program(ctx, kernelsource).build()
    size = N * dim
    datacl = data.reshape((size,))
    rescl = np.empty(size).astype(np.float32)
    rescl.fill(0.0)
    datacl_buf = cl.Buffer(ctx, cl.mem_flags.READ_ONLY | cl.mem_flags.COPY_HOST_PTR, hostbuf = datacl)
    pos_buf = cl.Buffer(ctx, cl.mem_flags.READ_ONLY | cl.mem_flags.COPY_HOST_PTR, hostbuf = pos)
    rescl_buf = cl.Buffer(ctx, cl.mem_flags.WRITE_ONLY, rescl.nbytes)
    forceFinder = program.forceFinder
    forceFinder.set_scalar_arg_dtypes([np.uint32, np.uint32, np.float32, None, None, None])
    globalrange = (N, 1)
    localrange = None
    
    # Run CL
    t0 = time.time()
    forceFinder(queue, globalrange, localrange, N, dim, sigma, datacl_buf, \
                pos_buf, rescl_buf)
    cl.enqueue_copy(queue, rescl, rescl_buf)  
    queue.finish()
    result = rescl.reshape((N,dim))
    t1 = time.time()
    print (t1-t0)
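
    For the matrix ordering change I left out, here is a rough sketch (also not tested): store the data in dim-major order on the host so that consecutive work items read consecutive memory locations, and index with k * N + i inside the kernel. Variable names follow the code above; the host-side reshaping lines are an assumption about how you would lay the data out.

    # Host side: transpose to dim-major order so element (i, k) lives at index k * N + i
    datacl = np.ascontiguousarray(data.T).reshape((size,))

    kernelsource = """
    __kernel void forceFinder(
        const int N,
        const int dim,
        const float sigma,
        __global float* datacl,
        __constant float* poscl,
        __global float* res)
    {
        int i = get_global_id(0);
        int k;
        float f_sum = 0.0f;
        float t[2]; // increase to the max "dim" possible
        float sigma2 = sigma * sigma;
        if (i < N) {
            for (k = 0; k < dim; k++)
                t[k] = poscl[k] - datacl[k * N + i];   // coalesced read
            for (k = 0; k < dim; k++)
                f_sum += pown(t[k], 2);
            for (k = 0; k < dim; k++)
                res[k * N + i] = (-t[k]) * exp(-f_sum / sigma2) / sigma2;  // coalesced write
        }
    }
    """

    # The result then comes back in dim-major order as well, so transpose it on the host:
    result = rescl.reshape((dim, N)).T

    If you want to see how much of the 176 ms is kernel time versus transfer overhead, you can also create the queue with properties=cl.command_queue_properties.PROFILING_ENABLE and read event.profile.end - event.profile.start (in nanoseconds) from the kernel's event.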