I have a program that calculates the potential/force exerted on a given data point (pos) by the rest of the data. It was originally written in Cython, and I tried PyOpenCL (with the device set to the Intel(R) Core(TM) i7-4750HQ CPU @ 2.00GHz on my 2013 MacBook Pro) hoping to speed it up, but the result is actually much slower than Cython. Moreover, the Cython version is double precision while the CL version is only single-precision float, yet the results are confirmed to be equal. For 2,000,000 x 2 data, PyOpenCL took 176 ms and Cython only used 82 ms. Is there a way to optimise and reduce the overhead? Many thanks.
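For reference, here is a minimal NumPy sketch (my own, not part of the notebook) of the per-point quantity the OpenCL kernel computes; the Cython version then sums these rows into a single force vector. It is handy for checking the outputs:

import numpy as np

def force_reference(pos, X, sigma):
    # res[j, k] = (X[j, k] - pos[k]) * exp(-|pos - X[j]|^2 / sigma^2) / sigma^2
    d = X - pos                                    # (N, dim) displacements from pos
    w = np.exp(-np.sum(d * d, axis=1) / sigma**2)  # (N,) Gaussian weights
    return d * w[:, None] / sigma**2               # (N, dim) per-point forces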
The IPython notebook is as follows:
from __future__ import division
import numpy as np
import pyopencl as cl
import pyopencl.array
import math
import time
%load_ext pyopencl.ipython_ext
%load_ext Cython
%pylab inline
# prepare data
datad = np.random.rand(2000000,2)-[0.5, 0.5] # Double
data = datad.astype(np.float32)
N, dim = data.shape[0], data.shape[1]
sigma = 0.04
i = 2
pos = np.array(data[i,:]) # float
posd = np.array(datad[i,:]) #double
dt = 0.005
resistant = 0.9995
kernelsource = """
__kernel void forceFinder(
    const int N,
    const int dim,
    const float sigma,
    __global float* datacl,
    __global float* poscl,
    __global float* res)
{
    int i = get_global_id(0); // Global id
    float f_sum;
    int k;
    float sigma2 = sigma * sigma;
    if (i < N) {
        f_sum = 0.0;
        for (k = 0; k < dim; k++)
        {
            f_sum += (poscl[k] - datacl[i * dim + k]) * (poscl[k] - datacl[i * dim + k]);
        }
        for (k = 0; k < dim; k++)
        {
            res[i * dim + k] = (datacl[i * dim + k] - poscl[k]) * exp(-f_sum / sigma2) / sigma2;
        }
    }
}
"""
# Setup PyOpenCl
platform = cl.get_platforms()[0]
device = platform.get_devices()[0] # Get the GPU ID
ctx = cl.Context([device]) # Tell CL to use GPU
queue = cl.CommandQueue(ctx) # Create a command queue for the target device.
program = cl.Program(ctx, kernelsource).build()
size = N * dim
datacl = data.reshape((size,))
rescl = np.empty(size).astype(np.float32)
rescl.fill(0.0)
datacl_buf = cl.Buffer(ctx, cl.mem_flags.READ_ONLY | cl.mem_flags.COPY_HOST_PTR, hostbuf = datacl)
pos_buf = cl.Buffer(ctx, cl.mem_flags.READ_ONLY | cl.mem_flags.COPY_HOST_PTR, hostbuf = pos)
rescl_buf = cl.Buffer(ctx, cl.mem_flags.WRITE_ONLY | cl.mem_flags.COPY_HOST_PTR, hostbuf = rescl)
forceFinder = program.forceFinder
forceFinder.set_scalar_arg_dtypes([np.uint32, np.uint32, np.float32, None, None, None])
globalrange = (N, dim)
localrange = None
# Run CL
t0 = time.time()
forceFinder(queue, globalrange, localrange, N, dim, sigma, datacl_buf, \
pos_buf, rescl_buf)
queue.finish()
cl.enqueue_copy(queue, rescl, rescl_buf)
result = rescl.reshape((N,dim))
t1 = time.time()
print (t1-t0)
# Now Cython
%%cython
cimport numpy as np
import numpy as np
from libc.stdlib cimport rand, malloc, free
from libc.math cimport exp
def PTSM(np.ndarray[np.float64_t, ndim = 1] position, np.ndarray[np.float64_t, ndim = 2] X,
         double sigma=0.25,
         double dt=0.01, double r=0.99, int Nsamp=1000):
    cdef int N, dim
    N, dim = X.shape[0], X.shape[1]
    cdef int i, j, steps # These 3 are for iterations.
    cdef double sigma2
    cdef double force1p_sum = 0.0 # initialised: it is read before being reset inside the j loop
    cdef double *force = <double *>malloc(dim * sizeof(double))
    sigma2 = sigma * sigma
    #--------------------
    # Force
    # for steps in range(Nsamp):
    for i in range(dim):
        force[i] = 0
    for j in range(N):
        for i in range(dim):
            force1p_sum += (position[i] - X[j,i]) * (position[i] - X[j,i])
        for i in range(dim):
            force[i] += (X[j,i] - position[i]) * exp(-force1p_sum / sigma2) / sigma2
        force1p_sum = 0
    resultForce = np.zeros(dim)
    for i in range(dim):
        resultForce[i] = force[i]
    free(force)
    return resultForce
t0 = time.time()
f = PTSM(posd, datad, sigma, dt, resistant)
t1 = time.time()
print (t1 - t0)
Your global range is globalrange = (N, dim), while internally the kernel only uses get_global_id(0) and loops over dim in a for loop. So you are effectively doing N*dim*dim operations instead of N*dim: an extra factor of dim of work that does not affect the output (the dim work items sharing the same get_global_id(0) do the same work and write the same values). It makes sense of the timings: 176ms vs 82ms, almost double. You have the same hardware and the same device utilization with both methods, so it seems logical.
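The host-side fix is a 1-D launch of N work items (equivalently (N, 1), which is what the modified code below uses), with the kernel unchanged:

globalrange = (N,) # one work item per data point; the kernel itself loops over dim
localrange = None
forceFinder(queue, globalrange, localrange, N, dim, sigma, datacl_buf, pos_buf, rescl_buf)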
Also, some optimizations:

I wouldn't call queue.finish() before the copy, since that causes an implicit block of the CL device.
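Instead, enqueue the copy straight after the kernel on the same in-order queue and synchronize once at the end, as the modified code below does:

forceFinder(queue, globalrange, localrange, N, dim, sigma, datacl_buf, pos_buf, rescl_buf)
cl.enqueue_copy(queue, rescl, rescl_buf) # queued behind the kernel; no explicit block needed first
queue.finish() # single synchronization point once everything is enqueued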
This:

f_sum += (poscl[k] - datacl[i * dim + k]) * (poscl[k] - datacl[i * dim + k]);

can be written as:

f_sum += pown(poscl[k] - datacl[i * dim + k], 2);
Change the data shape to get coalesced access. At the moment each work item reads the i*k matrix in small strides, while a matrix stored in dim-major order gives coalesced access. Change the layout from i*k to k*i.
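A sketch of that layout change (not applied in the modified code below), assuming the kernel indexing is updated to match:

# Host side: store the data dim-major, so work items with consecutive i
# read consecutive memory locations for each fixed k.
datacl = np.ascontiguousarray(data.T).reshape((size,)) # element order [k * N + i]

# Kernel side, the accesses then become:
#   datacl[k * N + i]   instead of   datacl[i * dim + k]
#   res[k * N + i]      instead of   res[i * dim + k]
# and on the host afterwards: result = rescl.reshape((dim, N)).T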
poscl should be __constant and read-only, since every work item reads the same small vector.
You should calculate poscl - datacl first and store it in a private array, then reuse it in the following loops, avoiding extra global reads.
Modified code (not tested). NOTE: I didn't add the matrix-ordering change.
# prepare data
datad = np.random.rand(2000000,2)-[0.5, 0.5] # Double
data = datad.astype(np.float32)
N, dim = data.shape[0], data.shape[1]
sigma = 0.04
i = 2
pos = np.array(data[i,:]) # float
posd = np.array(datad[i,:]) #double
dt = 0.005
resistant = 0.9995
kernelsource = """
__kernel void forceFinder(
    const int N,
    const int dim,
    const float sigma,
    __global float* datacl,
    __constant float* poscl,
    __global float* res)
{
    int i = get_global_id(0); // Global id
    float f_sum;
    int k;
    float sigma2 = sigma * sigma;
    if (i < N) {
        f_sum = 0.0;
        float t[2]; // INCREASE TO THE MAX "DIM" POSSIBLE
        for (k = 0; k < dim; k++){
            t[k] = poscl[k] - datacl[i * dim + k]; // one global read per element, reused below
        }
        for (k = 0; k < dim; k++){
            f_sum += pown(t[k], 2);
        }
        for (k = 0; k < dim; k++){
            res[i * dim + k] = (-t[k]) * exp(-f_sum / sigma2) / sigma2;
        }
    }
}
"""
# Setup PyOpenCl
platform = cl.get_platforms()[0]
device = platform.get_devices()[0] # Get the GPU ID
ctx = cl.Context([device]) # Tell CL to use GPU
queue = cl.CommandQueue(ctx) # Create a command queue for the target device.
program = cl.Program(ctx, kernelsource).build()
size = N * dim
datacl = data.reshape((size,))
rescl = np.empty(size).astype(np.float32)
rescl.fill(0.0)
datacl_buf = cl.Buffer(ctx, cl.mem_flags.READ_ONLY | cl.mem_flags.COPY_HOST_PTR, hostbuf = datacl)
pos_buf = cl.Buffer(ctx, cl.mem_flags.READ_ONLY | cl.mem_flags.COPY_HOST_PTR, hostbuf = pos)
rescl_buf = cl.Buffer(ctx, cl.mem_flags.WRITE_ONLY, rescl.nbytes)
forceFinder = program.forceFinder
forceFinder.set_scalar_arg_dtypes([np.uint32, np.uint32, np.float32, None, None, None])
globalrange = (N, 1)
localrange = None
# Run CL
t0 = time.time()
forceFinder(queue, globalrange, localrange, N, dim, sigma, datacl_buf, \
pos_buf, rescl_buf)
cl.enqueue_copy(queue, rescl, rescl_buf)
queue.finish()
result = rescl.reshape((N,dim))
t1 = time.time()
print (t1-t0)