I am very new to OpenCL and am trying my first program. I implemented a simple sinc filtering of waveforms. The code works; however, I have two questions:
1. The size I can run depends on the device: the Nvidia GPU crashes at (1,600), while IntelHD at (1,50) lets me go up to some 6400 rows. However, for larger sizes it crashes on the GPU, and the IntelHD just freezes and does nothing (0% on the resource monitor). A quick way to check the relevant device limits is sketched just below.

2. I have an Intel HD4600 and an Nvidia K1100M GPU available; however, the Intel is ~2 times faster. I understand this is partly because I don't need to copy my arrays to separate device memory on the integrated Intel chip, the way I do for the discrete GPU, but I expected only a marginal difference. Is this normal, or should my code be better optimized for the GPU? (resolved)
Thanks for your help !!
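A minimal sketch of how the relevant device limits could be checked (this is not part of my script; it only uses standard PyOpenCL device-info attributes). If (1,600) and (1,50) refer to the local work size, a work-group that large is only valid on a device whose maximum work-group size and second work-item dimension are at least that big, so these limits may simply differ between the two devices:

import pyopencl as cl
# print the limits that constrain the choice of local size on each device
for platform in cl.get_platforms():
    for device in platform.get_devices():
        print(platform.name, "/", device.name)
        print("  max work-group size:", device.max_work_group_size)
        print("  max work-item sizes:", device.max_work_item_sizes)
        print("  global memory (MB): ", device.global_mem_size // 1024**2)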
from __future__ import absolute_import, print_function
import numpy as np
import pyopencl as cl
import os
os.environ['PYOPENCL_COMPILER_OUTPUT'] = '1'
import matplotlib.pyplot as plt
def resample_opencl(y, key='GPU'):
    #
    # selecting to run on GPU or CPU
    #
    newlen = 1200
    my_platform = cl.get_platforms()[0]
    device = my_platform.get_devices()[0]
    for found_platform in cl.get_platforms():
        if (key == 'GPU') and (found_platform.name == 'NVIDIA CUDA'):
            my_platform = found_platform
            device = my_platform.get_devices()[0]
            print("using GPU")
    #
    # Create context for GPU/CPU
    #
    ctx = cl.Context([device])
    #
    # Create queue for each kernel execution
    #
    queue = cl.CommandQueue(ctx, properties=cl.command_queue_properties.PROFILING_ENABLE)
    # queue = cl.CommandQueue(ctx)
    prg = cl.Program(ctx, """
    __kernel void resample(
        int M,
        __global const float *y_g,
        __global float *res_g)
    {
        int row = get_global_id(0);
        int col = get_global_id(1);
        int gs = get_global_size(1);
        __private float tmp, tmp2, x;
        __private float t;
        // resampled time point for this output column
        t = (float)(col) / 2 + 1;
        tmp = 0;
        tmp2 = 0;
        // sinc sum over the M input samples of this row:
        // res[row][col] = sum_i y[row][i] * sin(pi*(t - x_i)) / (pi*(t - x_i))
        for (int i = 0; i < M; i++)
        {
            x = (float)(i + 1);
            tmp2 = (t - x) * 3.14159;
            if (t == x) {
                tmp += y_g[row*M + i];
            }
            else
                tmp += y_g[row*M + i] * sin(tmp2) / tmp2;
        }
        res_g[row*gs + col] = tmp;
    }
    """).build()
    mf = cl.mem_flags
    # copy the input waveforms to the device and allocate the output buffer
    y_g = cl.Buffer(ctx, mf.READ_ONLY | mf.COPY_HOST_PTR, hostbuf=y)
    res = np.zeros((np.shape(y)[0], newlen)).astype(np.float32)
    res_g = cl.Buffer(ctx, mf.WRITE_ONLY, res.nbytes)
    # number of input samples per row, passed to the kernel as a scalar
    M = np.int32(600)
    # one work-item per output sample: global size res.shape, local size (1, 200)
    prg.resample(queue, res.shape, (1, 200), M, y_g, res_g)
    # read the result back; this copy event is the one profiled in __main__
    event = cl.enqueue_copy(queue, res, res_g)
    print("success")
    event.wait()
    return res, event
if __name__ == "__main__":
    #
    # this is the number I need to increase (up to some 100 000)
    #
    numrows = 2000
    Gaussian = lambda t: 10 * np.exp(-(t - 50)**2 / (2. * 2**2))
    x = np.linspace(1, 101, 600, endpoint=False).astype(np.float32)
    t = np.linspace(1, 101, 1200, endpoint=False).astype(np.float32)
    y = np.zeros((numrows, np.size(x)))
    y[:] = Gaussian(x).astype(np.float32)
    y = y.astype(np.float32)
    res, event = resample_opencl(y, 'GPU')
    print("OpenCL GPU profiler", (event.profile.end - event.profile.start) * 1e-9)
    #
    # test plot if it worked
    #
    plt.figure()
    plt.plot(x, y[1, :], '+')
    plt.plot(t, res[1, :])
    plt.show()  # show the test plot when running as a script
UPDATE: I think I figured out the issue:

IntelHD: turning off profiling fixes everything; I can run the code without any issues.

The K1100M GPU still crashes, but I suspect this might be a display-driver timeout issue, since I am using the same video card to drive my display.
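If the K1100M failure really is the display watchdog killing a long-running kernel, one possible workaround (a sketch only, not something I have verified on this card; chunk_rows is just a made-up tuning value) is to enqueue the rows in smaller chunks so that no single kernel call runs long enough to trigger the timeout. It reuses queue, prg, y_g, res_g, M and res from the code above and relies on get_global_id(0) including the global offset:

chunk_rows = 1000
total_rows, width = res.shape
for row0 in range(0, total_rows, chunk_rows):
    nrows = min(chunk_rows, total_rows - row0)
    # each chunk covers rows row0 .. row0+nrows-1; the kernel's row index
    # already includes the offset, so results land in the right place
    prg.resample(queue, (nrows, width), (1, 200), M, y_g, res_g,
                 global_offset=(row0, 0))
    queue.finish()  # finish this chunk before queueing the next one
cl.enqueue_copy(queue, res, res_g).wait()

Together with creating the queue without PROFILING_ENABLE (the commented-out queue = cl.CommandQueue(ctx) line), each enqueued piece of work stays short.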