I cannot understand the reason why the copy speed varies so much in these examples. I get Almost indetical images from them. Calculation time is also faster on 'faster' variant
Didn't help: move all the variables of the 'slow' variant inside the kernel, various memory flags almost do not change the result.
It turns out the problem is in the kernel, but what exactly is the problem?
WARNING! I pasted the whole file
import pyopencl as cl
import numpy as np
from PIL import Image
import time
Faster variant. It takes ~0.15s to copy from buffer
width = 800
height = 800
X = 0
Y = 0
R = 2
maxiter = 80000
xmin = X - R
xmax = X + R
ymin = Y - R
ymax = Y + R
ctx = cl.create_some_context()
queue = cl.CommandQueue(ctx)
r1 = np.linspace(xmin, xmax, width, dtype=np.float64)
r2 = np.linspace(ymin, ymax, height, dtype=np.float64)
q = r1 + r2[:, None] * 1j
q = np.ravel(q)
output = np.empty(width*height, dtype=np.uint8)
mf = cl.mem_flags
q_opencl = cl.Buffer(ctx, mf.READ_ONLY | mf.COPY_HOST_PTR, hostbuf=q)
output_opencl = cl.Buffer(ctx, mf.WRITE_ONLY, output.nbytes)
prg = cl.Program(ctx, """
__kernel void mandelbrot(__global double2 *q,
__global uchar *output, ushort const maxiter)
{
int gid = get_global_id(0);
double nreal, real = 0;
double imag = 0;
output[gid] = 0.0;
int curiter = 0;
for(curiter = 0; curiter < maxiter; curiter++) {
nreal = real*real - imag*imag + q[gid].x;
imag = 2* real*imag + q[gid].y;
real = nreal;
if (real*real + imag*imag > 4.0f){
break;
}
}
if (curiter < maxiter) {
output[gid] = curiter*64;
}
}
""").build()
prg.mandelbrot(queue, output.shape, None, q_opencl, output_opencl, np.uint16(maxiter))
t0 = time.time()
cl.enqueue_copy(queue, output, output_opencl).wait()
print(time.time()-t0, 'copy')
output = output.reshape((width, height))
Slower variant. It takes ~0.78s to copy from buffer
size = (800, 800)
X = 0
Y = 0
R = 2
maxiter = 80000
ctx = cl.create_some_context()
queue = cl.CommandQueue(ctx)
output = np.empty(size[0]*size[1], dtype=np.uint8)
mf = cl.mem_flags
output_cl = cl.Buffer(ctx, mf.WRITE_ONLY, output.nbytes)
prg = cl.Program(ctx, """
__kernel void mandelbrot(
__global uchar *out,
int width,
int height,
double real,
double imag,
double const radius,
int const maxiter) {
int id = get_global_id(0);
int i = id % width;
int j = id / width;
double window_radius = (width < height) ? width : height;
double x0 = real + radius * (2 * i - (float)width) / window_radius;
double y0 = imag - radius * (2 * j - (float)height) / window_radius;
double x = 0;
double y = 0;
int n = 0;
double x_temp = 0;
for(n = 0; n < maxiter; n++)
{
x_temp = x*x - y*y + x0;
y = 2 * x*y + y0;
x = x_temp;
if (x*x + y*y > 4.0f){
break;
}
}
if (n < maxiter) {
out[id] = n*64;
}
else {
out[id] = 0;
}
}
""").build()
prg.mandelbrot(queue, output.shape, None,
output_cl,
np.int32(size[0]),
np.int32(size[1]),
np.float64(X),
np.float64(Y),
np.float64(R),
np.int32(maxiter),
)
t0 = time.time()
cl.enqueue_copy(queue, output, output_cl).wait()
print(time.time() - t0, 'copy')
output = output.reshape((size[1], size[0]))
I cannot understand the reason why the copy speed varies so much in these examples.
The reason is: you're not measuring the time of the copy command.
You say "prg.mandelbrot() executes the kernel and do all the calculations" - that's not what it does. It enqueues the kernel. Then you enqueue the copy command, and then you call wait()
. Some implementations start executing immediately after enqueue, but some do not start until you call clFinish/clFlush/clWaitForEvents
(the last is what PyOpenCL's Event.wait()
does - in your code, cl.enqueue_copy()
returns an event).
The problem is that you're doing the beginner mistake of trying to measure OpenCL (GPU) time using host CPU time. It never works. You have to measure on-GPU time via OpenCL event profiling. Here's how to do it.