I am working on optimizing my Mandelbrot renderer in pyOpenCL and want to split the iterations in chunks so i can better utilize my GPU.
Example with max iterations=1000 and 2 "chunks":
1. Run the mandelbrot escape algorithm for iterations 0-500.
2. Save the needed iterations for every point where iterations < 500 and run again for all other points with iterations from 500 - 1000.
The first loop works like expected but every chunk after that leads to wrong results. I'd really like to be more specific but i don't know where the real problem lies (starring at the code for > 2 days now).
I suspect something going wrong when copying the old x,y (real, imaginary) parts from the kernel, but i am out of ideas how to debug it.
I get the same results running on my GPU and CPU so i'd guess its nothing GPU specific.
Example image with iterations=2000 and 10 chunks:
this is almost only the first chunk (plus some few "wrong" pixel).
All done in one chunk (iterations=200 and 1 chunk):
And the expected result with iterations=2000 and chunks = 1:
import pyopencl as cl
import numpy as np
from PIL import Image
from decimal import Decimal
def mandel(ctx, x, y, zoom, max_iter=1000, iter_steps=1, width=500, height=500, use_double=False):
mf = cl.mem_flags
cl_queue = cl.CommandQueue(ctx)
# build program
code = """
#if real_t == double
#pragma OPENCL EXTENSION cl_khr_fp64 : enable
#endif
kernel void mandel(
__global real_t *coords,
__global uint *output,
__global real_t *output_coord,
const uint max_iter,
const uint start_iter
){
uint id = get_global_id(0);
real_t2 my_coords = vload2(id, coords);
real_t x = my_coords.x;
real_t y = my_coords.y;
uint iter = 0;
for(iter=start_iter; iter<max_iter; ++iter){
if(x*x + y*y > 4.0f){
break;
}
real_t xtemp = x*x - y*y + my_coords.x;
y = 2*x*y + my_coords.y;
x = xtemp;
}
// copy the current x,y pair back
real_t2 val = (real_t2){x, y};
vstore2(val, id, output_coord);
output[id] = iter;
}
"""
_cltype, _nptype = ("double",np.float64) if use_double else ("float", np.float32)
prg = cl.Program(ctx, code).build("-cl-opt-disable -D real_t=%s -D real_t2=%s2" % (_cltype, _cltype))
# Calculate the "viewport".
x0 = x - ((Decimal(3) * zoom)/Decimal(2.))
y0 = y - ((Decimal(2) * zoom)/Decimal(2.))
x1 = x + ((Decimal(3) * zoom)/Decimal(2.))
y1 = y + ((Decimal(2) * zoom)/Decimal(2.))
# Create index map in x,y pairs
xx = np.arange(0, width, 1, dtype=np.uint32)
yy = np.arange(0, height, 1, dtype=np.uint32)
index_map = np.dstack(np.meshgrid(xx, yy))
# and local "coordinates" (real, imaginary parts)
coord_map = np.ndarray(index_map.shape, dtype=_nptype)
coord_map[:] = index_map
coord_map[:] *= (_nptype((x1-x0)/Decimal(width)), _nptype((y1-y0)/Decimal(height)))
coord_map[:] += (_nptype(x0), _nptype(y0))
coord_map = coord_map.flatten()
index_map = index_map.flatten().astype(dtype=np.uint32)
# Create input and output buffer
buffer_in_cl = cl.Buffer(ctx, mf.READ_ONLY, size=coord_map.nbytes)
buffer_out = np.zeros(width*height, dtype=np.uint32) # This will contain the iteration values of that run
buffer_out_cl = cl.Buffer(ctx, mf.WRITE_ONLY, size=buffer_out.nbytes)
buffer_out_coords = np.zeros(width*height*2, dtype=_nptype) # This the last x,y values
buffer_out_coords_cl = cl.Buffer(ctx, mf.WRITE_ONLY, size=buffer_out_coords.nbytes)
# 2D Buffer to collect the iterations needed per pixel
#iter_map = np.zeros(width*height, dtype=np.uint32).reshape((width, height)) #.reshape((height, width))
iter_map = np.zeros(width*height, dtype=np.uint32).reshape((height, width))
start_max_iter = 0
to_do = coord_map.size / 2
steps_size = int(max_iter / float(iter_steps))
while to_do > 0 and start_max_iter < max_iter:
end_max_iter = min(max_iter, start_max_iter + steps_size )
print "Iterations from iteration %i to %i for %i numbers" % (start_max_iter, end_max_iter, to_do)
# copy x/y pairs to device
cl.enqueue_copy(cl_queue, buffer_in_cl, coord_map[:to_do*2]).wait()
# and finally call the ocl function
prg.mandel(cl_queue, (to_do,), None,
buffer_in_cl,
buffer_out_cl,
buffer_out_coords_cl,
np.uint32(end_max_iter),
np.uint32(start_max_iter)
).wait()
# Copy the output back
cl.enqueue_copy(cl_queue, buffer_out_coords, buffer_out_coords_cl).wait()
cl.enqueue_copy(cl_queue, buffer_out, buffer_out_cl).wait()
# Get indices of "found" escapes
done = np.where(buffer_out[:to_do]<end_max_iter)[0]
# and write the iterations to the coresponding cell
index_reshaped = index_map[:to_do*2].reshape((to_do, 2))
tmp = index_reshaped[done]
iter_map[tmp[:,1], tmp[:,0]] = buffer_out[done]
#iter_map[tmp[:,0], tmp[:,1]] = buffer_out[done]
# Get the indices of non escapes
undone = np.where(buffer_out[:to_do]==end_max_iter)[0]
# and write them back to our "job" maps for the next loop
tmp = buffer_out_coords[:to_do*2].reshape((to_do, 2))
coord_map[:undone.size*2] = tmp[undone].flatten()
index_map[:undone.size*2] = index_reshaped[undone].flatten()
to_do = undone.size
start_max_iter = end_max_iter
print "%i done. %i unknown" % (done.size, undone.size)
# simple coloring by modulo 255 on the iter_map
return (iter_map % 255).astype(np.uint8).reshape((height, width))
if __name__ == '__main__':
ctx = cl.create_some_context(interactive=True)
img = mandel(ctx,
x=Decimal("-0.7546546453361122021732941811"),
y=Decimal("0.05020518634419688663435986387"),
zoom=Decimal("0.0002046859427855630601247281079"),
max_iter=2000,
iter_steps=1,
width=500,
height=400,
use_double=False
)
Image.fromarray(img).show()
edit: Here is another version where the real/imaginary parts never leave the GPU memory.
The results are the same.
I am completely out of ideas.
You are using the updated "co-ords" from buffer_out_coords
instead of the original coords as the c value when doing the Z squared plus c
calculation. The buffer_out_coords contains current Z
values rather than the original c
co-ord, so these are the values you want to begin from but you also want the original co-ords.
Theres only 4 changes you need:
Im not getting the same output as you were with the code you presented so Im not sure if there is something else wrong here or not but this change gave me consistent output:
1 chunk = 153052 unknown
PYOPENCL_COMPILER_OUTPUT=1 PYOPENCL_CTX='0' oclgrind python testmand.py
Iterations from iteration 0 to 500 for 200000 numbers
46948 done. 153052 unknown
5 chunks = 153052 unknown
PYOPENCL_COMPILER_OUTPUT=1 PYOPENCL_CTX='0' oclgrind python testmand.py
Iterations from iteration 0 to 100 for 200000 numbers
0 done. 200000 unknown
Iterations from iteration 100 to 200 for 200000 numbers
11181 done. 188819 unknown
Iterations from iteration 200 to 300 for 188819 numbers
9627 done. 179192 unknown
Iterations from iteration 300 to 400 for 179192 numbers
16878 done. 162314 unknown
Iterations from iteration 400 to 500 for 162314 numbers
9262 done. 153052 unknown
Heres the code:
import pyopencl as cl
import numpy as np
from PIL import Image
from decimal import Decimal
def mandel(ctx, x, y, zoom, max_iter=1000, iter_steps=1, width=500, height=500, use_double=False):
mf = cl.mem_flags
cl_queue = cl.CommandQueue(ctx)
# build program
code = """
#if real_t == double
#pragma OPENCL EXTENSION cl_khr_fp64 : enable
#endif
kernel void mandel(
__global real_t *coords,
__global uint *output,
__global real_t *output_coord,
const uint max_iter,
const uint start_iter
){
uint id = get_global_id(0);
real_t2 my_coords = vload2(id, coords);
real_t2 my_value_coords = vload2(id, output_coord);
real_t x = my_value_coords.x;
real_t y = my_value_coords.y;
uint iter = 0;
for(iter=start_iter; iter<max_iter; ++iter){
if(x*x + y*y > 4.0f){
break;
}
real_t xtemp = x*x - y*y + my_coords.x;
y = 2*x*y + my_coords.y;
x = xtemp;
}
// copy the current x,y pair back
real_t2 val = (real_t2){x, y};
vstore2(val, id, output_coord);
output[id] = iter;
}
"""
_cltype, _nptype = ("double",np.float64) if use_double else ("float", np.float32)
prg = cl.Program(ctx, code).build("-cl-opt-disable -D real_t=%s -D real_t2=%s2" % (_cltype, _cltype))
# Calculate the "viewport".
x0 = x - ((Decimal(3) * zoom)/Decimal(2.))
y0 = y - ((Decimal(2) * zoom)/Decimal(2.))
x1 = x + ((Decimal(3) * zoom)/Decimal(2.))
y1 = y + ((Decimal(2) * zoom)/Decimal(2.))
# Create index map in x,y pairs
xx = np.arange(0, width, 1, dtype=np.uint32)
yy = np.arange(0, height, 1, dtype=np.uint32)
index_map = np.dstack(np.meshgrid(xx, yy))
# and local "coordinates" (real, imaginary parts)
coord_map = np.ndarray(index_map.shape, dtype=_nptype)
coord_map[:] = index_map
coord_map[:] *= (_nptype((x1-x0)/Decimal(width)), _nptype((y1-y0)/Decimal(height)))
coord_map[:] += (_nptype(x0), _nptype(y0))
coord_map = coord_map.flatten()
index_map = index_map.flatten().astype(dtype=np.uint32)
# Create input and output buffer
buffer_in_cl = cl.Buffer(ctx, mf.READ_ONLY, size=coord_map.nbytes)
buffer_out = np.zeros(width*height, dtype=np.uint32) # This will contain the iteration values of that run
buffer_out_cl = cl.Buffer(ctx, mf.WRITE_ONLY, size=buffer_out.nbytes)
buffer_out_coords = np.zeros(width*height*2, dtype=_nptype) # This the last x,y values
buffer_out_coords_cl = cl.Buffer(ctx, mf.READ_WRITE, size=buffer_out_coords.nbytes)
# 2D Buffer to collect the iterations needed per pixel
#iter_map = np.zeros(width*height, dtype=np.uint32).reshape((width, height)) #.reshape((height, width))
iter_map = np.zeros(width*height, dtype=np.uint32).reshape((height, width))
start_max_iter = 0
to_do = coord_map.size / 2
steps_size = int(max_iter / float(iter_steps))
while to_do > 0 and start_max_iter < max_iter:
end_max_iter = min(max_iter, start_max_iter + steps_size )
print "Iterations from iteration %i to %i for %i numbers" % (start_max_iter, end_max_iter, to_do)
# copy x/y pairs to device
cl.enqueue_copy(cl_queue, buffer_in_cl, coord_map[:to_do*2]).wait()
cl.enqueue_copy(cl_queue, buffer_out_coords_cl, buffer_out_coords[:to_do*2]).wait()
# and finally call the ocl function
prg.mandel(cl_queue, (to_do,), None,
buffer_in_cl,
buffer_out_cl,
buffer_out_coords_cl,
np.uint32(end_max_iter),
np.uint32(start_max_iter)
).wait()
# Copy the output back
cl.enqueue_copy(cl_queue, buffer_out_coords, buffer_out_coords_cl).wait()
cl.enqueue_copy(cl_queue, buffer_out, buffer_out_cl).wait()
# Get indices of "found" escapes
done = np.where(buffer_out[:to_do]<end_max_iter)[0]
# and write the iterations to the coresponding cell
index_reshaped = index_map[:to_do*2].reshape((to_do, 2))
tmp = index_reshaped[done]
iter_map[tmp[:,1], tmp[:,0]] = buffer_out[done]
#iter_map[tmp[:,0], tmp[:,1]] = buffer_out[done]
# Get the indices of non escapes
undone = np.where(buffer_out[:to_do]==end_max_iter)[0]
# and write them back to our "job" maps for the next loop
tmp = buffer_out_coords[:to_do*2].reshape((to_do, 2))
buffer_out_coords[:undone.size*2] = tmp[undone].flatten()
tmp = coord_map[:to_do*2].reshape((to_do, 2))
coord_map[:undone.size*2] = tmp[undone].flatten()
index_map[:undone.size*2] = index_reshaped[undone].flatten()
to_do = undone.size
start_max_iter = end_max_iter
print "%i done. %i unknown" % (done.size, undone.size)
# simple coloring by modulo 255 on the iter_map
return (iter_map % 255).astype(np.uint8).reshape((height, width))
if __name__ == '__main__':
ctx = cl.create_some_context(interactive=True)
img = mandel(ctx,
x=Decimal("-0.7546546453361122021732941811"),
y=Decimal("0.05020518634419688663435986387"),
zoom=Decimal("0.0002046859427855630601247281079"),
max_iter=2000,
iter_steps=1,
width=500,
height=400,
use_double=False
)
Image.fromarray(img).show()