Search code examples
pythonnumpyopenclmandelbrotpyopencl

Failing to render mandelbrot in pyopencl


I am working on optimizing my Mandelbrot renderer in pyOpenCL and want to split the iterations in chunks so i can better utilize my GPU.
Example with max iterations=1000 and 2 "chunks":
1. Run the mandelbrot escape algorithm for iterations 0-500.
2. Save the needed iterations for every point where iterations < 500 and run again for all other points with iterations from 500 - 1000.

The first loop works like expected but every chunk after that leads to wrong results. I'd really like to be more specific but i don't know where the real problem lies (starring at the code for > 2 days now).
I suspect something going wrong when copying the old x,y (real, imaginary) parts from the kernel, but i am out of ideas how to debug it.
I get the same results running on my GPU and CPU so i'd guess its nothing GPU specific.

Example image with iterations=2000 and 10 chunks:
enter image description here

this is almost only the first chunk (plus some few "wrong" pixel).
All done in one chunk (iterations=200 and 1 chunk):
enter image description here

And the expected result with iterations=2000 and chunks = 1:
enter image description here

import pyopencl as cl
import numpy as np
from PIL import Image
from decimal import Decimal

def mandel(ctx, x, y, zoom, max_iter=1000, iter_steps=1, width=500, height=500, use_double=False):
    mf = cl.mem_flags
    cl_queue = cl.CommandQueue(ctx)
    # build program
    code = """
    #if real_t == double
        #pragma OPENCL EXTENSION cl_khr_fp64 : enable
    #endif
    kernel void mandel(
        __global real_t *coords,
        __global uint *output,
        __global real_t *output_coord,
        const uint max_iter,
        const uint start_iter    
    ){
        uint id = get_global_id(0);         
        real_t2 my_coords = vload2(id, coords);           
        real_t x = my_coords.x;
        real_t y = my_coords.y;
        uint iter = 0;
        for(iter=start_iter; iter<max_iter; ++iter){
            if(x*x + y*y > 4.0f){
                break;
            }
            real_t xtemp = x*x - y*y + my_coords.x;
            y = 2*x*y + my_coords.y;
            x = xtemp;
        }
        // copy the current x,y pair back
        real_t2 val = (real_t2){x, y};
        vstore2(val, id, output_coord);
        output[id] = iter;
    }        
    """
    _cltype, _nptype = ("double",np.float64) if use_double else ("float", np.float32)
    prg = cl.Program(ctx, code).build("-cl-opt-disable -D real_t=%s -D real_t2=%s2" % (_cltype, _cltype))

    # Calculate the "viewport".
    x0 = x - ((Decimal(3) * zoom)/Decimal(2.))
    y0 = y - ((Decimal(2) * zoom)/Decimal(2.))
    x1 = x + ((Decimal(3) * zoom)/Decimal(2.))
    y1 = y + ((Decimal(2) * zoom)/Decimal(2.))

    # Create index map in x,y pairs
    xx = np.arange(0, width, 1, dtype=np.uint32)
    yy = np.arange(0, height, 1, dtype=np.uint32)
    index_map = np.dstack(np.meshgrid(xx, yy))
    # and local "coordinates" (real, imaginary parts)
    coord_map = np.ndarray(index_map.shape, dtype=_nptype)
    coord_map[:] = index_map
    coord_map[:] *= (_nptype((x1-x0)/Decimal(width)), _nptype((y1-y0)/Decimal(height)))
    coord_map[:] += (_nptype(x0), _nptype(y0))
    coord_map = coord_map.flatten()
    index_map = index_map.flatten().astype(dtype=np.uint32)
    # Create input and output buffer
    buffer_in_cl = cl.Buffer(ctx, mf.READ_ONLY, size=coord_map.nbytes)
    buffer_out = np.zeros(width*height, dtype=np.uint32) # This will contain the iteration values of that run
    buffer_out_cl = cl.Buffer(ctx, mf.WRITE_ONLY, size=buffer_out.nbytes)
    buffer_out_coords = np.zeros(width*height*2, dtype=_nptype) # This the last x,y values
    buffer_out_coords_cl = cl.Buffer(ctx, mf.WRITE_ONLY, size=buffer_out_coords.nbytes)
    # 2D Buffer to collect the iterations needed per pixel 
    #iter_map = np.zeros(width*height, dtype=np.uint32).reshape((width, height)) #.reshape((height, width))
    iter_map = np.zeros(width*height, dtype=np.uint32).reshape((height, width))

    start_max_iter = 0
    to_do = coord_map.size / 2
    steps_size = int(max_iter / float(iter_steps))
    while to_do > 0 and start_max_iter < max_iter:
        end_max_iter = min(max_iter, start_max_iter + steps_size )
        print "Iterations from iteration %i to %i for %i numbers" % (start_max_iter, end_max_iter, to_do)

        # copy x/y pairs to device 
        cl.enqueue_copy(cl_queue, buffer_in_cl, coord_map[:to_do*2]).wait()        
        # and finally call the ocl function
        prg.mandel(cl_queue, (to_do,), None,
            buffer_in_cl,                   
            buffer_out_cl,
            buffer_out_coords_cl,
            np.uint32(end_max_iter),
            np.uint32(start_max_iter)
        ).wait()
        # Copy the output back
        cl.enqueue_copy(cl_queue, buffer_out_coords, buffer_out_coords_cl).wait()
        cl.enqueue_copy(cl_queue, buffer_out, buffer_out_cl).wait()

        # Get indices of "found" escapes
        done = np.where(buffer_out[:to_do]<end_max_iter)[0]
        # and write the iterations to the coresponding cell
        index_reshaped = index_map[:to_do*2].reshape((to_do, 2))
        tmp = index_reshaped[done]
        iter_map[tmp[:,1], tmp[:,0]] = buffer_out[done]        
        #iter_map[tmp[:,0], tmp[:,1]] = buffer_out[done]        

        # Get the indices of non escapes
        undone = np.where(buffer_out[:to_do]==end_max_iter)[0]
        # and write them back to our "job" maps for the next loop
        tmp = buffer_out_coords[:to_do*2].reshape((to_do, 2))
        coord_map[:undone.size*2] = tmp[undone].flatten()
        index_map[:undone.size*2] = index_reshaped[undone].flatten()

        to_do = undone.size
        start_max_iter = end_max_iter
        print "%i done. %i unknown" % (done.size, undone.size)                            

    # simple coloring by modulo 255 on the iter_map
    return (iter_map % 255).astype(np.uint8).reshape((height, width))


if __name__ == '__main__':
    ctx = cl.create_some_context(interactive=True)
    img = mandel(ctx,
          x=Decimal("-0.7546546453361122021732941811"),
          y=Decimal("0.05020518634419688663435986387"),
          zoom=Decimal("0.0002046859427855630601247281079"),
          max_iter=2000,
          iter_steps=1,
          width=500,
          height=400,
          use_double=False
    )
    Image.fromarray(img).show()

edit: Here is another version where the real/imaginary parts never leave the GPU memory.
The results are the same.
I am completely out of ideas.


Solution

  • You are using the updated "co-ords" from buffer_out_coords instead of the original coords as the c value when doing the Z squared plus c calculation. The buffer_out_coords contains current Z values rather than the original c co-ord, so these are the values you want to begin from but you also want the original co-ords.

    Theres only 4 changes you need:

    • make buffer_out_coords_cl READ_WRITE
    • copy buffer_out_coords into buffer_out_coords_cl before each run
    • filter buffer_out_coords and coord_map by "undone"
    • in the opencl code, load the start x and y from output_coord rather than coords

    Im not getting the same output as you were with the code you presented so Im not sure if there is something else wrong here or not but this change gave me consistent output:

    1 chunk = 153052 unknown

    PYOPENCL_COMPILER_OUTPUT=1 PYOPENCL_CTX='0' oclgrind python testmand.py
    Iterations from iteration 0 to 500 for 200000 numbers
    46948 done. 153052 unknown
    

    5 chunks = 153052 unknown

    PYOPENCL_COMPILER_OUTPUT=1 PYOPENCL_CTX='0' oclgrind python testmand.py
    Iterations from iteration 0 to 100 for 200000 numbers
    0 done. 200000 unknown
    Iterations from iteration 100 to 200 for 200000 numbers
    11181 done. 188819 unknown
    Iterations from iteration 200 to 300 for 188819 numbers
    9627 done. 179192 unknown
    Iterations from iteration 300 to 400 for 179192 numbers
    16878 done. 162314 unknown
    Iterations from iteration 400 to 500 for 162314 numbers
    9262 done. 153052 unknown
    

    Heres the code:

    import pyopencl as cl
    import numpy as np
    from PIL import Image
    from decimal import Decimal
    
    def mandel(ctx, x, y, zoom, max_iter=1000, iter_steps=1, width=500, height=500, use_double=False):
        mf = cl.mem_flags
        cl_queue = cl.CommandQueue(ctx)
        # build program
        code = """
        #if real_t == double
            #pragma OPENCL EXTENSION cl_khr_fp64 : enable
        #endif
        kernel void mandel(
            __global real_t *coords,
            __global uint *output,
            __global real_t *output_coord,
            const uint max_iter,
            const uint start_iter    
        ){
            uint id = get_global_id(0);         
            real_t2 my_coords = vload2(id, coords);           
            real_t2 my_value_coords = vload2(id, output_coord);           
            real_t x = my_value_coords.x;
            real_t y = my_value_coords.y;
            uint iter = 0;
            for(iter=start_iter; iter<max_iter; ++iter){
                if(x*x + y*y > 4.0f){
                    break;
                }
                real_t xtemp = x*x - y*y + my_coords.x;
                y = 2*x*y + my_coords.y;
                x = xtemp;
            }
            // copy the current x,y pair back
            real_t2 val = (real_t2){x, y};
            vstore2(val, id, output_coord);
            output[id] = iter;
        }        
        """
        _cltype, _nptype = ("double",np.float64) if use_double else ("float", np.float32)
        prg = cl.Program(ctx, code).build("-cl-opt-disable -D real_t=%s -D real_t2=%s2" % (_cltype, _cltype))
    
        # Calculate the "viewport".
        x0 = x - ((Decimal(3) * zoom)/Decimal(2.))
        y0 = y - ((Decimal(2) * zoom)/Decimal(2.))
        x1 = x + ((Decimal(3) * zoom)/Decimal(2.))
        y1 = y + ((Decimal(2) * zoom)/Decimal(2.))
    
        # Create index map in x,y pairs
        xx = np.arange(0, width, 1, dtype=np.uint32)
        yy = np.arange(0, height, 1, dtype=np.uint32)
        index_map = np.dstack(np.meshgrid(xx, yy))
        # and local "coordinates" (real, imaginary parts)
        coord_map = np.ndarray(index_map.shape, dtype=_nptype)
        coord_map[:] = index_map
        coord_map[:] *= (_nptype((x1-x0)/Decimal(width)), _nptype((y1-y0)/Decimal(height)))
        coord_map[:] += (_nptype(x0), _nptype(y0))
        coord_map = coord_map.flatten()
        index_map = index_map.flatten().astype(dtype=np.uint32)
        # Create input and output buffer
        buffer_in_cl = cl.Buffer(ctx, mf.READ_ONLY, size=coord_map.nbytes)
        buffer_out = np.zeros(width*height, dtype=np.uint32) # This will contain the iteration values of that run
        buffer_out_cl = cl.Buffer(ctx, mf.WRITE_ONLY, size=buffer_out.nbytes)
        buffer_out_coords = np.zeros(width*height*2, dtype=_nptype) # This the last x,y values
        buffer_out_coords_cl = cl.Buffer(ctx, mf.READ_WRITE, size=buffer_out_coords.nbytes)
        # 2D Buffer to collect the iterations needed per pixel 
        #iter_map = np.zeros(width*height, dtype=np.uint32).reshape((width, height)) #.reshape((height, width))
        iter_map = np.zeros(width*height, dtype=np.uint32).reshape((height, width))
    
        start_max_iter = 0
        to_do = coord_map.size / 2
        steps_size = int(max_iter / float(iter_steps))
        while to_do > 0 and start_max_iter < max_iter:
            end_max_iter = min(max_iter, start_max_iter + steps_size )
            print "Iterations from iteration %i to %i for %i numbers" % (start_max_iter, end_max_iter, to_do)
    
            # copy x/y pairs to device 
            cl.enqueue_copy(cl_queue, buffer_in_cl, coord_map[:to_do*2]).wait()        
            cl.enqueue_copy(cl_queue, buffer_out_coords_cl, buffer_out_coords[:to_do*2]).wait()        
            # and finally call the ocl function
            prg.mandel(cl_queue, (to_do,), None,
                buffer_in_cl,                   
                buffer_out_cl,
                buffer_out_coords_cl,
                np.uint32(end_max_iter),
                np.uint32(start_max_iter)
            ).wait()
            # Copy the output back
            cl.enqueue_copy(cl_queue, buffer_out_coords, buffer_out_coords_cl).wait()
            cl.enqueue_copy(cl_queue, buffer_out, buffer_out_cl).wait()
    
            # Get indices of "found" escapes
            done = np.where(buffer_out[:to_do]<end_max_iter)[0]
            # and write the iterations to the coresponding cell
            index_reshaped = index_map[:to_do*2].reshape((to_do, 2))
            tmp = index_reshaped[done]
            iter_map[tmp[:,1], tmp[:,0]] = buffer_out[done]        
            #iter_map[tmp[:,0], tmp[:,1]] = buffer_out[done]        
    
            # Get the indices of non escapes
            undone = np.where(buffer_out[:to_do]==end_max_iter)[0]
            # and write them back to our "job" maps for the next loop
            tmp = buffer_out_coords[:to_do*2].reshape((to_do, 2))
            buffer_out_coords[:undone.size*2] = tmp[undone].flatten()
            tmp = coord_map[:to_do*2].reshape((to_do, 2))
            coord_map[:undone.size*2] = tmp[undone].flatten()
            index_map[:undone.size*2] = index_reshaped[undone].flatten()
    
            to_do = undone.size
            start_max_iter = end_max_iter
            print "%i done. %i unknown" % (done.size, undone.size)                            
    
        # simple coloring by modulo 255 on the iter_map
        return (iter_map % 255).astype(np.uint8).reshape((height, width))
    
    
    if __name__ == '__main__':
        ctx = cl.create_some_context(interactive=True)
        img = mandel(ctx,
              x=Decimal("-0.7546546453361122021732941811"),
              y=Decimal("0.05020518634419688663435986387"),
              zoom=Decimal("0.0002046859427855630601247281079"),
              max_iter=2000,
              iter_steps=1,
              width=500,
              height=400,
              use_double=False
        )
        Image.fromarray(img).show()