I'm mainly a very high level programmer so thinking about things like CPU locality is very new to me.
I'm working on a basic bilinear demosaic (for RGGB sensor data) and I've got the algorithm right (judging by the results) but it's not performing as well as I'd hoped (~210Mpix/s).
Here's my code (the input is a 4640x3472 image with a single channel of RGGB):
def get_bilinear_debayer(input_raw, print_nest=False):
x, y, c = Var(), Var(), Var()
# Clamp and move to 32 bit for lots of space for averaging.
input = Func()
input[x,y] = cast(
UInt(32),
input_raw[
clamp(x,0,input_raw.width()-1),
clamp(y,0,input_raw.height()-1)]
)
# Interpolate vertically
vertical = Func()
vertical[x,y] = (input[x,y-1] + input[x,y+1])/2
# Interpolate horizontally
horizontal = Func()
horizontal[x,y] = (input[x-1,y] + input[x+1,y])/2
# Interpolate on diagonals
diagonal_average = Func()
diagonal_average[x, y] = (
input[x+1,y-1] +
input[x+1,y+1] +
input[x-1,y-1] +
input[x-1,y+1])/4
# Interpolate on adjacents
adjacent_average = Func()
adjacent_average[x, y] = (horizontal[x,y] + vertical[x,y])/2
red, green, blue = Func(), Func(), Func()
# Calculate the red channel
red[x, y, c] = select(
# Red photosite
c == 0, input[x, y],
# Green photosite
c == 1, select(x%2 == 0, vertical[x,y],
horizontal[x,y]),
# Blue photosite
diagonal_average[x,y]
)
# Calculate the blue channel
blue[x, y, c] = select(
# Blue photosite
c == 2, input[x, y],
# Green photosite
c == 1, select(x%2 == 1, vertical[x,y],
horizontal[x,y]),
# Red photosite
diagonal_average[x,y]
)
# Calculate the green channel
green[x, y, c] = select(
# Green photosite
c == 1, input[x,y],
# Red/Blue photosite
adjacent_average[x,y]
)
# Switch color interpolator based on requested color.
# Specify photosite as third argument, calculated as [x, y, z] = (0, 0, 0), (0, 1, 1), (1, 0, 1), (1, 1, 2)
# Happily works out to a sum of x mod 2 and y mod 2.
debayer = Func()
debayer[x, y, c] = select(c == 0, red[x, y, x%2 + y%2],
c == 1, green[x, y, x%2 + y%2],
blue[x, y, x%2 + y%2])
# Scheduling
x_outer, y_outer, x_inner, y_inner, tile_index = Var(), Var(), Var(), Var(), Var()
bits = input_raw.get().type().bits
output = Func()
# Cast back to the original colour space
output[x,y,c] = cast(UInt(bits), debayer[x,y,c])
# Reorder so that colours are calculated in order (red runs, then green, then blue)
output.reorder_storage(c, x, y)
# Tile in 128x128 squares
output.tile(x, y, x_outer, y_outer, x_inner, y_inner, 128, 128)
# Vectorize based on colour
output.bound(c, 0, 3)
output.vectorize(c)
# Fuse and parallelize
output.fuse(x_outer, y_outer, tile_index)
output.parallel(tile_index)
# Debugging
if print_nest:
output.print_loop_nest()
debayer.print_loop_nest()
red.print_loop_nest()
green.print_loop_nest()
blue.print_loop_nest()
return output
Honestly I have no idea what I'm doing here and I'm too new to this to have any clue where or what to look at.
Any advice on how to improve the scheduling is helpful. I'm still learning but feedback is hard to find.
The schedule I have is the best I've been able to do but it's pretty much entirely trial and error.
EDIT: I added an extra 30Mpix/s by doing the whole adjacent average summation in the function directly and by vectorizing on x_inner instead of colour.
EDIT: New schedule:
# Set input bounds.
output.bound(x, 0, (input_raw.width()/2)*2)
output.bound(y, 0, (input_raw.height()/2)*2)
output.bound(c, 0, 3)
# Reorder so that colours are calculated in order (red runs, then green, then blue)
output.reorder_storage(c, x, y)
output.reorder(c, x, y)
# Tile in 128x128 squares
output.tile(x, y, x_outer, y_outer, x_inner, y_inner, 128, 128)
output.unroll(x_inner, 2).unroll(y_inner,2)
# Vectorize based on colour
output.unroll(c)
output.vectorize(c)
# Fuse and parallelize
output.fuse(x_outer, y_outer, tile_index)
output.parallel(tile_index)
EDIT: Final schedule that's now beating (640MP/s) the Intel Performance Primitive benchmark that was run on a CPU twice as powerful as mine:
output = Func()
# Cast back to the original colour space
output[x,y,c] = cast(UInt(bits), debayer[x,y,c])
# Set input bounds.
output.bound(x, 0, (input_raw.width()/2)*2)
output.bound(y, 0, (input_raw.height()/2)*2)
output.bound(c, 0, 3)
# Tile in 128x128 squares
output.tile(x, y, x_outer, y_outer, x_inner, y_inner, 128, 128)
output.unroll(x_inner, 2).unroll(y_inner, 2)
# Vectorize based on colour
output.vectorize(x_inner, 16)
# Fuse and parallelize
output.fuse(x_outer, y_outer, tile_index)
output.parallel(tile_index)
target = Target()
target.arch = X86
target.os = OSX
target.bits = 64
target.set_feature(AVX)
target.set_feature(AVX2)
target.set_feature(SSE41)
output.compile_jit(target)
Make sure that you are using unroll(c) to make the per-channel select logic optimize away. Unrolling by 2 in x and y will also help:
output.unroll(x, 2).unroll(y,2)
The goal there is to optimize out the select logic between even/odd rows and columns. In order to take full advantage of that, you'll likely also need to tell Halide that the min and extent are a multiple of 2:
output.output_buffer().set_bounds(0,
(f.output_buffer().min(0) / 2) * 2,
(output.output_buffer().extent(0) / 2) * 2)
output.output_buffer().set_bounds(1,
(f.output_buffer().min(1) / 2) * 2,
(output.output_buffer().extent(1) / 2) * 2)
Though it may be worth stating even more stringent constraints, such as using 128 instead of 2 to assert multiples of the tile size or just hardwiring the min and extent to reflect the actual sensor parameters if you are only supporting a single camera.