Optimizing a reed-solomon encoder (polynomial division)

I am trying to optimize a Reed-Solomon encoder, which is in fact simply a polynomial division operation over Galois Fields 2^8 (which simply means that values wrap-around over 255). The code is in fact very very similar to what can be found here for Go: http://research.swtch.com/field

The algorithm for polynomial division used here is a synthetic division (also called Horner's method).

I tried everything: numpy, pypy, cython. The best performance I get is by using pypy with this simple nested loop:

def rsenc(msg_in, nsym, gen):
    '''Reed-Solomon encoding using polynomial division, better explained at http://research.swtch.com/field'''
    msg_out = bytearray(msg_in) + bytearray(len(gen)-1)
    lgen = bytearray([gf_log[gen[j]] for j in xrange(len(gen))])

    for i in xrange(len(msg_in)):
        coef = msg_out[i]
        # coef = gf_mul(msg_out[i], gf_inverse(gen[0]))  // for general polynomial division (when polynomials are non-monic), we need to compute: coef = msg_out[i] / gen[0]
        if coef != 0: # coef 0 is normally undefined so we manage it manually here (and it also serves as an optimization btw)
            lcoef = gf_log[coef] # precaching

            for j in xrange(1, len(gen)): # optimization: can skip g0 because the first coefficient of the generator is always 1! (that's why we start at position 1)
                msg_out[i + j] ^= gf_exp[lcoef + lgen[j]] # equivalent (in Galois Field 2^8) to msg_out[i+j] += msg_out[i] * gen[j]

    # Recopy the original message bytes
    msg_out[:len(msg_in)] = msg_in
    return msg_out

Can a Python optimization wizard guide me to some clues on how to get a speedup? My goal is to get at least a speedup of 3x, but more would be awesome. Any approach or tool is accepted, as long as it is cross-platform (works at least on Linux and Windows).

Here is a small test script with some of the other alternatives I tried (the cython attempt is not included since it was slower than native python!):

import random
from operator import xor

numpy_enabled = False
try:
    import numpy as np
    numpy_enabled = True
except ImportError:
    pass

# Exponent table for 3, a generator for GF(256)
gf_exp = bytearray([1, 3, 5, 15, 17, 51, 85, 255, 26, 46, 114, 150, 161, 248, 19,
    53, 95, 225, 56, 72, 216, 115, 149, 164, 247, 2, 6, 10, 30, 34,
    102, 170, 229, 52, 92, 228, 55, 89, 235, 38, 106, 190, 217, 112,
    144, 171, 230, 49, 83, 245, 4, 12, 20, 60, 68, 204, 79, 209, 104,
    184, 211, 110, 178, 205, 76, 212, 103, 169, 224, 59, 77, 215, 98,
    166, 241, 8, 24, 40, 120, 136, 131, 158, 185, 208, 107, 189, 220,
    127, 129, 152, 179, 206, 73, 219, 118, 154, 181, 196, 87, 249, 16,
    48, 80, 240, 11, 29, 39, 105, 187, 214, 97, 163, 254, 25, 43, 125,
    135, 146, 173, 236, 47, 113, 147, 174, 233, 32, 96, 160, 251, 22,
    58, 78, 210, 109, 183, 194, 93, 231, 50, 86, 250, 21, 63, 65, 195,
    94, 226, 61, 71, 201, 64, 192, 91, 237, 44, 116, 156, 191, 218,
    117, 159, 186, 213, 100, 172, 239, 42, 126, 130, 157, 188, 223,
    122, 142, 137, 128, 155, 182, 193, 88, 232, 35, 101, 175, 234, 37,
    111, 177, 200, 67, 197, 84, 252, 31, 33, 99, 165, 244, 7, 9, 27,
    45, 119, 153, 176, 203, 70, 202, 69, 207, 74, 222, 121, 139, 134,
    145, 168, 227, 62, 66, 198, 81, 243, 14, 18, 54, 90, 238, 41, 123,
    141, 140, 143, 138, 133, 148, 167, 242, 13, 23, 57, 75, 221, 124,
    132, 151, 162, 253, 28, 36, 108, 180, 199, 82, 246] * 2 + [1])

# Logarithm table, base 3
gf_log = bytearray([0, 0, 25, 1, 50, 2, 26, 198, 75, 199, 27, 104, 51, 238, 223, # BEWARE: the first entry should be None instead of 0 because it's undefined, but for a bytearray we can't set such a value
        3, 100, 4, 224, 14, 52, 141, 129, 239, 76, 113, 8, 200, 248, 105,
        28, 193, 125, 194, 29, 181, 249, 185, 39, 106, 77, 228, 166, 114,
        154, 201, 9, 120, 101, 47, 138, 5, 33, 15, 225, 36, 18, 240, 130,
        69, 53, 147, 218, 142, 150, 143, 219, 189, 54, 208, 206, 148, 19,
        92, 210, 241, 64, 70, 131, 56, 102, 221, 253, 48, 191, 6, 139, 98,
        179, 37, 226, 152, 34, 136, 145, 16, 126, 110, 72, 195, 163, 182,
        30, 66, 58, 107, 40, 84, 250, 133, 61, 186, 43, 121, 10, 21, 155,
        159, 94, 202, 78, 212, 172, 229, 243, 115, 167, 87, 175, 88, 168,
        80, 244, 234, 214, 116, 79, 174, 233, 213, 231, 230, 173, 232, 44,
        215, 117, 122, 235, 22, 11, 245, 89, 203, 95, 176, 156, 169, 81,
        160, 127, 12, 246, 111, 23, 196, 73, 236, 216, 67, 31, 45, 164,
        118, 123, 183, 204, 187, 62, 90, 251, 96, 177, 134, 59, 82, 161,
        108, 170, 85, 41, 157, 151, 178, 135, 144, 97, 190, 220, 252, 188,
        149, 207, 205, 55, 63, 91, 209, 83, 57, 132, 60, 65, 162, 109, 71,
        20, 42, 158, 93, 86, 242, 211, 171, 68, 17, 146, 217, 35, 32, 46,
        137, 180, 124, 184, 38, 119, 153, 227, 165, 103, 74, 237, 222, 197,
        49, 254, 24, 13, 99, 140, 128, 192, 247, 112, 7])

if numpy_enabled:
    np_gf_exp = np.array(gf_exp)
    np_gf_log = np.array(gf_log)

def gf_pow(x, power):
    return gf_exp[(gf_log[x] * power) % 255]

def gf_poly_mul(p, q):
    r = [0] * (len(p) + len(q) - 1)
    lp = [gf_log[p[i]] for i in xrange(len(p))]
    for j in range(len(q)):
        lq = gf_log[q[j]]
        for i in range(len(p)):
            r[i + j] ^= gf_exp[lp[i] + lq]
    return r

def rs_generator_poly_base3(nsize, fcr=0):
    g_all = {}
    g = [1]
    g_all[0] = g_all[1] = g
    for i in range(fcr+1, fcr+nsize+1):
        g = gf_poly_mul(g, [1, gf_pow(3, i)])
        g_all[nsize-i] = g
    return g_all

# Fastest way with pypy
def rsenc(msg_in, nsym, gen):
    '''Reed-Solomon encoding using polynomial division, better explained at http://research.swtch.com/field'''
    msg_out = bytearray(msg_in) + bytearray(len(gen)-1)
    lgen = bytearray([gf_log[gen[j]] for j in xrange(len(gen))])

    for i in xrange(len(msg_in)):
        coef = msg_out[i]
        # coef = gf_mul(msg_out[i], gf_inverse(gen[0]))  # for general polynomial division (when polynomials are non-monic), the usual way of using synthetic division is to divide the divisor g(x) with its leading coefficient (call it a). In this implementation, this means:we need to compute: coef = msg_out[i] / gen[0]
        if coef != 0: # coef 0 is normally undefined so we manage it manually here (and it also serves as an optimization btw)
            lcoef = gf_log[coef] # precaching

            for j in xrange(1, len(gen)): # optimization: can skip g0 because the first coefficient of the generator is always 1! (that's why we start at position 1)
                msg_out[i + j] ^= gf_exp[lcoef + lgen[j]] # equivalent (in Galois Field 2^8) to msg_out[i+j] += msg_out[i] * gen[j]

    # Recopy the original message bytes
    msg_out[:len(msg_in)] = msg_in
    return msg_out

# Alternative 1: the loops were completely changed, instead of fixing msg_out[i] and updating all subsequent i+j items, we now fixate msg_out[i+j]  and compute it at once using all couples msg_out[i] * gen[j] - msg_out[i+1] * gen[j-1] - ... since when we fixate msg_out[i+j], all previous msg_out[k] with k < i+j are already fully computed.
def rsenc_alt1(msg_in, nsym, gen):
    msg_in = bytearray(msg_in)
    msg_out = bytearray(msg_in) + bytearray(len(gen)-1)
    lgen = bytearray([gf_log[gen[j]] for j in xrange(len(gen))])

    # Alternative 1
    jlist = range(1, len(gen))
    for k in xrange(1, len(msg_out)):
        for x in xrange(max(k-len(msg_in),0), len(gen)-1):
            if k-x-1 < 0: break
            msg_out[k] ^=  gf_exp[msg_out[k-x-1] + lgen[jlist[x]]]

    # Recopy the original message bytes
    msg_out[:len(msg_in)] = msg_in
    return msg_out

# Alternative 2: a rewrite of alternative 1 with generators and reduce
def rsenc_alt2(msg_in, nsym, gen):
    msg_in = bytearray(msg_in)
    msg_out = bytearray(msg_in) + bytearray(len(gen)-1)
    lgen = bytearray([gf_log[gen[j]] for j in xrange(len(gen))])

    # Alternative 1
    jlist = range(1, len(gen))
    for k in xrange(1, len(msg_out)):
        items_gen = ( gf_exp[msg_out[k-x-1] + lgen[jlist[x]]] if k-x-1 >= 0 else next(iter(())) for x in xrange(max(k-len(msg_in),0), len(gen)-1) )
        msg_out[k] ^= reduce(xor, items_gen)

    # Recopy the original message bytes
    msg_out[:len(msg_in)] = msg_in
    return msg_out

# Alternative with Numpy
def rsenc_numpy(msg_in, nsym, gen):
    msg_in = np.array(bytearray(msg_in))
    msg_out = np.pad(msg_in, (0, nsym), 'constant')
    lgen = np_gf_log[gen]
    for i in xrange(msg_in.size):
        msg_out[i+1:i+lgen.size] ^= np_gf_exp[np.add(lgen[1:], msg_out[i])]

    msg_out[:len(msg_in)] = msg_in
    return msg_out

gf_mul_arr = [bytearray(256) for _ in xrange(256)]
gf_add_arr = [bytearray(256) for _ in xrange(256)]

# Precompute multiplication and addition tables
def gf_precomp_tables(gf_exp=gf_exp, gf_log=gf_log):
    global gf_mul_arr, gf_add_arr
    for i in xrange(256):
        for j in xrange(256):
            gf_mul_arr[i][j] = gf_exp[gf_log[i] + gf_log[j]]
            gf_add_arr[i][j] = i ^ j
    return gf_mul_arr, gf_add_arr

# Alternative with precomputation of multiplication and addition tables, inspired by zfec: https://hackage.haskell.org/package/fec-0.1.1/src/zfec/fec.c
def rsenc_precomp(msg_in, nsym, gen=None):
    msg_in = bytearray(msg_in)
    msg_out = bytearray(msg_in) + bytearray(len(gen)-1)

    for i in xrange(len(msg_in)): # [i for i in xrange(len(msg_in)) if msg_in[i] != 0]
        coef = msg_out[i]
        if coef != 0:  # coef 0 is normally undefined so we manage it manually here (and it also serves as an optimization btw)
            mula = gf_mul_arr[coef]
            for j in xrange(1, len(gen)): # optimization: can skip g0 because the first coefficient of the generator is always 1! (that's why we start at position 1)
                #msg_out[i + j] = gf_add_arr[msg_out[i+j]][gf_mul_arr[coef][gen[j]]] # slower...
                #msg_out[i + j] ^= gf_mul_arr[coef][gen[j]] # faster
                msg_out[i + j] ^= mula[gen[j]] # fastest

    # Recopy the original message bytes
    msg_out[:len(msg_in)] = msg_in # equivalent to c = mprime - b, where mprime is msg_in padded with [0]*nsym
    return msg_out

def randstr(n, size):
    '''Generate very fastly a random hexadecimal string. Kudos to jcdryer http://stackoverflow.com/users/131084/jcdyer'''
    hexstr = '%0'+str(size)+'x'
    for _ in xrange(n):
        yield hexstr % random.randrange(16**size)

# Simple test case
if __name__ == "__main__":

    # Setup functions to test
    funcs = [rsenc, rsenc_precomp, rsenc_alt1, rsenc_alt2]
    if numpy_enabled: funcs.append(rsenc_numpy)
    gf_precomp_tables()

    # Setup RS vars
    n = 255
    k = 213

    import time

    # Init the generator polynomial
    g = rs_generator_poly_base3(n)

    # Init the ground truth
    mes = 'hello world'
    mesecc_correct = rsenc(mes, n-11, g[k])

    # Test the functions
    for func in funcs:
        # Sanity check
        if func(mes, n-11, g[k]) != mesecc_correct: print func.__name__, ": output is incorrect!"
        # Time the function
        total_time = 0
        for m in randstr(1000, n):
            start = time.clock()
            func(m, n-k, g[k])
            total_time += time.clock() - start
        print func.__name__, ": total time elapsed %f seconds." % total_time

And here is the result on my machine:

With PyPy:
rsenc : total time elapsed 0.108183 seconds.
rsenc_alt1 : output is incorrect!
rsenc_alt1 : total time elapsed 0.164084 seconds.
rsenc_alt2 : output is incorrect!
rsenc_alt2 : total time elapsed 0.557697 seconds.

Without PyPy:
rsenc : total time elapsed 3.518857 seconds.
rsenc_alt1 : output is incorrect!
rsenc_alt1 : total time elapsed 5.630897 seconds.
rsenc_alt2 : output is incorrect!
rsenc_alt2 : total time elapsed 6.100434 seconds.
rsenc_numpy : output is incorrect!
rsenc_numpy : total time elapsed 1.631373 seconds

(Note: the alternatives should be correct, some index must be a bit off, but since they are slower anyway I did not try to fix them)

/UPDATE and goal of the bounty: I found a very interesting optimization trick that promises to speed up computations a lot: to precompute the multiplication table. I updated the code above with the new function rsenc_precomp(). However, there's no gain at all in my implementation, it's even a bit slower:

rsenc : total time elapsed 0.107170 seconds.
rsenc_precomp : total time elapsed 0.108788 seconds.

How can it be that arrays lookups cost more than operations like additions or xor? Why does it work in ZFEC and not in Python?

I will attribute the bounty to whoever can show me how to make this multiplication/addition lookup-tables optimization work (faster than the xor and addition operations) or who can explain to me with references or analysis why this optimization cannot work here (using Python/PyPy/Cython/Numpy etc.. I tried them all).

Solution

The following is 3x faster than pypy on my machine (0.04s vs 0.15s). Using Cython:

ctypedef unsigned char uint8_t # does not work with Microsoft's C Compiler: from libc.stdint cimport uint8_t
cimport cpython.array as array

cdef uint8_t[::1] gf_exp = bytearray([1, 3, 5, 15, 17, 51, 85, 255, 26, 46, 114, 150, 161, 248, 19,
   lots of numbers omitted for space reasons
   ...])

cdef uint8_t[::1] gf_log = bytearray([0, 0, 25, 1, 50, 2, 26, 198, 75, 199, 27, 104, 
    more numbers omitted for space reasons
    ...])

import cython

@cython.boundscheck(False)
@cython.wraparound(False)
@cython.initializedcheck(False)
def rsenc(msg_in_r, nsym, gen_t):
    '''Reed-Solomon encoding using polynomial division, better explained at http://research.swtch.com/field'''

    cdef uint8_t[::1] msg_in = bytearray(msg_in_r) # have to copy, unfortunately - can't make a memory view from a read only object
    cdef int[::1] gen = array.array('i',gen_t) # convert list to array

    cdef uint8_t[::1] msg_out = bytearray(msg_in) + bytearray(len(gen)-1)
    cdef int j
    cdef uint8_t[::1] lgen = bytearray(gen.shape[0])
    for j in xrange(gen.shape[0]):
        lgen[j] = gf_log[gen[j]]

    cdef uint8_t coef,lcoef

    cdef int i
    for i in xrange(msg_in.shape[0]):
        coef = msg_out[i]
        if coef != 0: # coef 0 is normally undefined so we manage it manually here (and it also serves as an optimization btw)
            lcoef = gf_log[coef] # precaching

            for j in xrange(1, gen.shape[0]): # optimization: can skip g0 because the first coefficient of the generator is always 1! (that's why we start at position 1)
                msg_out[i + j] ^= gf_exp[lcoef + lgen[j]] # equivalent (in Galois Field 2^8) to msg_out[i+j] -= msg_out[i] * gen[j]

    # Recopy the original message bytes
    msg_out[:msg_in.shape[0]] = msg_in
    return msg_out

It is just your fastest version with static types (and checking the html from cython -a until the loops aren't highlighted in yellow).

A few brief notes:

Cython prefers x.shape[0] to len(shape)
Defining the memoryviews as [::1] promises they are continuous in memory, which helps
initializedcheck(False) is necesary for avoiding lots of existance checks on the globally defined gf_exp and gf_log. (You might find you can speed up your basic Python/PyPy code by creating a local variable reference for these and using that istead)
I had to copy a couple of the input arguments. Cython can't make a memoryview from a readonly object (in this case msg_in, a string. I could probably have just made it a char* though). Also gen (a list) needs to be in something with fast element access.

Other than that it's all fairly straight-forward. (I haven't tried any variations of it having got it faster). I'm really quite impressed at how well PyPy does.