Search code examples
python · numpy · performance · optimization

How to optimize the following structure as a vectorized operation


My end goal is to get c_indices as described here. However, I'm unable to find any good optimization to increase the speed. Is it even possible to perform some kind of vectorization to reduce the time complexity?

# _c[i] is the size of block i; block i's index range will be tiled _c[i] times.
_c = np.random.randint(0,1000, (10000))

# Exclusive prefix sum: c_starts[i] is the first index belonging to block i.
c_starts = np.concatenate([[0], np.cumsum(_c[:-1])])
# For each block, tile arange(start, start + size) `size` times, then concatenate.
# NOTE(review): output has sum(_c**2) elements — the Python-level loop over
# 10k blocks is the optimization target of the question.
c_indices =  np.concatenate([np.tile(np.arange(start, start + size), _c[i])
                              for i, (start, size) in enumerate(zip(c_starts, _c))])

for example for a given _c = [3, 4, 2], I would expect the following output

>>> _c
[3, 4, 2]
>>> c_starts
array([0, 3, 7])
>>> c_indices
array([0, 1, 2, 0, 1, 2, 0, 1, 2, 3, 4, 5, 6, 3, 4, 5, 6, 3, 4,
       5, 6, 3, 4, 5, 6, 7, 8, 7, 8])

#based on _c = [3, 4, 2]
#[0, 1, 2] -> repeated 3 times 
#[3, 4, 5, 6] -> repeated 4 times 
#[7, 8] -> repeated 2 times 




Solution

  • I have numba solution that takes about 60% of the original time (at least on my machine). It also should significantly lower memory consumption.

    import numpy as np
    import numba
    
    @numba.njit("int32[:](int32[:], int32[:])")
    def simple_tiling_numba(starts, sizes):
        """Build the tiled index array in one preallocated buffer.

        For every block i, the range [starts[i], starts[i] + sizes[i]) is
        written sizes[i] times into the output, matching the reference
        ``np.concatenate([np.tile(...), ...])`` construction.

        Parameters
        ----------
        starts : int32[:] — first index of each block (exclusive prefix sum of sizes).
        sizes  : int32[:] — length of each block (non-negative).

        Returns
        -------
        int32[:] — concatenation of each block's range tiled sizes[i] times.
        """
        # Per-block output offsets. Cast to int64 BEFORE squaring/summing:
        # sum(sizes**2) easily exceeds 2**31 (e.g. 10k blocks of size ~1000
        # sums to ~3.3e9), so an int32 cumsum would silently overflow.
        tiled_starts = np.zeros(len(sizes) + 1, dtype=np.int64)
        tiled_starts[1:] = np.cumsum(sizes.astype(np.int64) ** 2)

        result = np.zeros(tiled_starts[-1], dtype=np.int32)
        for idx in range(len(starts)):
            size = sizes[idx]
            if size > 0:  # empty blocks contribute nothing to the output
                start = starts[idx]
                an_array = np.arange(start, start + size)
                # Write the block's range `size` times, at stride `size`.
                for idx2 in range(tiled_starts[idx], tiled_starts[idx + 1], size):
                    result[idx2:idx2 + size] = an_array
        return result
    
    
    def main():
        """Check the numba implementation against the reference NumPy one."""
        # Cast explicitly to int32: np.random.randint returns the platform's
        # default integer (int64 on Linux/macOS), which would not match the
        # eager "int32[:](int32[:], int32[:])" numba signature and would make
        # the simple_tiling_numba call raise a TypeError there.
        _c = np.random.randint(0, 1000, 10000).astype(np.int32)

        # Exclusive prefix sum of block sizes = start index of each block;
        # same int32 cast for the same signature-matching reason.
        c_starts = np.concatenate([[0], np.cumsum(_c[:-1])]).astype(np.int32)
        # Reference implementation from the question.
        c_indices = np.concatenate([np.tile(np.arange(start, start + size), _c[i])
                                    for i, (start, size) in enumerate(zip(c_starts, _c))])
        c_indices2 = simple_tiling_numba(c_starts, sizes=_c)
        # Sanity check of the two implementations, not input validation.
        assert np.all(c_indices == c_indices2)


    if __name__ == '__main__':
        main()
    

    Tried parallelizing and it improves performance compared to non-parallel numba, but only slightly (10–20%) at the cost of roughly 2x the CPU consumption:

    @numba.njit("int32[:](int32[:], int32[:])", parallel=True)
    def simple_tiling_numba_parallel(starts, sizes):
        """Parallel variant of simple_tiling_numba.

        Each prange iteration writes only to the disjoint slice
        result[tiled_starts[idx]:tiled_starts[idx + 1]], so there are no
        cross-thread races.
        """
        # Cast to int64 BEFORE squaring/summing: sum(sizes**2) can exceed
        # 2**31 for the stated workload, and an int32 cumsum would silently
        # overflow before the int64 assignment.
        tiled_starts = np.zeros(len(sizes) + 1, dtype=np.int64)
        tiled_starts[1:] = np.cumsum(sizes.astype(np.int64) ** 2)

        result = np.zeros(tiled_starts[-1], dtype=np.int32)
        for idx in numba.prange(len(starts)):
            size = sizes[idx]
            if size > 0:  # empty blocks contribute nothing to the output
                start = starts[idx]
                an_array = np.arange(start, start + size)
                # Write the block's range `size` times, at stride `size`.
                for idx2 in range(tiled_starts[idx], tiled_starts[idx + 1], size):
                    result[idx2:idx2 + size] = an_array
        return result