Search code examples
python · numpy · performance · optimization

How to optimize the following structure as a vectorized operation


My end goal is to get c_indices as described here. However, I'm unable to find any good optimization to increase the speed. Is it even possible to perform some kind of vectorization to reduce the time complexity?

# _c[i] is the size of block i; block i's index range will be tiled _c[i] times.
_c = np.random.randint(0,1000, (10000))

# Exclusive prefix sum: c_starts[i] is the first index belonging to block i.
c_starts = np.concatenate([[0], np.cumsum(_c[:-1])])
# For each block, tile arange(start, start + size) `size` times, then concatenate.
# NOTE(review): output has sum(_c**2) elements — the Python-level loop over
# 10k blocks is the optimization target of the question.
c_indices =  np.concatenate([np.tile(np.arange(start, start + size), _c[i])
                              for i, (start, size) in enumerate(zip(c_starts, _c))])

for example for a given _c = [3, 4, 2], I would expect the following output

>>> _c
[3, 4, 2]
>>> c_starts
array([0, 3, 7])
>>> c_indices
array([0, 1, 2, 0, 1, 2, 0, 1, 2, 3, 4, 5, 6, 3, 4, 5, 6, 3, 4,
       5, 6, 3, 4, 5, 6, 7, 8, 7, 8])

#based on _c = [3, 4, 2]
#[0, 1, 2] -> repeated 3 times 
#[3, 4, 5, 6] -> repeated 4 times 
#[7, 8] -> repeated 2 times 




Solution

  • I have numba solution that takes about 60% of the original time (at least on my machine). It also should significantly lower memory consumption.

    import numpy as np
    import numba
    
    @numba.njit("int32[:](int32[:], int32[:])")
    def simple_tiling_numba(starts, sizes):
        """Build the tiled index array in one preallocated buffer.

        For every block i, the range [starts[i], starts[i] + sizes[i]) is
        written sizes[i] times into the output, matching the reference
        ``np.concatenate([np.tile(...), ...])`` construction.

        Parameters
        ----------
        starts : int32[:] — first index of each block (exclusive prefix sum of sizes).
        sizes  : int32[:] — length of each block (non-negative).

        Returns
        -------
        int32[:] — concatenation of each block's range tiled sizes[i] times.
        """
        # Per-block output offsets. Cast to int64 BEFORE squaring/summing:
        # sum(sizes**2) easily exceeds 2**31 (e.g. 10k blocks of size ~1000
        # sums to ~3.3e9), so an int32 cumsum would silently overflow.
        tiled_starts = np.zeros(len(sizes) + 1, dtype=np.int64)
        tiled_starts[1:] = np.cumsum(sizes.astype(np.int64) ** 2)

        result = np.zeros(tiled_starts[-1], dtype=np.int32)
        for idx in range(len(starts)):
            size = sizes[idx]
            if size > 0:  # empty blocks contribute nothing to the output
                start = starts[idx]
                an_array = np.arange(start, start + size)
                # Write the block's range `size` times, at stride `size`.
                for idx2 in range(tiled_starts[idx], tiled_starts[idx + 1], size):
                    result[idx2:idx2 + size] = an_array
        return result
    
    
    def main():
        """Check the numba implementation against the reference NumPy one."""
        # Cast explicitly to int32: np.random.randint returns the platform's
        # default integer (int64 on Linux/macOS), which would not match the
        # eager "int32[:](int32[:], int32[:])" numba signature and would make
        # the simple_tiling_numba call raise a TypeError there.
        _c = np.random.randint(0, 1000, 10000).astype(np.int32)

        # Exclusive prefix sum of block sizes = start index of each block;
        # same int32 cast for the same signature-matching reason.
        c_starts = np.concatenate([[0], np.cumsum(_c[:-1])]).astype(np.int32)
        # Reference implementation from the question.
        c_indices = np.concatenate([np.tile(np.arange(start, start + size), _c[i])
                                    for i, (start, size) in enumerate(zip(c_starts, _c))])
        c_indices2 = simple_tiling_numba(c_starts, sizes=_c)
        # Sanity check of the two implementations, not input validation.
        assert np.all(c_indices == c_indices2)


    if __name__ == '__main__':
        main()
    

    Tried parallelizing and it improves performance compared to non-parallel numba, but only slightly (10–20%) at the cost of roughly 2x the CPU consumption:

    @numba.njit("int32[:](int32[:], int32[:])", parallel=True)
    def simple_tiling_numba_parallel(starts, sizes):
        """Parallel variant of simple_tiling_numba.

        Each prange iteration writes only to the disjoint slice
        result[tiled_starts[idx]:tiled_starts[idx + 1]], so there are no
        cross-thread races.
        """
        # Cast to int64 BEFORE squaring/summing: sum(sizes**2) can exceed
        # 2**31 for the stated workload, and an int32 cumsum would silently
        # overflow before the int64 assignment.
        tiled_starts = np.zeros(len(sizes) + 1, dtype=np.int64)
        tiled_starts[1:] = np.cumsum(sizes.astype(np.int64) ** 2)

        result = np.zeros(tiled_starts[-1], dtype=np.int32)
        for idx in numba.prange(len(starts)):
            size = sizes[idx]
            if size > 0:  # empty blocks contribute nothing to the output
                start = starts[idx]
                an_array = np.arange(start, start + size)
                # Write the block's range `size` times, at stride `size`.
                for idx2 in range(tiled_starts[idx], tiled_starts[idx + 1], size):
                    result[idx2:idx2 + size] = an_array
        return result