My end goal is to get c_indices
as described here. However, I'm unable to find any good optimization to increase the speed. Is it even possible to perform some kind of vectorization to reduce the time complexity?
_c = np.random.randint(0,1000, (10000))
c_starts = np.concatenate([[0], np.cumsum(_c[:-1])])
c_indices = np.concatenate([np.tile(np.arange(start, start + size), _c[i])
for i, (start, size) in enumerate(zip(c_starts, _c))])
for example for a given _c = [3, 4, 2]
, I would expect the following output
>>> _c
[3, 4, 2]
>>> c_starts
array([0, 3, 7])
>>> c_indices
array([0, 1, 2, 0, 1, 2, 0, 1, 2, 3, 4, 5, 6, 3, 4, 5, 6, 3, 4,
5, 6, 3, 4, 5, 6, 7, 8, 7, 8])
#based on _c = [3, 4, 2]
#[0, 1, 2] -> repeated 3 times
#[3, 4, 5, 6] -> repeated 4 times
#[7, 8] -> repeated 2 times
I have numba solution that takes about 60% of the original time (at least on my machine). It also should significantly lower memory consumption.
import numpy as np
import numba
@numba.njit("int32[:](int32[:], int32[:])")
def simple_tiling_numba(starts, sizes):
tiled_starts = np.zeros(len(sizes) + 1, dtype=np.int64)
tiled_starts[1:] = np.cumsum(sizes ** 2)
result = np.full(tiled_starts[-1], 0, dtype=np.int32)
for idx in range(len(starts)):
start = starts[idx]
size = sizes[idx]
an_array = np.arange(start, start+size)
if size > 0:
for idx2 in range(tiled_starts[idx], tiled_starts[idx+1], size):
result[idx2:idx2+size] = an_array
return result
def main():
_c = np.random.randint(0, 1000, (10000))
c_starts = np.concatenate([[0], np.cumsum(_c[:-1])])
c_indices = np.concatenate([np.tile(np.arange(start, start + size), _c[i])
for i, (start, size) in enumerate(zip(c_starts, _c))])
c_indices2 = simple_tiling_numba(c_starts, sizes=_c)
assert (np.all(c_indices == c_indices2))
if __name__ == '__main__':
main()
Tried parallelizing and it improves performance compared to non-parallel numba but only slightly (10-20%) at the cost of 2x time CPU consumption:
@numba.njit("int32[:](int32[:], int32[:])", parallel=True)
def simple_tiling_numba_parallel(starts, sizes):
tiled_starts = np.zeros(len(sizes) + 1, dtype=np.int64)
tiled_starts[1:] = np.cumsum(sizes ** 2)
result = np.full(tiled_starts[-1], 0, dtype=np.int32)
for idx in numba.prange(len(starts)):
start = starts[idx]
size = sizes[idx]
an_array = np.arange(start, start+size)
if size > 0:
for idx2 in range(tiled_starts[idx], tiled_starts[idx+1], size):
result[idx2:idx2+size] = an_array
return result