I wonder if anyone could please help me do this operation in python as efficiently as possible. I think numpy fancy indexing is the best approach but I have not been able to make it work (any alternative approach is also welcome)
I have simple data sets which take the shape of curves like this:
I want to compute their image representation: a matrix where values under the curve are 1 and values above it are 0:
My current approach consists in using np.searchsorted to map the 1d array values to the number of "pixels" in the image:
import numpy as np
from matplotlib import pyplot as plt
# 1D data
n_data = 8
x_data = np.arange(n_data)
y_data = np.array([0.2, 2.3, 5.9, 7.2, 6.2, 4.7, 2.9, 0.9])
# 2D empty data array
y_matrix = np.zeros((n_data, n_data))
# Column vector of row indices (shape (n_data, 1)) so it broadcasts
# against the per-column heights computed below.
rows = np.arange(n_data)[:, np.newaxis]
# Approximation to a 0 to 7 interval
threshold_array = np.array([0.0, 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0])
# searchsorted (left side) returns, for each y value, how many integer
# thresholds lie strictly below it -- i.e. the curve height in pixels.
approximation = np.searchsorted(threshold_array, y_data)
Afterwards, a simple subtraction can generate the 2D mask:
# Use broadcasting to create a boolean mask for the elements to be set to 1
# Row 0 is the top of the image, so the row index is flipped with
# n_data - 1 - rows; pixel (k, j) is set when it lies below the curve
# height approximation[j].
mask = n_data - 1 - rows < approximation
y_matrix[mask] = 1
print(y_matrix)
This mask looks like:
[[0. 0. 0. 1. 0. 0. 0. 0.]
[0. 0. 0. 1. 1. 0. 0. 0.]
[0. 0. 1. 1. 1. 0. 0. 0.]
[0. 0. 1. 1. 1. 1. 0. 0.]
[0. 0. 1. 1. 1. 1. 0. 0.]
[0. 1. 1. 1. 1. 1. 1. 0.]
[0. 1. 1. 1. 1. 1. 1. 0.]
[1. 1. 1. 1. 1. 1. 1. 1.]]
Now I would like to apply this operation to hundreds of thousands of vectors. For example with:
y_data = np.array([[0.2, 2.3, 5.9, 7.2, 6.2, 4.7, 2.9, 0.9],
[0.2, 2.3, 5.9, 7.2, 6.2, 4.7, 2.9, 0.9],
[0.2, 2.3, 5.9, 7.2, 6.2, 4.7, 2.9, 0.9]])
This should result in three matrices, or an array with (8, 8, 3) shape. Would anyone offer an efficient approach?
Thanks
One possible solution would be to use numba (the speedup is quite large for a large number of vectors):
from numba import njit
@njit
def compute_repr_numba(y_data):
    """numba-jitted kernel: for each curve (row of ``y_data``, shape (x, y))
    build a (y, y) uint8 image whose column j has the bottom
    ``ceil(y_data[i, j])`` pixels set to 1 (row 0 is the image top).
    """
    x, y = y_data.shape
    out = np.zeros(shape=(x, y, y), dtype=np.uint8)
    for i in range(len(y_data)):
        vec = y_data[i]
        out_arr = out[i]
        for j, val in enumerate(vec):
            # Fill rows y-1 (bottom) up to y-ceil(val): ceil(val) pixels,
            # which matches np.searchsorted on integer thresholds for val >= 0.
            # NOTE(review): for val > y the range reaches negative k and wraps
            # via negative indexing -- confirm inputs stay within [0, y].
            for k in range(y - 1, y - int(np.ceil(val)) - 1, -1):
                out_arr[k, j] = 1
    return out
# Example input: the second curve starts with two zero-height samples,
# so its first two image columns stay empty.
y_data = np.array(
    [
        [0.2, 2.3, 5.9, 7.2, 6.2, 4.7, 2.9, 0.9],
        [0.0, 0.0, 5.9, 7.2, 6.2, 4.7, 2.9, 0.9],
    ]
)
# Fix: the function defined above is `compute_repr_numba`; calling
# `compute_repr_numba_single` here raised NameError when the snippet
# was run as posted.
print(compute_repr_numba(y_data))
Prints:
[[[0 0 0 1 0 0 0 0]
[0 0 0 1 1 0 0 0]
[0 0 1 1 1 0 0 0]
[0 0 1 1 1 1 0 0]
[0 0 1 1 1 1 0 0]
[0 1 1 1 1 1 1 0]
[0 1 1 1 1 1 1 0]
[1 1 1 1 1 1 1 1]]
[[0 0 0 1 0 0 0 0]
[0 0 0 1 1 0 0 0]
[0 0 1 1 1 0 0 0]
[0 0 1 1 1 1 0 0]
[0 0 1 1 1 1 0 0]
[0 0 1 1 1 1 1 0]
[0 0 1 1 1 1 1 0]
[0 0 1 1 1 1 1 1]]]
Benchmark (+ using parallelization):
import perfplot
from numba import njit, prange
def compute_repr_normal(y_data):
    """Pure-NumPy baseline.

    Convert each 1-D curve in ``y_data`` (shape (x, y)) into a (y, y)
    binary image where pixels under the curve are 1 and pixels above it
    are 0, and return the images stacked as a (x, y, y) uint8 array.
    Row 0 of each image is the top.
    """
    x, y = y_data.shape
    # Integer thresholds 0 .. y-1; generalized from the hard-coded
    # [0.0 .. 7.0] array so the function works for any vector length.
    threshold_array = np.arange(y, dtype=np.float64)
    # Column vector of row indices; hoisted out of the loop (it is
    # loop-invariant, the original rebuilt it for every curve).
    rows = np.arange(y)[:, np.newaxis]
    out = np.zeros(shape=(x, y, y), dtype=np.uint8)
    for i, vec in enumerate(y_data):
        # Number of thresholds strictly below each sample = curve height
        # in pixels (searchsorted, left side).
        approximation = np.searchsorted(threshold_array, vec)
        # Flip the row index so row 0 is the image top, then compare.
        mask = y - 1 - rows < approximation
        out[i, mask] = 1
    return out
@njit
def compute_repr_numba_single(y_data):
    """Single-threaded numba kernel.

    For each curve (row of ``y_data``, shape (x, y)) build a (y, y)
    uint8 image whose column j has the bottom ``ceil(y_data[i, j])``
    pixels set to 1 (row 0 is the image top).
    """
    x, y = y_data.shape
    out = np.zeros(shape=(x, y, y), dtype=np.uint8)
    for i in range(len(y_data)):
        vec = y_data[i]
        out_arr = out[i]
        for j, val in enumerate(vec):
            # Fill rows y-1 (bottom) up to y-ceil(val): ceil(val) pixels,
            # matching np.searchsorted on integer thresholds for val >= 0.
            # NOTE(review): for val > y the range reaches negative k and wraps
            # via negative indexing -- confirm inputs stay within [0, y].
            for k in range(y - 1, y - int(np.ceil(val)) - 1, -1):
                out_arr[k, j] = 1
    return out
@njit(parallel=True)
def compute_repr_numba_parallel(y_data):
    """Parallel numba kernel: same per-curve logic as the single-threaded
    version, but the outer loop over curves runs in parallel via prange
    (iterations are independent -- each writes only its own out[i]).
    """
    x, y = y_data.shape
    out = np.zeros(shape=(x, y, y), dtype=np.uint8)
    for i in prange(len(y_data)):
        vec = y_data[i]
        out_arr = out[i]
        for j, val in enumerate(vec):
            # Fill the bottom ceil(val) pixels of column j.
            for k in range(y - 1, y - int(np.ceil(val)) - 1, -1):
                out_arr[k, j] = 1
    return out
def compute_repr_oskar(y_data):
    """Fully vectorized variant.

    Parameters
    ----------
    y_data : (x, y) array of curve heights.

    Returns
    -------
    (x, y, y) uint8 array where out[i, k, j] == 1 iff
    y_data[i, j] > y - 1 - k, i.e. the pixel lies below the curve
    (row 0 is the image top). Equivalent to the searchsorted approach
    for heights >= 0, since row indices are integers.
    """
    _, y = y_data.shape
    # Reversed thresholds as a (y, 1) column: image row k is compared
    # against y - 1 - k, so the top row has the highest threshold.
    thresholds = np.arange(y)[::-1, None]
    # Broadcast (x, 1, y) > (y, 1) -> (x, y, y). This removes the
    # intermediate np.tile copy, which materialized a full (x, y, y)
    # float array before the comparison.
    return (y_data[:, None, :] > thresholds).astype(np.uint8)
# Sanity input: leading zero-height samples exercise the "empty column" case.
y_data = np.array(
    [
        [0.0, 0.0, 5.9, 7.2, 6.2, 4.7, 2.9, 0.9],
    ]
)
# Check that all implementations agree with the NumPy baseline before timing.
assert np.allclose(compute_repr_numba_parallel(y_data), compute_repr_normal(y_data))
assert np.allclose(compute_repr_numba_single(y_data), compute_repr_normal(y_data))
assert np.allclose(compute_repr_oskar(y_data), compute_repr_normal(y_data))
# Fixed seed so the benchmark inputs are reproducible.
np.random.seed(0)
# Benchmark all kernels on n random curves of length 8 (values in [0, 1)).
perfplot.show(
    setup=lambda n: np.random.random(size=(n, 8)),
    kernels=[
        compute_repr_normal,
        compute_repr_numba_parallel,
        compute_repr_numba_single,
        compute_repr_oskar,
    ],
    labels=["normal", "numba_parallel", "numba_single", "compute_repr_oskar"],
    n_range=[5, 10, 25, 100, 500, 1000, 10_000, 50_000, 250_000],
    xlabel="N",
    logx=True,
    logy=True,
    equality_check=np.allclose,
)
On my machine (AMD 5700x) produces this graph: