Search code examples
python · numpy · performance · indexing

# Computing multiple 1d curves into their 2d array (image) representation


I wonder if anyone could please help me do this operation in python as efficiently as possible. I think numpy fancy indexing is the best approach but I have not been able to make it work (any alternative approach is also welcome)

I have simple data sets which take the shape of curves like this:

enter image description here

I want to compute their image representation: a matrix where values under the curve are 1 and those above are 0:

enter image description here

My current approach consists in using np.searchsorted to map the 1d array values to the number of "pixels" in the image:

import numpy as np
from matplotlib import pyplot as plt

# The 1-D curve: eight samples on an integer grid.
n_data = 8
x_data = np.arange(n_data)
y_data = np.array([0.2, 2.3, 5.9, 7.2, 6.2, 4.7, 2.9, 0.9])

# Blank image that will receive the rasterized curve.
y_matrix = np.zeros((n_data, n_data))
rows = np.arange(n_data).reshape(-1, 1)

# Bin edges 0..7; searchsorted maps each sample to its pixel count.
threshold_array = np.arange(n_data, dtype=np.float64)
approximation = np.searchsorted(threshold_array, y_data)

Afterwards a simple subtraction can generate the 2D mask:

# Flip the row index so row 0 is the top of the image, then mark every
# pixel whose bottom-up height falls strictly under the curve.
# (rows >= n_data - approximation is equivalent to n_data - 1 - rows < approximation
# for integer arrays.)
mask = rows >= n_data - approximation
y_matrix[mask] = 1

print(y_matrix)

This mask looks like:

[[0. 0. 0. 1. 0. 0. 0. 0.]
 [0. 0. 0. 1. 1. 0. 0. 0.]
 [0. 0. 1. 1. 1. 0. 0. 0.]
 [0. 0. 1. 1. 1. 1. 0. 0.]
 [0. 0. 1. 1. 1. 1. 0. 0.]
 [0. 1. 1. 1. 1. 1. 1. 0.]
 [0. 1. 1. 1. 1. 1. 1. 0.]
 [1. 1. 1. 1. 1. 1. 1. 1.]]

Now I would like to apply this operation to hundreds of thousands of vectors. For example with:

y_data = np.array([[0.2, 2.3, 5.9, 7.2, 6.2, 4.7, 2.9, 0.9],
                  [0.2, 2.3, 5.9, 7.2, 6.2, 4.7, 2.9, 0.9],
                  [0.2, 2.3, 5.9, 7.2, 6.2, 4.7, 2.9, 0.9]])

This should result in three matrices, or an array with (8, 8, 3) shape. Would anyone offer an efficient approach?

Thanks


Solution

  • One possible solution would be to use Numba (the speedup is quite large for a large number of vectors):

    from numba import njit
    
    @njit
    def compute_repr_numba(y_data):
        """Rasterize each 1-D curve in y_data into a (y, y) binary image.

        y_data: 2-D array of shape (n_curves, y). Returns a uint8 array of
        shape (n_curves, y, y) where column j of image i has its bottom
        ceil(y_data[i, j]) pixels set to 1.
        """
        x, y = y_data.shape
    
        out = np.zeros(shape=(x, y, y), dtype=np.uint8)
    
        for i in range(len(y_data)):
            vec = y_data[i]
            out_arr = out[i]
            for j, val in enumerate(vec):
                # Fill ceil(val) pixels from the bottom row (k = y - 1) upwards.
                for k in range(y - 1, y - int(np.ceil(val)) - 1, -1):
                    out_arr[k, j] = 1
    
        return out
    
    y_data = np.array(
        [
            [0.2, 2.3, 5.9, 7.2, 6.2, 4.7, 2.9, 0.9],
            [0.0, 0.0, 5.9, 7.2, 6.2, 4.7, 2.9, 0.9],
        ]
    )
    # Call the function defined above; the original snippet called
    # compute_repr_numba_single, which is undefined in this snippet (NameError).
    print(compute_repr_numba(y_data))
    

    Prints:

    [[[0 0 0 1 0 0 0 0]
      [0 0 0 1 1 0 0 0]
      [0 0 1 1 1 0 0 0]
      [0 0 1 1 1 1 0 0]
      [0 0 1 1 1 1 0 0]
      [0 1 1 1 1 1 1 0]
      [0 1 1 1 1 1 1 0]
      [1 1 1 1 1 1 1 1]]
    
     [[0 0 0 1 0 0 0 0]
      [0 0 0 1 1 0 0 0]
      [0 0 1 1 1 0 0 0]
      [0 0 1 1 1 1 0 0]
      [0 0 1 1 1 1 0 0]
      [0 0 1 1 1 1 1 0]
      [0 0 1 1 1 1 1 0]
      [0 0 1 1 1 1 1 1]]]
    

    Benchmark (+ using parallelization):

    import perfplot
    from numba import njit, prange
    
    
    def compute_repr_normal(y_data):
        """Vectorized NumPy reference: rasterize each curve into a binary image.

        y_data: (n_curves, y) float array. Returns a uint8 array of shape
        (n_curves, y, y); out[i, k, j] is 1 iff row k (counted top-down)
        lies under the curve height y_data[i, j].
        """
        x, y = y_data.shape
        # Loop-invariant work hoisted out of the per-curve loop. The bin
        # edges are generalized from a hard-coded length-8 array to any y
        # (identical values for the benchmark's y == 8 inputs).
        threshold_array = np.arange(y, dtype=np.float64)
        rows = np.arange(y)[:, np.newaxis]

        out = np.zeros(shape=(x, y, y), dtype=np.uint8)

        for i, vec in enumerate(y_data):
            # Number of "pixels" the curve covers at each column.
            approximation = np.searchsorted(threshold_array, vec)
            # Rows are numbered top-down, so invert before comparing.
            mask = y - 1 - rows < approximation
            out[i, mask] = 1

        return out
    
    
    @njit
    def compute_repr_numba_single(y_data):
        """Sequential Numba kernel: one binary image per input curve."""
        n_curves, y = y_data.shape

        out = np.zeros(shape=(n_curves, y, y), dtype=np.uint8)

        for i in range(n_curves):
            curve = y_data[i]
            image = out[i]
            for j in range(y):
                # Pixels covered in this column, counted from the bottom row.
                height = int(np.ceil(curve[j]))
                for k in range(y - height, y):
                    image[k, j] = 1

        return out
    
    
    @njit(parallel=True)
    def compute_repr_numba_parallel(y_data):
        """Parallel Numba kernel: curves are distributed across threads."""
        n_curves, y = y_data.shape

        out = np.zeros(shape=(n_curves, y, y), dtype=np.uint8)

        for i in prange(n_curves):
            curve = y_data[i]
            image = out[i]
            for j in range(y):
                height = int(np.ceil(curve[j]))
                # Paint the bottom `height` pixels of column j.
                for k in range(y - height, y):
                    image[k, j] = 1

        return out
    
    
    def compute_repr_oskar(y_data):
        """Fully vectorized variant: one broadcasted comparison, no Python loop.

        y_data: (n_curves, y) array. Returns a uint8 array of shape
        (n_curves, y, y).
        """
        x, y = y_data.shape
        x_data = np.arange(y)

        # Broadcasting replaces the original np.tile, which materialized a
        # full (x, y, y) copy before the comparison; the result is identical.
        # Row k (top-down) is lit when the curve value exceeds y - 1 - k.
        images = y_data[:, None, :] > x_data[::-1, None]
        return images.astype(np.uint8)
    
    
    # Sanity check: every implementation must agree with the NumPy reference.
    y_data = np.array([[0.0, 0.0, 5.9, 7.2, 6.2, 4.7, 2.9, 0.9]])

    for impl in (
        compute_repr_numba_parallel,
        compute_repr_numba_single,
        compute_repr_oskar,
    ):
        assert np.allclose(impl(y_data), compute_repr_normal(y_data))

    np.random.seed(0)

    # Benchmark all four kernels over a growing number of 8-sample curves.
    perfplot.show(
        setup=lambda n: np.random.random(size=(n, 8)),
        kernels=[
            compute_repr_normal,
            compute_repr_numba_parallel,
            compute_repr_numba_single,
            compute_repr_oskar,
        ],
        labels=["normal", "numba_parallel", "numba_single", "compute_repr_oskar"],
        n_range=[5, 10, 25, 100, 500, 1000, 10_000, 50_000, 250_000],
        xlabel="N",
        logx=True,
        logy=True,
        equality_check=np.allclose,
    )
    

    On my machine (AMD 5700x) produces this graph:

    enter image description here