Search code examples
Tags: python · pandas · dataframe · numpy · apply

Improve performance through vectorising code and avoiding pandas apply


import pandas as pd
import numpy as np


def impute_row_median(
    s: pd.Series,
    threshold: float,
) -> pd.Series:
    """Impute NaNs in a Series with its median if the NaN percentage is below threshold.

    Parameters
    ----------
    s : pd.Series
        Numeric values, possibly containing NaNs.
    threshold : float
        Percentage (0-100). Imputation happens only when the share of NaNs
        in ``s`` is strictly below this value.

    Returns
    -------
    pd.Series
        A Series with NaNs replaced by the median when the threshold test
        passes; otherwise the input unchanged. The input Series is never
        mutated (the original in-place ``s[nan_mask] = ...`` assignment
        modified the caller's data and trips pandas' chained-assignment
        warnings; ``fillna`` returns a fresh Series instead).
    """
    nan_mask = s.isna()
    # mean() of a boolean mask is the fraction of True values, so *100 gives
    # the NaN percentage directly (only reached when s is non-empty).
    if nan_mask.any() and (nan_mask.mean() * 100) < threshold:
        return s.fillna(s.median(skipna=True))
    return s


# Build a 1000x5 frame of uniform [0, 1) draws for the demo.
df = pd.DataFrame(np.random.uniform(0, 1, size=(1000, 5)))
# Keep only values >= 0.5, turning everything else into NaN
# (same effect as df.mask(df < 0.5)).
df = df.where(df >= 0.5)
# Row-wise apply runs a Python-level loop over all 1000 rows -- slow.
df.apply(impute_row_median, axis=1, threshold=80)  # slow

The following apply is pretty slow (I didn't use timeit since I have nothing to compare it to). My usual approach would be to avoid apply and instead use vectorised functions like np.where but I can't presently manage to conceive of a way to do that here. Does anyone have any suggestions? Thank you!


Solution

  • To compute the percentage of missing values, take the `mean` of a boolean mask; combine the 2D NaN mask with the 1D row mask via NumPy broadcasting, then replace the missing values with `DataFrame.mask`:

    threshold = 80
    
    # Boolean DataFrame marking each NaN cell.
    mask = df.isna()
    # Per-row NaN percentage: mean of a boolean mask is the fraction of True
    # values. m selects the rows eligible for imputation.
    m = mask.mean(axis=1) * 100 < threshold 
    # Broadcast the 1D row condition against the 2D NaN mask ([:, None] makes
    # it a column vector), then replace the selected cells with the per-row
    # medians; axis=0 aligns the median Series with the row index.
    df1 = df.mask(mask & m.to_numpy()[:, None], df.median(axis=1, skipna=True), axis=0)
    

    Similar idea with numpy.where:

    # Boolean DataFrame marking each NaN cell.
    mask = df.isna()
    # Rows whose NaN percentage is under the threshold.
    m = mask.mean(axis=1) * 100 < threshold
    # np.where picks the per-row median (reshaped to a column vector so it
    # broadcasts across columns) wherever the combined mask is True, and keeps
    # the original value elsewhere.
    arr = np.where(mask & m.to_numpy()[:, None], 
                   df.median(axis=1, skipna=True).to_numpy()[:, None], 
                   df)
    
    # np.where returns a plain ndarray, so rebuild the DataFrame with the
    # original labels.
    df1 = pd.DataFrame(arr, index=df.index, columns=df.columns)
    

    threshold = 80
    
    # Pure-NumPy variant: operate on the underlying array directly.
    a = df.to_numpy()
    
    mask = np.isnan(a)
    # Fraction of NaNs per row, expressed as a percentage, compared against
    # the threshold to flag rows eligible for imputation.
    m = np.mean(mask, axis=1) * 100 < threshold
    # nanmedian ignores NaNs when computing each row's median; [:, None]
    # broadcasts the per-row medians across columns.
    arr = np.where(mask & m[:, None], np.nanmedian(a, axis=1)[:, None], df)
    
    df1 = pd.DataFrame(arr, index=df.index, columns=df.columns)
    

    print (df1.equals(df.apply(impute_row_median, axis=1, threshold=80)))
    True
    

    Performance comparison (10k rows, 50 columns):

    np.random.seed(2023)
    df = pd.DataFrame(np.random.uniform(0, 1, size=(10000, 50)))
    df = df.mask(df < 0.5)
    

    In [130]: %timeit df.apply(impute_row_median, axis=1, threshold=80)
    2.12 s ± 370 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)
    
    
    In [131]: %%timeit
         ...: a = df.to_numpy()
         ...: 
         ...: mask = np.isnan(a)
         ...: m = np.mean(mask, axis=1) * 100 < threshold
         ...: arr = np.where(mask & m[:, None], np.nanmedian(a, axis=1)[:, None], df)
         ...: 
         ...: df1 = pd.DataFrame(arr, index=df.index, columns=df.columns)
         ...: 
    29.5 ms ± 330 µs per loop (mean ± std. dev. of 7 runs, 10 loops each)
    

    In [132]: %%timeit
         ...: threshold = 80
         ...: 
         ...: mask = df.isna()
         ...: m = mask.mean(axis=1) * 100 < threshold 
         ...: df1 = df.mask(mask & m.to_numpy()[:, None],df.median(axis=1, skipna=True),axis=0)
         ...: 
    18.6 ms ± 118 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)
    

    In [133]: %%timeit
         ...: mask = df.isna()
         ...: m = mask.mean(axis=1) * 100 < threshold
         ...: arr = np.where(mask & m.to_numpy()[:, None], 
         ...:                df.median(axis=1, skipna=True).to_numpy()[:, None], 
         ...:                df)
         ...: 
         ...: df1 = pd.DataFrame(arr, index=df.index, columns=df.columns)
         ...: 
         ...: 
    10.2 ms ± 230 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)