import pandas as pd
import numpy as np

def impute_row_median(
    s: pd.Series,
    threshold: float
) -> pd.Series:
    '''For a vector of values, impute NaNs with the median if %NaN is below threshold'''
    nan_mask = s.isna()
    if nan_mask.any() and ((nan_mask.sum() / s.size) * 100) < threshold:
        s_median = s.median(skipna=True)
        s[nan_mask] = s_median
    return s  # dtype: float
df = pd.DataFrame(np.random.uniform(0, 1, size=(1000, 5)))
df = df.mask(df < 0.5)  # ~50% of the values become NaN
df.apply(impute_row_median, axis=1, threshold=80)  # slow
The apply above is pretty slow (I didn't use timeit since I have nothing to compare it to). My usual approach would be to avoid apply and use vectorised functions like np.where instead, but I can't currently see a way to do that here. Does anyone have any suggestions? Thank you!
To compute the percentage of missing values per row, take the mean of the boolean mask. Then chain the 2D mask with the 1D row mask by broadcasting, and replace the missing values with DataFrame.mask:
threshold = 80

mask = df.isna()                         # 2D mask: which cells are NaN
m = mask.mean(axis=1) * 100 < threshold  # 1D mask: rows whose %NaN is below threshold
df1 = df.mask(mask & m.to_numpy()[:, None], df.median(axis=1, skipna=True), axis=0)
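The key step is mask & m.to_numpy()[:, None]: reshaping the 1D row mask to a column vector broadcasts it across all columns, so a cell is selected only if it is NaN and its row passes the threshold test. A toy example (the frame here is purely illustrative):

toy = pd.DataFrame({'a': [np.nan, np.nan], 'b': [1.0, np.nan], 'c': [2.0, np.nan]})
mask = toy.isna()                    # 2D: cell-level NaN mask
m = mask.mean(axis=1) * 100 < 80     # 1D: row 0 is ~33% NaN (True), row 1 is 100% (False)
print(mask & m.to_numpy()[:, None])  # only row 0's NaN cell remains True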
Similar idea with numpy.where:
mask = df.isna()
m = mask.mean(axis=1) * 100 < threshold
arr = np.where(mask & m.to_numpy()[:, None],
               df.median(axis=1, skipna=True).to_numpy()[:, None],  # row medians as a column vector
               df)
df1 = pd.DataFrame(arr, index=df.index, columns=df.columns)
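Note that np.where returns a plain ndarray, which is why the DataFrame is rebuilt from arr with the original index and columns; all columns here are float, so nothing is lost in the round trip.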
The same logic in pure NumPy, working on the underlying array:

threshold = 80

a = df.to_numpy()
mask = np.isnan(a)
m = np.mean(mask, axis=1) * 100 < threshold
arr = np.where(mask & m[:, None], np.nanmedian(a, axis=1)[:, None], df)
df1 = pd.DataFrame(arr, index=df.index, columns=df.columns)
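One caveat with the pure NumPy variant: np.nanmedian emits an "All-NaN slice encountered" RuntimeWarning for rows that are entirely NaN, whereas pandas' median returns NaN silently. With threshold=80 such rows fail the threshold test and are never modified, so the warning is harmless and can be suppressed; a minimal sketch:

import warnings

with warnings.catch_warnings():
    # all-NaN rows fail the threshold test, so their (NaN) medians are never used
    warnings.simplefilter('ignore', category=RuntimeWarning)
    row_medians = np.nanmedian(a, axis=1)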
Verification against the original apply:

print(df1.equals(df.apply(impute_row_median, axis=1, threshold=80)))
True
Performance comparison (10k rows, 50 columns):
np.random.seed(2023)
df = pd.DataFrame(np.random.uniform(0, 1, size=(10000, 50)))
df = df.mask(df < 0.5)
In [130]: %timeit df.apply(impute_row_median, axis=1, threshold=80)
2.12 s ± 370 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)
In [131]: %%timeit
...: a = df.to_numpy()
...:
...: mask = np.isnan(a)
...: m = np.mean(mask, axis=1) * 100 < threshold
...: arr = np.where(mask & m[:, None], np.nanmedian(a, axis=1)[:, None], df)
...:
...: df1 = pd.DataFrame(arr, index=df.index, columns=df.columns)
...:
29.5 ms ± 330 µs per loop (mean ± std. dev. of 7 runs, 10 loops each)
In [132]: %%timeit
...: threshold = 80
...:
...: mask = df.isna()
...: m = mask.mean(axis=1) * 100 < threshold
...: df1 = df.mask(mask & m.to_numpy()[:, None], df.median(axis=1, skipna=True), axis=0)
...:
18.6 ms ± 118 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)
In [133]: %%timeit
...: mask = df.isna()
...: m = mask.mean(axis=1) * 100 < threshold
...: arr = np.where(mask & m.to_numpy()[:, None],
...: df.median(axis=1, skipna=True).to_numpy()[:, None],
...: df)
...:
...: df1 = pd.DataFrame(arr, index=df.index, columns=df.columns)
...:
...:
10.2 ms ± 230 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)
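For reuse, the fastest variant here (np.where over pandas row medians, roughly 200x faster than the row-wise apply) can be wrapped into a drop-in replacement for the original function; impute_median_vectorised is just an illustrative name for this sketch:

def impute_median_vectorised(df: pd.DataFrame, threshold: float) -> pd.DataFrame:
    '''Impute row medians into NaNs, but only for rows whose %NaN is below threshold.'''
    mask = df.isna()
    m = mask.mean(axis=1) * 100 < threshold
    arr = np.where(mask & m.to_numpy()[:, None],
                   df.median(axis=1, skipna=True).to_numpy()[:, None],
                   df)
    return pd.DataFrame(arr, index=df.index, columns=df.columns)

df1 = impute_median_vectorised(df, threshold=80)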