Search code examples
Tags: python · pandas · dataframe · numpy · apply

Improve performance through vectorising code and avoiding pandas apply


import pandas as pd
import numpy as np


def impute_row_median(
    s: pd.Series,
    threshold: float,
) -> pd.Series:
    """Impute NaNs in a Series with its median if the NaN percentage is below threshold.

    Parameters
    ----------
    s : pd.Series
        Numeric values, possibly containing NaNs.
    threshold : float
        Percentage (0-100). Imputation happens only when the share of NaNs
        in ``s`` is strictly below this value.

    Returns
    -------
    pd.Series
        A Series with NaNs replaced by the median when the threshold test
        passes; otherwise the input unchanged. The input Series is never
        mutated (the original in-place ``s[nan_mask] = ...`` assignment
        modified the caller's data and trips pandas' chained-assignment
        warnings; ``fillna`` returns a fresh Series instead).
    """
    nan_mask = s.isna()
    # mean() of a boolean mask is the fraction of True values, so *100 gives
    # the NaN percentage directly (only reached when s is non-empty).
    if nan_mask.any() and (nan_mask.mean() * 100) < threshold:
        return s.fillna(s.median(skipna=True))
    return s


# Build a 1000x5 frame of uniform [0, 1) draws for the demo.
df = pd.DataFrame(np.random.uniform(0, 1, size=(1000, 5)))
# Keep only values >= 0.5, turning everything else into NaN
# (same effect as df.mask(df < 0.5)).
df = df.where(df >= 0.5)
# Row-wise apply runs a Python-level loop over all 1000 rows -- slow.
df.apply(impute_row_median, axis=1, threshold=80)  # slow

The following apply is pretty slow (I didn't use timeit since I have nothing to compare it to). My usual approach would be to avoid apply and instead use vectorised functions like np.where but I can't presently manage to conceive of a way to do that here. Does anyone have any suggestions? Thank you!


Solution

  • To compute the percentage of missing values, take the `mean` of a boolean mask; combine the 2D NaN mask with the 1D row mask via NumPy broadcasting, then replace the missing values with `DataFrame.mask`:

    threshold = 80
    
    # Boolean DataFrame marking each NaN cell.
    mask = df.isna()
    # Per-row NaN percentage: mean of a boolean mask is the fraction of True
    # values. m selects the rows eligible for imputation.
    m = mask.mean(axis=1) * 100 < threshold 
    # Broadcast the 1D row condition against the 2D NaN mask ([:, None] makes
    # it a column vector), then replace the selected cells with the per-row
    # medians; axis=0 aligns the median Series with the row index.
    df1 = df.mask(mask & m.to_numpy()[:, None], df.median(axis=1, skipna=True), axis=0)
    

    Similar idea with numpy.where:

    # Boolean DataFrame marking each NaN cell.
    mask = df.isna()
    # Rows whose NaN percentage is under the threshold.
    m = mask.mean(axis=1) * 100 < threshold
    # np.where picks the per-row median (reshaped to a column vector so it
    # broadcasts across columns) wherever the combined mask is True, and keeps
    # the original value elsewhere.
    arr = np.where(mask & m.to_numpy()[:, None], 
                   df.median(axis=1, skipna=True).to_numpy()[:, None], 
                   df)
    
    # np.where returns a plain ndarray, so rebuild the DataFrame with the
    # original labels.
    df1 = pd.DataFrame(arr, index=df.index, columns=df.columns)
    

    threshold = 80
    
    # Pure-NumPy variant: operate on the underlying array directly.
    a = df.to_numpy()
    
    mask = np.isnan(a)
    # Fraction of NaNs per row, expressed as a percentage, compared against
    # the threshold to flag rows eligible for imputation.
    m = np.mean(mask, axis=1) * 100 < threshold
    # nanmedian ignores NaNs when computing each row's median; [:, None]
    # broadcasts the per-row medians across columns.
    arr = np.where(mask & m[:, None], np.nanmedian(a, axis=1)[:, None], df)
    
    df1 = pd.DataFrame(arr, index=df.index, columns=df.columns)
    

    print (df1.equals(df.apply(impute_row_median, axis=1, threshold=80)))
    True
    

    Performance comparison (10k rows, 50 columns):

    np.random.seed(2023)
    df = pd.DataFrame(np.random.uniform(0, 1, size=(10000, 50)))
    df = df.mask(df < 0.5)
    

    In [130]: %timeit df.apply(impute_row_median, axis=1, threshold=80)
    2.12 s ± 370 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)
    
    
    In [131]: %%timeit
         ...: a = df.to_numpy()
         ...: 
         ...: mask = np.isnan(a)
         ...: m = np.mean(mask, axis=1) * 100 < threshold
         ...: arr = np.where(mask & m[:, None], np.nanmedian(a, axis=1)[:, None], df)
         ...: 
         ...: df1 = pd.DataFrame(arr, index=df.index, columns=df.columns)
         ...: 
    29.5 ms ± 330 µs per loop (mean ± std. dev. of 7 runs, 10 loops each)
    

    In [132]: %%timeit
         ...: threshold = 80
         ...: 
         ...: mask = df.isna()
         ...: m = mask.mean(axis=1) * 100 < threshold 
         ...: df1 = df.mask(mask & m.to_numpy()[:, None],df.median(axis=1, skipna=True),axis=0)
         ...: 
    18.6 ms ± 118 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)
    

    In [133]: %%timeit
         ...: mask = df.isna()
         ...: m = mask.mean(axis=1) * 100 < threshold
         ...: arr = np.where(mask & m.to_numpy()[:, None], 
         ...:                df.median(axis=1, skipna=True).to_numpy()[:, None], 
         ...:                df)
         ...: 
         ...: df1 = pd.DataFrame(arr, index=df.index, columns=df.columns)
         ...: 
         ...: 
    10.2 ms ± 230 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)