Search code examples
pandas · numpy · vectorization

Faster way of implementing pd.replace on subset of columns


def replace_inf(df):
    """Replace NaN/inf/-inf with 0 in all columns except those listed in
    ``no_infs``, then downcast the cleaned columns to float32.

    Mutates ``df`` in place; returns None.
    """
    # NOTE(review): original had a duplicated placeholder in no_infs and a
    # 5-space/4-space indentation mix that is an IndentationError; both fixed.
    no_infs = ['some_col']  # columns to leave untouched
    inf_cols = [c for c in df.columns if c not in no_infs]
    sentinels = [np.nan, np.inf, -np.inf]
    # One vectorized replace over the whole column subset instead of a
    # per-column Python loop.  ``regex=True`` was dropped: it is meaningless
    # for purely numeric replacement values.
    df[inf_cols] = df[inf_cols].replace(sentinels, 0).astype(np.float32)

Currently this is taking 3 s for a subset of my columns and many times that for all columns. I think map, apply, lambda, and vectorize could help, but I'm having trouble writing something that works.


Solution

  • You can try to parallelize the task:

    import numba as nb
    
    
    @nb.jit(parallel=True)
    def _replace_inf_nb(m):
        """Zero out every non-finite (NaN, +inf, -inf) entry of the 2-D
        array *m* in place; columns are processed in parallel.

        Returns *m* for call-site convenience.
        """
        n_rows, n_cols = m.shape
        for j in nb.prange(n_cols):
            for i in range(n_rows):
                # isfinite is False exactly for NaN and +/-inf
                if not np.isfinite(m[i, j]):
                    m[i, j] = 0
        return m
    
    
    def replace_inf_numba(df):
        """Zero out NaN/+/-inf in every column except ``skip``, in place,
        using the parallel numba kernel."""
        skip = ["col4"]  # columns to not replace
        target_cols = [name for name in df.columns if name not in skip]
        cleaned = _replace_inf_nb(df[target_cols].values)
        df.loc[:, target_cols] = cleaned
    

    Benchmark (using a dataframe with 5000 columns / 100 rows, with 10 random NaNs, 10 inf and 10 -inf values scattered into each column):

    from timeit import timeit
    
    import numba as nb
    import numpy as np
    import pandas as pd
    
    
    def get_df(n_col, N, num_nan=1_000, num_inf=1_000, num_ninf=1_000):
        """Build an (N x n_col) dataframe of uniform random values in [0, 10)
        with columns ``col1`` .. ``col{n_col}``, then scatter ``num_nan`` NaNs,
        ``num_inf`` +inf and ``num_ninf`` -inf values into random rows of
        every column (indices may collide, so later writes can overwrite
        earlier ones)."""
        data = {}
        for n in range(1, n_col + 1):
            data[f"col{n}"] = np.random.random(size=N) * 10
        df = pd.DataFrame(data)

        specials = ((num_nan, np.nan), (num_inf, np.inf), (num_ninf, -np.inf))
        for c in df.columns:
            for count, value in specials:
                df.loc[np.random.randint(0, N, size=count), c] = value

        return df
    
    
    def replace_inf(df):
        """Replace NaN/inf/-inf with 0 in every column except "col4",
        mutating ``df`` in place (pure-pandas baseline)."""
        keep_as_is = ["col4"]
        targets = [name for name in df.columns if name not in keep_as_is]
        df[targets] = df[targets].replace([np.nan, np.inf, -np.inf], 0)
    
    
    @nb.jit(parallel=True)
    def _replace_inf_nb(m):
        """Set every NaN/+inf/-inf cell of the 2-D array *m* to 0, in place,
        parallelizing over columns; returns *m*."""
        for j in nb.prange(m.shape[1]):
            column = m[:, j]  # view — writes go through to m
            for i in range(column.shape[0]):
                if np.isnan(column[i]) or np.isinf(column[i]):
                    column[i] = 0
        return m
    
    
    def replace_inf_numba(df):
        """Zero out NaN/+/-inf everywhere except "col4", in place, via the
        numba kernel."""
        excluded = {"col4"}  # columns to not replace
        cols = [c for c in df.columns if c not in excluded]
        df.loc[:, cols] = _replace_inf_nb(df[cols].values)
    
    
    # Sanity check: both implementations must agree on a large frame.
    # NOTE: calling replace_inf_numba here also triggers numba's one-off JIT
    # compilation, so the timed run below measures execution only.
    df1 = get_df(50_000, 100, num_nan=10, num_inf=10, num_ninf=10)
    df2 = df1.copy()
    
    replace_inf(df1)
    replace_inf_numba(df2)
    
    # equal_nan=True: col4 is skipped by both versions and may still hold NaNs.
    assert np.allclose(df1, df2, equal_nan=True)
    
    # Time a single run of each version on a fresh 5000x100 frame
    # (setup rebuilds the frame so each variant sees identical dirty data).
    t1 = timeit(
        "replace_inf(df)",
        setup="df = get_df(5000, 100, num_nan=10, num_inf=10, num_ninf=10)",
        globals=globals(),
        number=1,
    )
    
    t2 = timeit(
        "replace_inf_numba(df)",
        setup="df = get_df(5000, 100, num_nan=10, num_inf=10, num_ninf=10)",
        globals=globals(),
        number=1,
    )
    
    print(t1)
    print(t2)
    

    Prints on my computer (AMD 5700x):

    0.41479378099757014
    0.004868026000622194
    

    With 50_000 columns/100 rows:

    12.89557717199932
    0.04892206600197824
    

    With 150_000 columns/100 rows:

    95.10914388500532
    0.13103608299570624