Search code examples
pythonpandasgroup-by

groupby.rolling slower than with for loop


I want to perfomr a groupby rolling mean on a large dataset.

With large df, doign groupby.rolling seems to be slower than using for loop:

This

grouped['value'].rolling(window=window_size, min_periods=0).mean().reset_index(level=0, drop=True)

seems to be slower than:

for key, grp in grouped:
    grp['rolling_mean'] = grp['value'].rolling(window=window_size, min_periods=0).mean()
    rolling_mean_df = pd.concat([rolling_mean_df, grp])

why is it? and what method can i use to speed up rolling mean with groupby on large df.

import matplotlib.pyplot as plt
import time
import numpy as np
import pandas as pd

# Define different sizes to test
sizes = [10, 100, 1000, 10000, 20000, 30000, 50000, 100000, 200000, 500000]
window_size = 10

# Lists to store execution times for each method
times_groupby_1 = []
times_groupby_2 = []

for size in sizes:
    # Create a DataFrame for each size
    data = {'value': np.random.rand(size), 'group': np.random.choice(['A', 'B', 'C', 'D'], size)}
    sample_df = pd.DataFrame(data)
    grouped = sample_df.groupby('group')

    # Method 1: Using groupby with rolling mean (for loop method)
    start_time = time.time()
    rolling_mean_df = pd.DataFrame()
    for key, grp in grouped:
        grp['rolling_mean'] = grp['value'].rolling(window=window_size, min_periods=0).mean()
        rolling_mean_df = pd.concat([rolling_mean_df, grp])
    end_time = time.time()
    times_groupby_1.append(end_time - start_time)

    # Method 2: Using groupby with rolling mean (direct method)
    start_time = time.time()
    sample_df['rolling_mean'] = grouped['value'].rolling(window=window_size, min_periods=0).mean().reset_index(level=0, drop=True)
    end_time = time.time()
    times_groupby_2.append(end_time - start_time)

# Plotting the results
plt.figure(figsize=(10, 6))
plt.plot(sizes, times_groupby_1, label='Method 1 (for loop)', marker='o')
plt.plot(sizes, times_groupby_2, label='Method 2 (direct)', marker='o')
plt.xlabel('DataFrame Size')
plt.ylabel('Time (seconds)')
plt.title('Comparison of Execution Times for Rolling Mean Calculation')
plt.xscale('log')
plt.yscale('log')
plt.legend()
plt.grid(True)
plt.show()

enter image description here


Solution

  • It all depends on character of your data. If you have only few groups and many values (as in your example), then standard .rolling() will be very fast (.mean() is highly optimized).

    But if your data contains many groups you can try to come with custom solutions (to remove the overhead of concatenating the dataframes etc.). For example you can use :

    from numba import float64, int64, njit
    from numba.typed import Dict, List
    from numba.types import ListType
    
    list_type_float64 = ListType(float64)
    list_type_int64 = ListType(int64)
    
    
    @njit
    def _mean(lst):
        s = 0
        for v in lst:
            s = s + v
        return s / max(len(lst), 1)
    
    
    @njit
    def _get_rolling_mean_numba(index, group, value, size):
        groups = Dict.empty(int64, list_type_float64)
        indices = Dict.empty(int64, list_type_int64)
        out = np.empty_like(value)
    
        for i, g, v in zip(index, group, value):
            groups.setdefault(g, List.empty_list(float64)).append(v)
            indices.setdefault(g, List.empty_list(int64)).append(i)
    
        cnt = 0
        for k, vals in groups.items():
            for i in range(0, len(vals)):
                out[cnt] = _mean(vals[max(0, i - size) : i + 1])
                cnt += 1
    
        return out, [i for ind in indices.values() for i in ind]
    
    
    def get_rolling_mean_numba(df, size):
        v, i = _get_rolling_mean_numba(
            df.index.to_numpy(),
            df.group.to_numpy(),
            df.value.to_numpy(),
            size,
        )
        df["rolling_mean"] = pd.Series(v, index=i)
        return df
    

    Benchmark using perfplot (where the number of groups are ~1/3 of total values):

    from statistics import mean
    
    import numpy as np
    import pandas as pd
    import perfplot
    from matplotlib import pyplot as plt
    from numba import float64, int64, njit
    from numba.typed import Dict, List
    from numba.types import ListType
    
    plt.rcParams["figure.autolayout"] = True
    
    
    list_type_float64 = ListType(float64)
    list_type_int64 = ListType(int64)
    
    
    def get_sample_df(size=10):
        np.random.seed(42)
    
        sample_df = pd.DataFrame(
            {
                "value": np.random.rand(size),
                "group": np.random.choice(range(size // 3), size),
            }
        )
        return sample_df
    
    
    @njit
    def _mean(lst):
        s = 0
        for v in lst:
            s = s + v
        return s / max(len(lst), 1)
    
    
    @njit
    def _get_rolling_mean_numba(index, group, value, size):
        groups = Dict.empty(int64, list_type_float64)
        indices = Dict.empty(int64, list_type_int64)
        out = np.empty_like(value)
    
        for i, g, v in zip(index, group, value):
            groups.setdefault(g, List.empty_list(float64)).append(v)
            indices.setdefault(g, List.empty_list(int64)).append(i)
    
        cnt = 0
        for k, vals in groups.items():
            for i in range(0, len(vals)):
                out[cnt] = _mean(vals[max(0, i - size) : i + 1])
                cnt += 1
    
        return out, [i for ind in indices.values() for i in ind]
    
    
    def get_rolling_mean_numba(df, size):
        v, i = _get_rolling_mean_numba(
            df.index.to_numpy(),
            df.group.to_numpy(),
            df.value.to_numpy(),
            size,
        )
        df["rolling_mean"] = pd.Series(v, index=i)
        return df
    
    
    def get_rolling_mean_original(df, size):
        grouped = df.groupby("group")
    
        df["rolling_mean"] = (
            grouped["value"]
            .rolling(window=size, min_periods=0)
            .mean()
            .reset_index(level=0, drop=True)
        )
    
        return df
    
    
    def get_rolling_mean_original_2(df, size):
        grouped = df.groupby("group")
        rolling_mean_df = pd.DataFrame()
    
        for key, grp in grouped:
            grp["rolling_mean"] = grp["value"].rolling(window=size, min_periods=0).mean()
            rolling_mean_df = pd.concat([rolling_mean_df, grp])
    
        return rolling_mean_df
    
    
    def get_rolling_mean_standard(df, size):
        out, groups, indices = [], {}, {}
        for i, g, v in zip(df.index, df["group"], df["value"]):
            groups.setdefault(g, []).append(v)
            indices.setdefault(g, []).append(i)
    
        for k, vals in groups.items():
            groups[k] = [mean(vals[max(0, i - size) : i + 1]) for i in range(0, len(vals))]
    
        df["rolling_mean"] = pd.Series(
            [v for vals in groups.values() for v in vals],
            index=[i for indices in indices.values() for i in indices],
        )
        return df
    
    
    print(get_rolling_mean_original(get_sample_df(), 10).sort_values(by=["group", "value"]))
    print(
        get_rolling_mean_original_2(get_sample_df(), 10).sort_values(by=["group", "value"])
    )
    print(get_rolling_mean_standard(get_sample_df(), 10).sort_values(by=["group", "value"]))
    
    # this compiles the numba version:
    print(get_rolling_mean_numba(get_sample_df(), 10).sort_values(by=["group", "value"]))
    
    
    perfplot.show(
        setup=lambda size: pd.DataFrame(
            {
                "value": np.random.rand(size),
                "group": np.random.choice(range(size // 3), size),
            }
        ),
        kernels=[
            lambda df: get_rolling_mean_original(df, 10),
            lambda df: get_rolling_mean_original_2(df, 10),
            lambda df: get_rolling_mean_standard(df, 10),
            lambda df: get_rolling_mean_numba(df, 10),
        ],
        labels=["original 1", "original 2", "plain python", "numba"],
        n_range=[10, 100, 1000, 10000, 20000, 30000, 50000, 100000, 200000],
        xlabel="N",
        logx=True,
        logy=True,
        equality_check=None,
    )
    

    Result:

    enter image description here