I want to perform a groupby rolling mean on a large dataset.
With a large df, doing groupby.rolling seems to be slower than using a for loop:
This
grouped['value'].rolling(window=window_size, min_periods=0).mean().reset_index(level=0, drop=True)
seems to be slower than:
for key, grp in grouped:
grp['rolling_mean'] = grp['value'].rolling(window=window_size, min_periods=0).mean()
rolling_mean_df = pd.concat([rolling_mean_df, grp])
Why is that, and what method can I use to speed up a rolling mean with groupby on a large df?
import matplotlib.pyplot as plt
import time
import numpy as np
import pandas as pd
# Benchmark two ways of computing a per-group rolling mean — an explicit
# per-group loop with concat versus a single groupby(...).rolling(...) call —
# over a range of DataFrame sizes, then plot the timings on log-log axes.
sizes = [10, 100, 1000, 10000, 20000, 30000, 50000, 100000, 200000, 500000]
window_size = 10

# Execution times collected for each method, one entry per size.
times_groupby_1 = []  # Method 1: per-group loop + concat
times_groupby_2 = []  # Method 2: direct groupby.rolling

for size in sizes:
    # Fresh random frame for this size: values spread over four groups.
    sample_df = pd.DataFrame(
        {
            'value': np.random.rand(size),
            'group': np.random.choice(['A', 'B', 'C', 'D'], size),
        }
    )
    grouped = sample_df.groupby('group')

    # Method 1: iterate the groups, roll each group, concatenate the pieces.
    start_time = time.time()
    rolling_mean_df = pd.DataFrame()
    for key, grp in grouped:
        grp['rolling_mean'] = grp['value'].rolling(window=window_size, min_periods=0).mean()
        rolling_mean_df = pd.concat([rolling_mean_df, grp])
    times_groupby_1.append(time.time() - start_time)

    # Method 2: one groupby.rolling call, realigned to the original index by
    # dropping the group level from the result's MultiIndex.
    start_time = time.time()
    sample_df['rolling_mean'] = (
        grouped['value']
        .rolling(window=window_size, min_periods=0)
        .mean()
        .reset_index(level=0, drop=True)
    )
    times_groupby_2.append(time.time() - start_time)

# Plot both timing curves.
plt.figure(figsize=(10, 6))
plt.plot(sizes, times_groupby_1, label='Method 1 (for loop)', marker='o')
plt.plot(sizes, times_groupby_2, label='Method 2 (direct)', marker='o')
plt.xlabel('DataFrame Size')
plt.ylabel('Time (seconds)')
plt.title('Comparison of Execution Times for Rolling Mean Calculation')
plt.xscale('log')
plt.yscale('log')
plt.legend()
plt.grid(True)
plt.show()
It all depends on the character of your data. If you have only a few groups and many values (as in your example), then the standard .rolling()
will be very fast (.mean()
is highly optimized).
But if your data contains many groups, you can try to come up with custom solutions (to remove the overhead of concatenating the DataFrames, etc.). For example, you can use numba:
from numba import float64, int64, njit
from numba.typed import Dict, List
from numba.types import ListType
# Numba typed-list element types, declared once at module level so the
# @njit kernels below can build homogeneous typed lists of values/indices.
list_type_float64 = ListType(float64)
list_type_int64 = ListType(int64)
@njit
def _mean(lst):
    """Arithmetic mean of a typed list (numba-compiled).

    The max(len, 1) guard makes the empty-list case return 0.0 instead of
    raising a division-by-zero error.
    """
    total = 0
    for item in lst:
        total = total + item
    return total / max(len(lst), 1)
@njit
def _get_rolling_mean_numba(index, group, value, size):
    """Rolling mean of `value` within each `group` (numba-compiled kernel).

    Parameters: `index`/`group` are int64 arrays (group keys feed an int64
    typed Dict), `value` is a float64 array, `size` is the window length.
    Returns (out, flat_index): the rolling means in group-major order, plus
    the original index labels in the same order so the caller can realign.
    """
    groups = Dict.empty(int64, list_type_float64)
    indices = Dict.empty(int64, list_type_int64)
    out = np.empty_like(value)
    # Bucket values (and their original index labels) by group, in row order.
    for i, g, v in zip(index, group, value):
        groups.setdefault(g, List.empty_list(float64)).append(v)
        indices.setdefault(g, List.empty_list(int64)).append(i)
    cnt = 0
    for k, vals in groups.items():
        for i in range(0, len(vals)):
            # Window = the `size` observations ending at position i.
            # FIX: the original start `i - size` included size + 1
            # observations, disagreeing with pandas rolling(window=size).
            out[cnt] = _mean(vals[max(0, i - size + 1) : i + 1])
            cnt += 1
    return out, [i for ind in indices.values() for i in ind]
def get_rolling_mean_numba(df, size):
    """Attach a 'rolling_mean' column to `df` using the numba kernel."""
    means, positions = _get_rolling_mean_numba(
        df.index.to_numpy(),
        df.group.to_numpy(),
        df.value.to_numpy(),
        size,
    )
    # The kernel returns group-major results plus the matching original index
    # labels; wrapping them in a Series realigns onto df's row order.
    df["rolling_mean"] = pd.Series(means, index=positions)
    return df
Benchmark using perfplot
(where the number of groups is ~1/3 of the total number of values):
from statistics import mean
import numpy as np
import pandas as pd
import perfplot
from matplotlib import pyplot as plt
from numba import float64, int64, njit
from numba.typed import Dict, List
from numba.types import ListType
# Let matplotlib auto-fit labels/legends inside the figure bounds.
plt.rcParams["figure.autolayout"] = True
# Numba typed-list element types used by the @njit kernels below.
list_type_float64 = ListType(float64)
list_type_int64 = ListType(int64)
def get_sample_df(size=10):
    """Build a reproducible random frame with ~size/3 distinct groups.

    Columns: 'value' (uniform floats in [0, 1)) and 'group' (ints drawn
    from range(size // 3)). The fixed seed makes every call identical.
    """
    np.random.seed(42)
    return pd.DataFrame(
        {
            "value": np.random.rand(size),
            "group": np.random.choice(range(size // 3), size),
        }
    )
@njit
def _mean(lst):
    # Manual summation keeps this numba-friendly; max(len, 1) turns the
    # empty-list case into 0 / 1 == 0.0 rather than a division error.
    acc = 0
    for x in lst:
        acc += x
    return acc / max(len(lst), 1)
@njit
def _get_rolling_mean_numba(index, group, value, size):
    """Rolling mean of `value` within each `group` (numba-compiled kernel).

    Parameters: `index`/`group` are int64 arrays (group keys feed an int64
    typed Dict), `value` is a float64 array, `size` is the window length.
    Returns (out, flat_index): the rolling means in group-major order, plus
    the original index labels in the same order so the caller can realign.
    """
    groups = Dict.empty(int64, list_type_float64)
    indices = Dict.empty(int64, list_type_int64)
    out = np.empty_like(value)
    # Bucket values (and their original index labels) by group, in row order.
    for i, g, v in zip(index, group, value):
        groups.setdefault(g, List.empty_list(float64)).append(v)
        indices.setdefault(g, List.empty_list(int64)).append(i)
    cnt = 0
    for k, vals in groups.items():
        for i in range(0, len(vals)):
            # Window = the `size` observations ending at position i.
            # FIX: the original start `i - size` included size + 1
            # observations, disagreeing with pandas rolling(window=size).
            out[cnt] = _mean(vals[max(0, i - size + 1) : i + 1])
            cnt += 1
    return out, [i for ind in indices.values() for i in ind]
def get_rolling_mean_numba(df, size):
    """Add a per-group 'rolling_mean' column computed by the numba kernel."""
    result_values, result_index = _get_rolling_mean_numba(
        df.index.to_numpy(),
        df.group.to_numpy(),
        df.value.to_numpy(),
        size,
    )
    # Realign the group-major kernel output onto df's original row order.
    df["rolling_mean"] = pd.Series(result_values, index=result_index)
    return df
def get_rolling_mean_original(df, size):
    """Per-group rolling mean via a single groupby(...).rolling(...) call."""
    rolled = df.groupby("group")["value"].rolling(window=size, min_periods=0).mean()
    # Drop the group level from the MultiIndex so the result realigns with
    # df's original row index on assignment.
    df["rolling_mean"] = rolled.reset_index(level=0, drop=True)
    return df
def get_rolling_mean_original_2(df, size):
    """Per-group rolling mean via explicit iteration over the groups.

    Deliberately re-concatenates inside the loop — that repeated-concat
    pattern is exactly what this benchmark variant measures. Rows come back
    ordered group by group, with their original index labels preserved.
    """
    result = pd.DataFrame()
    for _, chunk in df.groupby("group"):
        chunk["rolling_mean"] = chunk["value"].rolling(window=size, min_periods=0).mean()
        result = pd.concat([result, chunk])
    return result
def get_rolling_mean_standard(df, size):
    """Per-group rolling mean in plain Python (no pandas rolling).

    Matches pandas rolling(window=size, min_periods=0).mean(): each window
    holds at most `size` observations ending at the current row.
    """
    groups, indices = {}, {}
    # Bucket values and their original index labels by group, in row order.
    # (The original also created an unused `out` list — removed.)
    for idx, grp, val in zip(df.index, df["group"], df["value"]):
        groups.setdefault(grp, []).append(val)
        indices.setdefault(grp, []).append(idx)
    for key, vals in groups.items():
        # FIX: window start is i - size + 1 (clamped to 0), i.e. exactly
        # `size` observations, matching pandas. The original used `i - size`,
        # which included size + 1 observations.
        groups[key] = [
            mean(vals[max(0, i - size + 1) : i + 1]) for i in range(len(vals))
        ]
    # `idx_list` avoids the original's shadowing of the `indices` dict by the
    # comprehension loop variable.
    df["rolling_mean"] = pd.Series(
        [v for vals in groups.values() for v in vals],
        index=[i for idx_list in indices.values() for i in idx_list],
    )
    return df
# Smoke-test all four implementations on a tiny frame; sorting makes the
# printed outputs comparable even though the methods return rows in
# different orders.
print(get_rolling_mean_original(get_sample_df(), 10).sort_values(by=["group", "value"]))
print(
    get_rolling_mean_original_2(get_sample_df(), 10).sort_values(by=["group", "value"])
)
print(get_rolling_mean_standard(get_sample_df(), 10).sort_values(by=["group", "value"]))
# this compiles the numba version:
print(get_rolling_mean_numba(get_sample_df(), 10).sort_values(by=["group", "value"]))
# Benchmark the four implementations with perfplot; each kernel receives a
# fresh frame whose group count is ~1/3 of its row count (many-groups regime).
perfplot.show(
    setup=lambda size: pd.DataFrame(
        {
            "value": np.random.rand(size),
            "group": np.random.choice(range(size // 3), size),
        }
    ),
    kernels=[
        lambda df: get_rolling_mean_original(df, 10),
        lambda df: get_rolling_mean_original_2(df, 10),
        lambda df: get_rolling_mean_standard(df, 10),
        lambda df: get_rolling_mean_numba(df, 10),
    ],
    labels=["original 1", "original 2", "plain python", "numba"],
    n_range=[10, 100, 1000, 10000, 20000, 30000, 50000, 100000, 200000],
    xlabel="N",
    logx=True,
    logy=True,
    # Kernels return rows in different orders, so skip perfplot's output
    # equality check and compare timings only.
    equality_check=None,
)
Result: