Consider the following code. The purpose of this code is to go through a groupby, combine information from each of the rows in each group, and save the result into a pd.Series.
# Build a toy frame, group by "A", and reduce each group's "B" values to one string.
import numpy as np
import pandas as pd  # was missing: the code below uses pd.DataFrame

df = pd.DataFrame({"A": ["a", "a", "b", "b", "a"], "B": ["x", "y", "z", "w", "t"]})
groups = df.groupby("A")
# Append a space to every value (element-wise on the object array), sort, concatenate.
strings = groups["B"].apply(lambda x: "".join(sorted(x.values + " ")))
# Count distinct values per group and append a suffix.
strings2 = groups["B"].apply(lambda x: str(len(np.unique(x))) + "hello")
print(strings)
print(strings2)
Outputs
a t x y
b w z
Name: B, dtype: object
A
a 3hello
b 2hello
Name: B, dtype: object
For example, the first apply statement in the code sample above is to concatenate the rows of column "B"
into one string, with spaces added.
I would like to make this code as fast as possible and to be able to scale it to the setting of multiple millions of rows.
This is the most efficient way I've been able to find so far to do this task, but I think part of the problem is that .apply
is not vectorised in pandas, so applying this to each group is a very slow process (typically, there are more than 300,000 groups). Each group only has size 2 or 3.
Alternatively, I could try skipping the groupby and doing this directly on the original DataFrame. A solution that works this way would also be appreciated.
In summary, I would like a way to rewrite the above code that achieves the same thing, but as fast and space efficient as possible (scalable to extremely large datasets).
The fastest solution for strings2 that I found was:
def cast_multiple_apply_approach(dfg):
    """Count distinct "B" values per group, stringify via repr(), add "hello"."""
    counts = dfg["B"].apply(pd.unique).map(len)
    return counts.map(repr) + "hello"
For strings2, the OP's method is quite slow:
import numpy as np
import pandas as pd
_rng = np.random.default_rng(123)
def setup(N):
x = ["a", "a", "b", "b", "a"] * N
_rng.shuffle(x)
y = ["x", "y", "z", "w", "t"] * N
_rng.shuffle(y)
df = pd.DataFrame({"A": x, "B": y})
groups = df.groupby("A")
return [groups]
def original_poster_approach(dfg):
    """OP's baseline: per-group unique count formatted with str() + "hello"."""
    def label(series):
        return str(len(np.unique(series))) + "hello"

    return dfg["B"].apply(label)
def bracula_approach(dfg):
    """Per-group distinct count of "B" via pd.unique, rendered with an f-string."""
    def fmt(values):
        return f"{len(pd.unique(values))}hello"

    return dfg["B"].apply(fmt)
def len_multiple_apply_approach(dfg):
    """Chain of small steps: unique arrays -> lengths -> pandas string dtype."""
    uniques = dfg["B"].apply(pd.unique)
    return uniques.map(len).astype("string") + "hello"
def npsize_multiple_apply_approach(dfg):
    """Like len_multiple_apply_approach, but measures array size with np.size."""
    counts = dfg["B"].apply(pd.unique).map(np.size)
    return counts.astype("string") + "hello"
def set_multiple_apply_approach(dfg):
    """Count distinct values per group with Python sets instead of pd.unique."""
    distinct = dfg["B"].apply(set)
    return distinct.map(len).astype("string") + "hello"
def cast_multiple_apply_approach(dfg):
    """Distinct "B" count per group, stringified with repr() and suffixed."""
    return dfg["B"].apply(pd.unique).map(lambda uniq: repr(len(uniq))) + "hello"
approaches = [
    original_poster_approach,
    bracula_approach,
    len_multiple_apply_approach,
    npsize_multiple_apply_approach,
    set_multiple_apply_approach,
    cast_multiple_apply_approach,
]

# Sanity-check every alternative against the baseline on a small sample.
reference = approaches[0]
for candidate in approaches[1:]:
    data = setup(100)
    assert (candidate(*data) == reference(*data)).all()

run_performance_comparison(
    approaches,
    [1000, 3000, 5000, 10000, 30000, 100_000],
    # larger sizes skipped: 300_000, 500_000, 1_000_000, 3_000_000, 5_000_000, 10_000_000
    setup=setup,
    title="Performance Comparison",
    number_of_repetitions=1,
)
Therefore I excluded it from running on a large sample:
For strings, I was surprised to find that I could improve a little on the OP's solution by using the native pd.Series.str.join instead of joining in pure Python:
def pandas_str_accessor_approach(df):
    """Sort each group's values in Python, then join them with the .str accessor."""
    sorted_groups = df.groupby("A")["B"].apply(sorted)
    return sorted_groups.str.join(" ")
def setup(N):
x = ["a", "a", "b", "b", "a"] * N
_rng.shuffle(x)
y = ["x", "y", "z", "w", "t"] * N
_rng.shuffle(y)
df = pd.DataFrame({"A": x, "B": y})
return [df]
def original_poster_approach(df):
    """OP's baseline: append a space to each value, sort, and concatenate."""
    def joined(series):
        return "".join(sorted(series.values + " "))

    return df.groupby("A")["B"].apply(joined)
def pandas_str_accessor_approach(df):
    """Per-group sorted values, space-joined via pandas' native str accessor."""
    grouped = df.groupby("A")["B"]
    return grouped.apply(sorted).str.join(" ")
def sort_first_approach(df):
    """Sort the whole frame by "B" once, then join each group in one pass.

    Sorting up front means every group's values are already in order, so the
    per-group reduction is a plain " ".join.  Works on a sorted copy instead
    of mutating the caller's DataFrame: the original used inplace=True, which
    silently reordered the shared benchmark input for every approach run
    after this one.
    """
    ordered = df.sort_values(by=["B"])
    return ordered.groupby("A")["B"].apply(" ".join)
approaches = [
    original_poster_approach,
    pandas_str_accessor_approach,
    sort_first_approach,
]

sizes = [
    1000,
    3000,
    5000,
    10000,
    30000,
    100_000,
    300_000,
    500_000,
    1_000_000,
    3_000_000,
    # 5_000_000 and 10_000_000 skipped
]

run_performance_comparison(
    approaches,
    sizes,
    setup=setup,
    title="Performance Comparison",
    number_of_repetitions=1,
)
Profiling code used:
import timeit
import matplotlib.pyplot as plt
from typing import List, Dict, Callable
from contextlib import contextmanager
@contextmanager
def data_provider(data_size, setup=lambda N: N, teardown=lambda: None):
    """Context manager yielding setup(data_size), guaranteeing teardown().

    The original version skipped teardown() whenever the with-body raised
    (e.g. a failing benchmark); try/finally ensures cleanup runs either way.
    """
    data = setup(data_size)
    try:
        yield data
    finally:
        teardown()
def run_performance_comparison(approaches: List[Callable],
                               data_size: List[int],
                               setup=lambda N: N,
                               teardown=lambda: None,
                               number_of_repetitions=5, title='Performance Comparison', data_name='N'):
    """Time each approach at each input size and plot the results log-log."""
    approach_times: Dict[Callable, List[float]] = {fn: [] for fn in approaches}

    for size in data_size:
        # One shared dataset per size so every approach sees identical input.
        with data_provider(size, setup, teardown) as data:
            for fn in approaches:
                elapsed = timeit.timeit(lambda: fn(*data), number=number_of_repetitions)
                approach_times[fn].append(elapsed)

    # Dicts preserve insertion order, so curves plot in the approaches' order.
    for fn, timings in approach_times.items():
        plt.plot(data_size, timings, label=fn.__name__)
    plt.yscale('log')
    plt.xscale('log')
    plt.xlabel(data_name)
    plt.ylabel('Execution Time (seconds)')
    plt.title(title)
    plt.legend()
    plt.show()