I have a pandas dataframe like below:
df = pd.DataFrame({"A": [1, 1, 1, 2, 2],
"B": ["apple", "apple", "banana", "pineapple", "pineapple"],
"C": [[6, 5, 2], [2, 10, 2], [5, 37, 1], [4, 19, 2], [1, 5, 1]]})
I want to perform a lambda function on the list column C
that computes the average of the lists. I know I can do that in one of the following ways:
1:
df = df.groupby(['A', 'B'])['C'].apply(lambda s: np.array(list(s)).mean(axis=0)).reset_index()
2:
s = df.set_index(['A', 'B'])
out = pd.DataFrame(list(s['C']), s.index).mean(level=[0, 1])
out.drop(out.columns.tolist(), 1).assign(C=out.values.tolist()).reset_index()
and in addition to that add a column with the size of each group. I know I could add such a size column with the following code:
df = df[['A', 'B']].groupby(['A', 'B']).size()
But can't find out how to do both. The resulting DataFrame would look like this:
A B C count
1 apple [4, 7.5, 2] 2
1 banana [5, 37, 1] 1
2 pineapple [2.5, 12, 1.5] 2
How to solve this as least time-consuming as possible? My real dataframe is very huge, so all operations need to be as time efficient as possible
Option 1:
Groupby agg
:
import numpy as np
import pandas as pd
df = pd.DataFrame({
"A": [1, 1, 1, 2, 2],
"B": ["apple", "apple", "banana", "pineapple", "pineapple"],
"C": [[6, 5, 2], [2, 10, 2], [5, 37, 1], [4, 19, 2], [1, 5, 1]]
})
df = df.groupby(['A', 'B'])['C'].agg(
[lambda s: np.array(list(s)).mean(axis=0).tolist(), 'size']
).reset_index().rename(columns={'<lambda_0>': 'C', 'size': 'count'})
print(df)
df
:
A B C count
0 1 apple [4.0, 7.5, 2.0] 2
1 1 banana [5.0, 37.0, 1.0] 1
2 2 pineapple [2.5, 12.0, 1.5] 2
Option 2 pd.concat
:
g = df.groupby(['A', 'B'])['C']
df = pd.concat(
(
g.apply(lambda s: np.array(list(s)).mean(axis=0)),
g.size()
), axis=1,
keys=['C', 'count']
).reset_index()
df
:
A B C count
0 1 apple [4.0, 7.5, 2.0] 2
1 1 banana [5.0, 37.0, 1.0] 1
2 2 pineapple [2.5, 12.0, 1.5] 2
Some timing information via perfplot:
import numpy as np
import pandas as pd
import perfplot
np.random.seed(5)
def gen_data(n):
df = pd.DataFrame({"A": [1, 1, 1, 2, 2],
"B": ["apple", "apple", "banana", "pineapple",
"pineapple"],
"C": [[6, 5, 2], [2, 10, 2], [5, 37, 1], [4, 19, 2],
[1, 5, 1]]})
return pd.concat([df.assign(B=df['B'] + str(i)) for i in range(n)],
ignore_index=True)
def chain_assign(df):
return (df.groupby(['A', 'B'])['C']
.apply(lambda s: np.array(list(s)).mean(axis=0))
.reset_index()
.assign(count=df.groupby(['A', 'B'])['C'].size().values))
def to_frame_value_counts(df):
return (df.groupby(['A', 'B'])['C']
.apply(lambda s: np.array(list(s)).mean(axis=0))
.to_frame('C')
.assign(count=df[['A', 'B']].value_counts())
.reset_index())
def pd_concat(df):
g = df.groupby(['A', 'B'])['C']
return pd.concat(
(
g.apply(lambda s: np.array(list(s)).mean(axis=0)),
g.size()
), axis=1,
keys=['C', 'count']
).reset_index()
def groupby_agg(df):
return df.groupby(['A', 'B'])['C'].agg(
[lambda s: np.array(list(s)).mean(axis=0).tolist(), 'size']
).reset_index().rename(columns={'<lambda_0>': 'C', 'size': 'count'})
def agg_len_apply(df):
df = df.groupby(['A', 'B'], as_index=False).agg({'C': list})
df['count'] = df['C'].str.len()
df['C'] = df['C'].apply(lambda c: np.array(c).mean(axis=0))
return df
def agg_map(df):
return df.groupby('B')['C'].agg(
mean=(lambda x: x.map(np.array).mean().tolist()),
size=(lambda x: x.size))
if __name__ == '__main__':
out = perfplot.bench(
setup=gen_data,
kernels=[
chain_assign,
pd_concat,
groupby_agg,
agg_len_apply,
to_frame_value_counts,
agg_map
],
labels=[
'chain_assign (Anurag Dabas)',
'to_frame_value_counts (Anurag Dabas)',
'pd_concat (Henry Ecker)',
'groupby_agg (Henry Ecker)',
'agg_len_apply (SomeDude)',
'agg_map (rhug123)'
],
n_range=[2 ** k for k in range(20)],
equality_check=None
)
out.save('perfplot_results.png', transparent=False)