Search code examples
Tags: python, pandas, dataframe, numpy, performance

Most efficient way to combine information from each row of groups in a pandas groupby


Consider the following code. The purpose of this code is to go through a groupby, and combine information from each of the rows in the groups and save it into a pd.Series.

import numpy as np
import pandas as pd  # NOTE(review): pd was used below but never imported — added.

df = pd.DataFrame({"A": ["a", "a", "b", "b", "a"], "B": ["x", "y", "z", "w", "t"]})
groups = df.groupby("A")
# Per group: append a space to every value of "B", sort, and concatenate
# (the trailing space of the last element is kept).
strings = groups["B"].apply(lambda x: "".join(sorted(x.values + " ")))
# Per group: count distinct values of "B" and append the literal "hello".
strings2 = groups["B"].apply(lambda x: str(len(np.unique(x))) + "hello")
print(strings)
print(strings2)

Outputs

A
a    t x y 
b      w z 
Name: B, dtype: object
A
a    3hello
b    2hello
Name: B, dtype: object

For example, the first apply statement in the code sample above is to concatenate the rows of column "B" into one string, with spaces added.

I would like to make this code as fast as possible and to be able to scale it to the setting of multiple millions of rows.

This is the most efficient way I've been able to find so far to do this task, but I think part of the problem is that .apply is not vectorised in pandas, so applying this to each group is a very slow process (typically, there are more than 300,000 groups). Each group only has size 2 or 3.

Alternatively I could try skipping the groupby and doing this directly on the original DataFrame. A solution that works this way would also be appreciated.

In summary, I would like a way to rewrite the above code that achieves the same thing, but as fast and space efficient as possible (scalable to extremely large datasets).


Solution

  • The fastest solution for strings2 I found was:

    def cast_multiple_apply_approach(dfg):
        """Count distinct "B" values per group and tag the count with 'hello'."""
        per_group_uniques = dfg["B"].apply(pd.unique)
        return per_group_uniques.map(len).map(repr) + "hello"
    

    For strings2 the OP's method is quite slow:

    import numpy as np
    import pandas as pd
    
    _rng = np.random.default_rng(123)
    
    def setup(N):
        x = ["a", "a", "b", "b", "a"] * N
        _rng.shuffle(x)
        y = ["x", "y", "z", "w", "t"] * N
        _rng.shuffle(y)
        df = pd.DataFrame({"A": x, "B": y})
    
        groups = df.groupby("A")
        return [groups]
    
    
    def original_poster_approach(dfg):
        """The question's baseline: np.unique inside a per-group Python callable."""
        def label(values):
            return str(len(np.unique(values))) + "hello"
        return dfg["B"].apply(label)
    
    
    def bracula_approach(dfg):
        """Variant from another answer: pd.unique with an f-string label."""
        def label(values):
            return f"{len(pd.unique(values))}hello"
        return dfg["B"].apply(label)
    
    
    def len_multiple_apply_approach(dfg):
        """Count uniques per group, then build the label with vectorised string ops."""
        counts = dfg["B"].apply(pd.unique).map(len)
        return counts.astype("string") + "hello"
    
    
    def npsize_multiple_apply_approach(dfg):
        """Deduplicate with pd.unique, then size the unique arrays via np.size."""
        uniques = dfg["B"].apply(pd.unique)
        return uniques.map(np.size).astype("string") + "hello"
    
    
    def set_multiple_apply_approach(dfg):
        """Deduplicate each group with a Python set instead of pd.unique."""
        unique_sets = dfg["B"].apply(set)
        return unique_sets.map(len).astype("string") + "hello"
    
    
    def cast_multiple_apply_approach(dfg):
        """Fastest variant found: repr() casts each int count straight to str."""
        counts = dfg["B"].apply(pd.unique).map(len)
        return counts.map(repr) + "hello"
    
    
    approaches = [
        original_poster_approach,
        bracula_approach,
        len_multiple_apply_approach,
        npsize_multiple_apply_approach,
        set_multiple_apply_approach,
        cast_multiple_apply_approach,
    ]
    # Sanity-check every candidate against the baseline before timing anything.
    baseline = approaches[0]
    for candidate in approaches[1:]:
        data = setup(100)
        assert (candidate(*data) == baseline(*data)).all()
    
    # Larger sizes (commented out) were only run once the slow baseline was dropped.
    sizes = [1000, 3000, 5000, 10000, 30000, 100_000]  # ,300_000,500_000,1_000_000,3_000_000,5_000_000,10_000_000
    run_performance_comparison(
        approaches,
        sizes,
        setup=setup,
        title="Performance Comparison",
        number_of_repetitions=1,
    )
    

    (Performance comparison plot for the strings2 approaches, including the slow baseline.)

    Therefore I excluded it from running on a large sample:

    (Performance comparison plot at larger sizes, with the slow baseline excluded.)

    For strings, I was surprised to find that:

    • OP's method is actually really hard to beat
    • Sorting beforehand is terrible

    I could improve a little on OP's solution by using native pd.Series.str.join over the joining in pure Python:

    def pandas_str_accessor_approach(df):
        """Sort each group in Python, then join with the vectorised .str accessor."""
        sorted_lists = df.groupby("A")["B"].apply(sorted)
        return sorted_lists.str.join(" ")
    
    def setup(N):
        """Return [df] with 5*N shuffled rows; here the approaches group it themselves."""
        labels = ["a", "a", "b", "b", "a"] * N
        _rng.shuffle(labels)
        values = ["x", "y", "z", "w", "t"] * N
        _rng.shuffle(values)
        return [pd.DataFrame({"A": labels, "B": values})]
    
    
    def original_poster_approach(df):
        """Question's baseline: append a space to each value, sort, concatenate."""
        def join_group(values):
            return "".join(sorted(values.values + " "))
        return df.groupby("A")["B"].apply(join_group)
    
    
    def pandas_str_accessor_approach(df):
        """Per-group sorted values joined by a space via pandas' .str accessor."""
        grouped = df.groupby("A")["B"]
        return grouped.apply(sorted).str.join(" ")
    
    
    def sort_first_approach(df):
        """Sort once by "B"; each group then concatenates in already-sorted order.
    
        Fix: the original called sort_values(inplace=True), mutating the shared
        benchmark DataFrame and so corrupting the input seen by any approach
        timed after this one at the same size. Sorting a copy keeps the
        caller's frame untouched while returning the same Series.
        """
        ordered = df.sort_values(by=["B"])
        strings = ordered.groupby("A")["B"].apply(" ".join)
        return strings
    
    
    approaches = [
        original_poster_approach,
        pandas_str_accessor_approach,
        sort_first_approach,
    ]
    
    # Sizes up to 3M rows; the two largest were left commented out.
    sizes = [
        1000,
        3000,
        5000,
        10000,
        30000,
        100_000,
        300_000,
        500_000,
        1_000_000,
        3_000_000,
    ]  # 5_000_000,10_000_000
    run_performance_comparison(
        approaches,
        sizes,
        setup=setup,
        title="Performance Comparison",
        number_of_repetitions=1,
    )
    

    (Performance comparison plot for the strings approaches.)

    (Performance comparison plot for the strings approaches at larger sizes.)

    Profiling code used:

    import timeit
    import matplotlib.pyplot as plt
    from typing import List, Dict, Callable
    
    from contextlib import contextmanager
    
    
    @contextmanager
    def data_provider(data_size, setup=lambda N: N, teardown=lambda: None):
        """Context manager that builds benchmark data and guarantees teardown.
    
        Fix: the original did not wrap the yield in try/finally, so teardown()
        was silently skipped whenever the managed block raised.
        """
        data = setup(data_size)
        try:
            yield data
        finally:
            teardown()
    
    
    def run_performance_comparison(approaches: List[Callable],
                                   data_size: List[int],
                                   setup=lambda N: N,
                                   teardown=lambda: None,
                                   number_of_repetitions=5,
                                   title='Performance Comparison',
                                   data_name='N'):
        """Time every approach at each problem size and plot a log-log comparison.
    
        Each approach is invoked as approach(*data), where data comes from
        setup(N); timings are total seconds over number_of_repetitions runs.
        """
        timings: Dict[Callable, List[float]] = {fn: [] for fn in approaches}
    
        for N in data_size:
            # One dataset per size, shared by all approaches for a fair comparison.
            with data_provider(N, setup, teardown) as data:
                for fn in approaches:
                    elapsed = timeit.timeit(lambda: fn(*data), number=number_of_repetitions)
                    timings[fn].append(elapsed)
    
        for fn in approaches:
            plt.plot(data_size, timings[fn], label=fn.__name__)
        plt.xscale('log')
        plt.yscale('log')
        plt.xlabel(data_name)
        plt.ylabel('Execution Time (seconds)')
        plt.title(title)
        plt.legend()
        plt.show()