Search code examples
Tags: python, pandas, dataframe, numpy, performance

Most efficient way to combine information from each row of groups in a pandas groupby


Consider the following code. The purpose of this code is to go through a groupby, and combine information from each of the rows in the groups and save it into a pd.Series.

import numpy as np
import pandas as pd  # NOTE(review): pd was used below but never imported — added.

df = pd.DataFrame({"A": ["a", "a", "b", "b", "a"], "B": ["x", "y", "z", "w", "t"]})
groups = df.groupby("A")
# Per group: append a space to every value of "B", sort, and concatenate
# (the trailing space of the last element is kept).
strings = groups["B"].apply(lambda x: "".join(sorted(x.values + " ")))
# Per group: count distinct values of "B" and append the literal "hello".
strings2 = groups["B"].apply(lambda x: str(len(np.unique(x))) + "hello")
print(strings)
print(strings2)

Outputs

A
a    t x y 
b      w z 
Name: B, dtype: object
A
a    3hello
b    2hello
Name: B, dtype: object

For example, the first apply statement in the code sample above is to concatenate the rows of column "B" into one string, with spaces added.

I would like to make this code as fast as possible and to be able to scale it to the setting of multiple millions of rows.

This is the most efficient way I've been able to find so far to do this task, but I think part of the problem is that .apply is not vectorised in pandas, so applying this to each group is a very slow process (typically, there are more than 300,000 groups). Each group only has size 2 or 3.

Alternatively I could try skipping the groupby and doing this directly on the original DataFrame. A solution that works this way would also be appreciated.

In summary, I would like a way to rewrite the above code that achieves the same thing, but as fast and space efficient as possible (scalable to extremely large datasets).


Solution

  • The fastest solution for strings2 I found was:

    def cast_multiple_apply_approach(dfg):
        """Count distinct "B" values per group and tag the count with 'hello'."""
        per_group_uniques = dfg["B"].apply(pd.unique)
        return per_group_uniques.map(len).map(repr) + "hello"
    

    For strings2 the OP's method is quite slow:

    import numpy as np
    import pandas as pd
    
    _rng = np.random.default_rng(123)
    
    def setup(N):
        x = ["a", "a", "b", "b", "a"] * N
        _rng.shuffle(x)
        y = ["x", "y", "z", "w", "t"] * N
        _rng.shuffle(y)
        df = pd.DataFrame({"A": x, "B": y})
    
        groups = df.groupby("A")
        return [groups]
    
    
    def original_poster_approach(dfg):
        """The question's baseline: np.unique inside a per-group Python callable."""
        def label(values):
            return str(len(np.unique(values))) + "hello"
        return dfg["B"].apply(label)
    
    
    def bracula_approach(dfg):
        """Variant from another answer: pd.unique with an f-string label."""
        def label(values):
            return f"{len(pd.unique(values))}hello"
        return dfg["B"].apply(label)
    
    
    def len_multiple_apply_approach(dfg):
        """Count uniques per group, then build the label with vectorised string ops."""
        counts = dfg["B"].apply(pd.unique).map(len)
        return counts.astype("string") + "hello"
    
    
    def npsize_multiple_apply_approach(dfg):
        """Deduplicate with pd.unique, then size the unique arrays via np.size."""
        uniques = dfg["B"].apply(pd.unique)
        return uniques.map(np.size).astype("string") + "hello"
    
    
    def set_multiple_apply_approach(dfg):
        """Deduplicate each group with a Python set instead of pd.unique."""
        unique_sets = dfg["B"].apply(set)
        return unique_sets.map(len).astype("string") + "hello"
    
    
    def cast_multiple_apply_approach(dfg):
        """Fastest variant found: repr() casts each int count straight to str."""
        counts = dfg["B"].apply(pd.unique).map(len)
        return counts.map(repr) + "hello"
    
    
    approaches = [
        original_poster_approach,
        bracula_approach,
        len_multiple_apply_approach,
        npsize_multiple_apply_approach,
        set_multiple_apply_approach,
        cast_multiple_apply_approach,
    ]
    # Sanity-check every candidate against the baseline before timing anything.
    baseline = approaches[0]
    for candidate in approaches[1:]:
        data = setup(100)
        assert (candidate(*data) == baseline(*data)).all()
    
    # Larger sizes (commented out) were only run once the slow baseline was dropped.
    sizes = [1000, 3000, 5000, 10000, 30000, 100_000]  # ,300_000,500_000,1_000_000,3_000_000,5_000_000,10_000_000
    run_performance_comparison(
        approaches,
        sizes,
        setup=setup,
        title="Performance Comparison",
        number_of_repetitions=1,
    )
    

    (Performance comparison plot for the strings2 approaches, including the slow baseline.)

    Therefore I excluded it from running on a large sample:

    (Performance comparison plot at larger sizes, with the slow baseline excluded.)

    For strings, I was surprised to find that:

    • OP's method is actually really hard to beat
    • Sorting beforehand is terrible

    I could improve a little on OP's solution by using native pd.Series.str.join over the joining in pure Python:

    def pandas_str_accessor_approach(df):
        """Sort each group in Python, then join with the vectorised .str accessor."""
        sorted_lists = df.groupby("A")["B"].apply(sorted)
        return sorted_lists.str.join(" ")
    
    def setup(N):
        """Return [df] with 5*N shuffled rows; here the approaches group it themselves."""
        labels = ["a", "a", "b", "b", "a"] * N
        _rng.shuffle(labels)
        values = ["x", "y", "z", "w", "t"] * N
        _rng.shuffle(values)
        return [pd.DataFrame({"A": labels, "B": values})]
    
    
    def original_poster_approach(df):
        """Question's baseline: append a space to each value, sort, concatenate."""
        def join_group(values):
            return "".join(sorted(values.values + " "))
        return df.groupby("A")["B"].apply(join_group)
    
    
    def pandas_str_accessor_approach(df):
        """Per-group sorted values joined by a space via pandas' .str accessor."""
        grouped = df.groupby("A")["B"]
        return grouped.apply(sorted).str.join(" ")
    
    
    def sort_first_approach(df):
        """Sort once by "B"; each group then concatenates in already-sorted order.
    
        Fix: the original called sort_values(inplace=True), mutating the shared
        benchmark DataFrame and so corrupting the input seen by any approach
        timed after this one at the same size. Sorting a copy keeps the
        caller's frame untouched while returning the same Series.
        """
        ordered = df.sort_values(by=["B"])
        strings = ordered.groupby("A")["B"].apply(" ".join)
        return strings
    
    
    approaches = [
        original_poster_approach,
        pandas_str_accessor_approach,
        sort_first_approach,
    ]
    
    # Sizes up to 3M rows; the two largest were left commented out.
    sizes = [
        1000,
        3000,
        5000,
        10000,
        30000,
        100_000,
        300_000,
        500_000,
        1_000_000,
        3_000_000,
    ]  # 5_000_000,10_000_000
    run_performance_comparison(
        approaches,
        sizes,
        setup=setup,
        title="Performance Comparison",
        number_of_repetitions=1,
    )
    

    (Performance comparison plot for the strings approaches.)

    (Performance comparison plot for the strings approaches at larger sizes.)

    Profiling code used:

    import timeit
    import matplotlib.pyplot as plt
    from typing import List, Dict, Callable
    
    from contextlib import contextmanager
    
    
    @contextmanager
    def data_provider(data_size, setup=lambda N: N, teardown=lambda: None):
        """Context manager that builds benchmark data and guarantees teardown.
    
        Fix: the original did not wrap the yield in try/finally, so teardown()
        was silently skipped whenever the managed block raised.
        """
        data = setup(data_size)
        try:
            yield data
        finally:
            teardown()
    
    
    def run_performance_comparison(approaches: List[Callable],
                                   data_size: List[int],
                                   setup=lambda N: N,
                                   teardown=lambda: None,
                                   number_of_repetitions=5,
                                   title='Performance Comparison',
                                   data_name='N'):
        """Time every approach at each problem size and plot a log-log comparison.
    
        Each approach is invoked as approach(*data), where data comes from
        setup(N); timings are total seconds over number_of_repetitions runs.
        """
        timings: Dict[Callable, List[float]] = {fn: [] for fn in approaches}
    
        for N in data_size:
            # One dataset per size, shared by all approaches for a fair comparison.
            with data_provider(N, setup, teardown) as data:
                for fn in approaches:
                    elapsed = timeit.timeit(lambda: fn(*data), number=number_of_repetitions)
                    timings[fn].append(elapsed)
    
        for fn in approaches:
            plt.plot(data_size, timings[fn], label=fn.__name__)
        plt.xscale('log')
        plt.yscale('log')
        plt.xlabel(data_name)
        plt.ylabel('Execution Time (seconds)')
        plt.title(title)
        plt.legend()
        plt.show()