Search code examples
pythonpandasnumpyperformance

How to explain pandas higher performances compared to numpy with 500k+ rows?


In some sources, I found that pandas works faster than numpy with 500k rows or more. Can someone explain this to me?

Pandas have a better performance when the number of rows is 500K or more.

Difference between Pandas VS NumPy - GeeksforGeeks

If the number of rows of the dataset is more than five hundred thousand (500K), then the performance of Pandas is better than NumPy.

Pandas Vs NumPy: What’s The Difference? [2023] - InterviewBit

[...] Pandas generally performs better than numpy for 500K rows or more [...]

PandaPy - firmai on GitHub

I tried to find where this fact came from. I couldn't figure it out and couldn't see any information from the documentation.


Solution

  • Adding to the discussion, here are those tests in the linked page reproduced with some minor changes to see if anything has changed since that original post was made almost 8 years ago and python and many of its libraries have upgraded quite a bit since then. According to python.org the newest version of python available at the time of his post was 3.6 .

    Here is the source code, copied from the linked page and updated to be runnable as posted here, plus a few minor changes for convenience.

    import pandas
    import matplotlib.pyplot as plt
    import seaborn
    import numpy
    
    import sys
    import time
    
    NUMBER_OF_ITERATIONS = 10
    FIGURE_NUMBER = 0
    
    def bench_sub(mode1_inputs: list, mode1_statement: str, mode2_inputs: list, mode2_statement: str) -> tuple[bool, list[float], list[float]]:
    
        mode1_results = []
        mode1_times = []
        mode2_results = []
        mode2_times = []
    
        for inputs, statementi, results, times in (
                (mode1_inputs, mode1_statement, mode1_results, mode1_times),
                (mode2_inputs, mode2_statement, mode2_results, mode2_times)
        ):
            for inputi in inputs:
                ast = compile(statementi, '<string>', 'exec')
                ast_locals = {'data': inputi}
                start_time = time.perf_counter_ns()
                for _ in range(NUMBER_OF_ITERATIONS):
                    exec(ast, locals=ast_locals)
                end_time = time.perf_counter_ns()
    
                results.append(ast_locals['res'])
                times.append((end_time - start_time) / 10 ** 9 / NUMBER_OF_ITERATIONS)
    
        passing = True
        for results1, results2 in zip(mode1_results, mode2_results):
            if not passing:
                break
            try:
                if type(results1) in [pandas.Series, numpy.ndarray] and type(results2) in [pandas.Series, numpy.ndarray]:
                    if type(results1[0]) is str:
                        isclose = set(results1) == set(results2)
                    else:
                        isclose = numpy.isclose(results1, results2).all()
                else:
                    isclose = numpy.isclose(results1, results2)
                if not isclose:
                    passing = False
                    break
            except (ValueError, TypeError):
                print(type(results1))
                print(results1)
                print(type(results2))
                print(results2)
                raise
        return passing, mode1_times, mode2_times
    
    def bench_sub_plot(mode1_inputs: list, mode1_statement: str, mode2_inputs: list, mode2_statement: str, title: str, label1: str, label2: str, save_fig: bool = True) -> tuple[bool, list[float], list[float]]:
        passing, mode1_times, mode2_times = bench_sub(mode1_inputs, mode1_statement, mode2_inputs, mode2_statement)
    
        fig, ax = plt.subplots(2, dpi=100, figsize=(8, 6))
        mode1_x = [len(x) for x in mode1_inputs]
        mode2_x = [len(x) for x in mode2_inputs]
    
        ax[0].plot(mode1_x, mode1_times, marker='o', markerfacecolor='none', label=label1)
        ax[0].plot(mode2_x, mode2_times, marker='^', markerfacecolor='none', label=label2)
        ax[0].set_xscale('log')
        ax[0].set_yscale('log')
        ax[0].legend()
        ax[0].set_title(title + f' : {"PASS" if passing else "FAIL"}')
        ax[0].set_xlabel('Number of records')
        ax[0].set_ylabel('Time [s]')
    
        if mode1_x == mode2_x:
            mode_comp = [x / y for x, y in zip(mode1_times, mode2_times)]
            ax[1].plot(mode1_x, mode_comp, marker='o', markerfacecolor='none', label=f'{label1} / {label2}')
            ax[1].plot([min(mode1_x), max(mode1_x)], [1.0, 1.0], linestyle='dashed', color='#AAAAAA', label='parity')
            ax[1].set_xscale('log')
            ax[1].legend()
            ax[1].set_title(title + f' (ratio)\nValues <1 indicate {label1} is faster than {label2}')
            ax[1].set_xlabel('Number of records')
            ax[1].set_ylabel(f'{label1} / {label2}')
        plt.tight_layout()
        # plt.show()
    
        if save_fig:
            global FIGURE_NUMBER
            # https://stackoverflow.com/a/295152
            clean_title = ''.join([x for x in title if (x.isalnum() or x in '_-. ')])
            fig.savefig(f'outputs/{FIGURE_NUMBER:06}_{clean_title}.png')
            FIGURE_NUMBER += 1
    
        return passing, mode1_times, mode2_times
    
    def _print_result_comparison(success: bool, times1: list[float], times2: list[float], input_lengths: list[int], title: str, label1: str, label2: str):
        print(title)
        print(f'  Test result: {"PASS" if success else "FAIL"}')
        field_width = 15
        print(f'{"# of records":>{field_width}} {label1 + " [ms]":>{field_width}} {label2 + " [ms]":>{field_width}} {"ratio":>{field_width}}')
        for input_length, time1, time2 in zip(input_lengths, times1, times2):
            print(f'{input_length:>{field_width}} {time1 * 1000:>{field_width}.03f} {time2 * 1000:>{field_width}.03f} {time1 / time2:>{field_width}.03f}')
        print()
    
    def bench_sub_plot_print(mode1_inputs: list, mode1_statement: str, mode2_inputs: list, mode2_statement: str, title: str, label1: str, label2: str, all_lengths: list[int], save_fig: bool = True) -> tuple[bool, list[float], list[float]]:
        success, times1, times2 = bench_sub_plot(
            mode1_inputs,
            mode1_statement,
            mode2_inputs,
            mode2_statement,
            title,
            label1,
            label2,
            True
        )
        _print_result_comparison(success, times1, times2, all_lengths, title, label1, label2)
        return success, times1, times2
    
    
    def _main():
    
        start_time = time.perf_counter_ns()
    
        # In [2]:
        iris = seaborn.load_dataset('iris')
    
    
        # In [3]:
        data_pandas: list[pandas.DataFrame] = []
        data_numpy: list[numpy.rec.recarray] = []
        all_lengths = [10_000, 100_000, 500_000, 1_000_000, 5_000_000, 10_000_000, 15_000_000]
        # all_lengths = [10_000, 100_000, 500_000] #, 1_000_000, 5_000_000, 10_000_000, 15_000_000]
        for total_len in all_lengths:
            data_pandas_i = pandas.concat([iris] * (total_len // len(iris)))
            data_pandas_i = pandas.concat([data_pandas_i, iris[:total_len - len(data_pandas_i)]])
            data_pandas.append(data_pandas_i)
            data_numpy.append(data_pandas_i.to_records())
    
        # In [4]:
        print('Input sizes [count]:')
        print(f'{"#":>4} {"pandas":>9} {"numpy":>9}')
        for i, (data_pandas_i, data_numpy_i) in enumerate(zip(data_pandas, data_numpy)):
            print(f'{i:>4} {len(data_pandas_i):>9} {len(data_numpy_i):>9}')
        print()
    
        # In [5]:
        mb_size_in_bytes = 1024 * 1024
        print('Data sizes [MB]:')
        print(f'{"#":>4} {"pandas":>9} {"numpy":>9}')
        for i, (data_pandas_i, data_numpy_i) in enumerate(zip(data_pandas, data_numpy)):
            print(f'{i:>4} {int(sys.getsizeof(data_pandas_i) / mb_size_in_bytes):>9} {int(sys.getsizeof(data_numpy_i) / mb_size_in_bytes):>9}')
        print()
    
        # In [6]:
        print(data_pandas[0].head())
        print()
    
        # In [7]:
        # ...
    
        # In [8]:
        success, times_pandas, times_numpy = bench_sub_plot_print(
            data_pandas,
            'res = data.loc[:, "sepal_length"].mean()',
            data_numpy,
            'res = numpy.mean(data.sepal_length)',
            'Mean on Unfiltered Column',
            'pandas',
            'numpy',
            all_lengths,
            True
        )
    
        # In [9]:
        success, times_pandas, times_numpy = bench_sub_plot_print(
            data_pandas,
            'res = numpy.log(data.loc[:, "sepal_length"])',
            data_numpy,
            'res = numpy.log(data.sepal_length)',
            'Vectorised log on Unfiltered Column',
            'pandas',
            'numpy',
            all_lengths,
            True
        )
    
        # In [10]:
        success, times_pandas, times_numpy = bench_sub_plot_print(
            data_pandas,
            'res = data.loc[:, "species"].unique()',
            data_numpy,
            'res = numpy.unique(data.species)',
            'Unique on Unfiltered String Column',
            'pandas',
            'numpy',
            all_lengths,
            True
        )
    
        # In [11]:
        success, times_pandas, times_numpy = bench_sub_plot_print(
            data_pandas,
            'res = data.loc[(data.sepal_width > 3) & (data.petal_length < 1.5), "sepal_length"].mean()',
            data_numpy,
            'res = numpy.mean(data[(data.sepal_width > 3) & (data.petal_length < 1.5)].sepal_length)',
            'Mean on Filtered Column',
            'pandas',
            'numpy',
            all_lengths,
            True
        )
    
        # In [12]:
        success, times_pandas, times_numpy = bench_sub_plot_print(
            data_pandas,
            'res = numpy.log(data.loc[(data.sepal_width > 3) & (data.petal_length < 1.5), "sepal_length"])',
            data_numpy,
            'res = numpy.log(data[(data.sepal_width > 3) & (data.petal_length < 1.5)].sepal_length)',
            'Vectorised log on Filtered Column',
            'pandas',
            'numpy',
            all_lengths,
            True
        )
    
        # In [13]:
        success, times_pandas, times_numpy = bench_sub_plot_print(
            data_pandas,
            'res = data[data.species == "setosa"].sepal_length.mean()',
            data_numpy,
            'res = numpy.mean(data[data.species == "setosa"].sepal_length)',
            'Mean on (String) Filtered Column',
            'pandas',
            'numpy',
            all_lengths,
            True
        )
    
        # In [14]:
        success, times_pandas, times_numpy = bench_sub_plot_print(
            data_pandas,
            'res = data.petal_length * data.sepal_length + data.petal_width * data.sepal_width',
            data_numpy,
            'res = data.petal_length * data.sepal_length + data.petal_width * data.sepal_width',
            'Vectorized Math on Unfiltered Column',
            'pandas',
            'numpy',
            all_lengths,
            True
        )
    
        # In [16]:
        success, times_pandas, times_numpy = bench_sub_plot_print(
            data_pandas,
            'res = data.loc[data.sepal_width * data.petal_length > data.sepal_length, "sepal_length"].mean()',
            data_numpy,
            'res = numpy.mean(data[data.sepal_width * data.petal_length > data.sepal_length].sepal_length)',
            'Vectorized Math in Filtering Column',
            'pandas',
            'numpy',
            all_lengths,
            True
        )
    
        end_time = time.perf_counter_ns()
        print(f'Total run time: {(end_time - start_time) / 10 ** 9:.3f} s')
    
    if __name__ == '__main__':
        _main()
    

    Here is the console output it generates:

    Input sizes [count]:
       #    pandas     numpy
       0     10000     10000
       1    100000    100000
       2    500000    500000
       3   1000000   1000000
       4   5000000   5000000
       5  10000000  10000000
       6  15000000  15000000
    
    Data sizes [MB]:
       #    pandas     numpy
       0         0         0
       1         9         4
       2        46        22
       3        92        45
       4       464       228
       5       928       457
       6      1392       686
    
       sepal_length  sepal_width  petal_length  petal_width species
    0           5.1          3.5           1.4          0.2  setosa
    1           4.9          3.0           1.4          0.2  setosa
    2           4.7          3.2           1.3          0.2  setosa
    3           4.6          3.1           1.5          0.2  setosa
    4           5.0          3.6           1.4          0.2  setosa
    
    Mean on Unfiltered Column
      Test result: PASS
       # of records     pandas [ms]      numpy [ms]           ratio
              10000           0.061           0.033           1.855
             100000           0.160           0.148           1.081
             500000           0.653           1.074           0.608
            1000000           1.512           2.440           0.620
            5000000          11.633          12.558           0.926
           10000000          23.954          25.360           0.945
           15000000          35.362          40.108           0.882
    
    Vectorised log on Unfiltered Column
      Test result: PASS
       # of records     pandas [ms]      numpy [ms]           ratio
              10000           0.124           0.056           2.190
             100000           0.507           0.493           1.029
             500000           3.399           3.441           0.988
            1000000           5.396           6.867           0.786
            5000000          27.187          38.121           0.713
           10000000          55.497          72.609           0.764
           15000000          88.406         112.199           0.788
    
    Unique on Unfiltered String Column
      Test result: PASS
       # of records     pandas [ms]      numpy [ms]           ratio
              10000           0.332           1.742           0.191
             100000           2.885          21.833           0.132
             500000          14.769         125.961           0.117
            1000000          29.687         264.521           0.112
            5000000         147.359        1501.378           0.098
           10000000         295.118        3132.478           0.094
           15000000         444.365        4882.316           0.091
    
    Mean on Filtered Column
      Test result: PASS
       # of records     pandas [ms]      numpy [ms]           ratio
              10000           0.355           0.130           2.719
             100000           0.522           0.672           0.777
             500000           1.797           4.824           0.372
            1000000           4.602          10.827           0.425
            5000000          22.116          57.945           0.382
           10000000          43.076         116.028           0.371
           15000000          68.893         177.658           0.388
    
    Vectorised log on Filtered Column
      Test result: PASS
       # of records     pandas [ms]      numpy [ms]           ratio
              10000           0.361           0.128           2.821
             100000           0.576           0.758           0.760
             500000           2.066           5.199           0.397
            1000000           5.259          11.523           0.456
            5000000          22.785          59.581           0.382
           10000000          47.527         121.882           0.390
           15000000          75.080         187.954           0.399
    
    Mean on (String) Filtered Column
      Test result: PASS
       # of records     pandas [ms]      numpy [ms]           ratio
              10000           0.636           0.192           3.304
             100000           4.068           1.743           2.334
             500000          20.954           9.306           2.252
            1000000          41.938          18.522           2.264
            5000000         217.254          97.929           2.218
           10000000         434.242         197.289           2.201
           15000000         657.205         297.919           2.206
    
    Vectorized Math on Unfiltered Column
      Test result: PASS
       # of records     pandas [ms]      numpy [ms]           ratio
              10000           0.168           0.049           3.415
             100000           0.385           0.338           1.140
             500000           3.193           5.018           0.636
            1000000           6.028           9.539           0.632
            5000000          32.640          48.235           0.677
           10000000          69.748          99.893           0.698
           15000000         107.528         159.040           0.676
    
    Vectorized Math in Filtering Column
      Test result: PASS
       # of records     pandas [ms]      numpy [ms]           ratio
              10000           0.350           0.234           1.500
             100000           0.926           2.494           0.371
             500000           6.093          15.007           0.406
            1000000          12.641          30.021           0.421
            5000000          71.714         163.060           0.440
           10000000         145.373         326.206           0.446
           15000000         227.817         490.991           0.464
    
    Total run time: 183.198 s
    

    And here are the plots it generated:

    mean on unfiltered column


    vectorised log on unfiltered column


    unique on unfiltered string column


    mean on filtered column


    vectorised log on filtered column


    mean on string filtered column


    vectorised math on unfiltered column


    vectorised math in filtering column


    These results were generated with Windows 10, Python 3.13, on i9-10900K, and never got close to running out of memory so swap should not be a factor.