In some sources, I found that pandas works faster than numpy with 500k rows or more. Can someone explain this to me?
Pandas have a better performance when the number of rows is 500K or more.
— Difference between Pandas VS NumPy - GeeksforGeeks
If the number of rows of the dataset is more than five hundred thousand (500K), then the performance of Pandas is better than NumPy.
— Pandas Vs NumPy: What’s The Difference? [2023] - InterviewBit
[...] Pandas generally performs better than numpy for 500K rows or more [...]
— PandaPy - firmai on GitHub
I tried to find where this fact came from. I couldn't figure it out and couldn't see any information from the documentation.
Adding to the discussion, here are the tests from the linked page, reproduced with some minor changes, to see whether anything has changed since the original post was made almost 8 years ago — Python and many of its libraries have been upgraded considerably since then. According to python.org, the newest version of Python available at the time of that post was 3.6.
Here is the source code, copied from the linked page and updated to be runnable as posted here, plus a few minor changes for convenience.
import os
import sys
import time

import matplotlib.pyplot as plt
import numpy
import pandas
import seaborn
NUMBER_OF_ITERATIONS = 10
FIGURE_NUMBER = 0
def bench_sub(mode1_inputs: list, mode1_statement: str, mode2_inputs: list, mode2_statement: str) -> tuple[bool, list[float], list[float]]:
mode1_results = []
mode1_times = []
mode2_results = []
mode2_times = []
for inputs, statementi, results, times in (
(mode1_inputs, mode1_statement, mode1_results, mode1_times),
(mode2_inputs, mode2_statement, mode2_results, mode2_times)
):
for inputi in inputs:
ast = compile(statementi, '<string>', 'exec')
ast_locals = {'data': inputi}
start_time = time.perf_counter_ns()
for _ in range(NUMBER_OF_ITERATIONS):
exec(ast, locals=ast_locals)
end_time = time.perf_counter_ns()
results.append(ast_locals['res'])
times.append((end_time - start_time) / 10 ** 9 / NUMBER_OF_ITERATIONS)
passing = True
for results1, results2 in zip(mode1_results, mode2_results):
if not passing:
break
try:
if type(results1) in [pandas.Series, numpy.ndarray] and type(results2) in [pandas.Series, numpy.ndarray]:
if type(results1[0]) is str:
isclose = set(results1) == set(results2)
else:
isclose = numpy.isclose(results1, results2).all()
else:
isclose = numpy.isclose(results1, results2)
if not isclose:
passing = False
break
except (ValueError, TypeError):
print(type(results1))
print(results1)
print(type(results2))
print(results2)
raise
return passing, mode1_times, mode2_times
def bench_sub_plot(mode1_inputs: list, mode1_statement: str, mode2_inputs: list, mode2_statement: str, title: str, label1: str, label2: str, save_fig: bool = True) -> tuple[bool, list[float], list[float]]:
    """Run bench_sub and plot the two timing curves plus their ratio.

    Top axes: absolute time vs. record count (log-log) for both modes.
    Bottom axes: mode1/mode2 ratio (only when both modes share the same
    input sizes), with a dashed parity line at 1.0.

    Args:
        save_fig: when True, save the figure to ``outputs/`` with a
            sequential, filesystem-safe name.

    Returns:
        The (passing, mode1_times, mode2_times) tuple from bench_sub.
    """
    passing, mode1_times, mode2_times = bench_sub(mode1_inputs, mode1_statement, mode2_inputs, mode2_statement)
    fig, ax = plt.subplots(2, dpi=100, figsize=(8, 6))
    mode1_x = [len(x) for x in mode1_inputs]
    mode2_x = [len(x) for x in mode2_inputs]
    ax[0].plot(mode1_x, mode1_times, marker='o', markerfacecolor='none', label=label1)
    ax[0].plot(mode2_x, mode2_times, marker='^', markerfacecolor='none', label=label2)
    ax[0].set_xscale('log')
    ax[0].set_yscale('log')
    ax[0].legend()
    ax[0].set_title(title + f' : {"PASS" if passing else "FAIL"}')
    ax[0].set_xlabel('Number of records')
    ax[0].set_ylabel('Time [s]')
    if mode1_x == mode2_x:
        mode_comp = [x / y for x, y in zip(mode1_times, mode2_times)]
        ax[1].plot(mode1_x, mode_comp, marker='o', markerfacecolor='none', label=f'{label1} / {label2}')
        ax[1].plot([min(mode1_x), max(mode1_x)], [1.0, 1.0], linestyle='dashed', color='#AAAAAA', label='parity')
        ax[1].set_xscale('log')
        ax[1].legend()
        ax[1].set_title(title + f' (ratio)\nValues <1 indicate {label1} is faster than {label2}')
        ax[1].set_xlabel('Number of records')
        ax[1].set_ylabel(f'{label1} / {label2}')
    plt.tight_layout()
    if save_fig:
        global FIGURE_NUMBER
        # Keep only filesystem-safe characters in the file name.
        # https://stackoverflow.com/a/295152
        clean_title = ''.join([x for x in title if (x.isalnum() or x in '_-. ')])
        # Fix: savefig raises if the target directory does not exist.
        os.makedirs('outputs', exist_ok=True)
        fig.savefig(f'outputs/{FIGURE_NUMBER:06}_{clean_title}.png')
        FIGURE_NUMBER += 1
    # Fix: close the figure so repeated calls do not accumulate open figures
    # (nothing displays them later; plt.show() was never called).
    plt.close(fig)
    return passing, mode1_times, mode2_times
def _print_result_comparison(success: bool, times1: list[float], times2: list[float], input_lengths: list[int], title: str, label1: str, label2: str):
print(title)
print(f' Test result: {"PASS" if success else "FAIL"}')
field_width = 15
print(f'{"# of records":>{field_width}} {label1 + " [ms]":>{field_width}} {label2 + " [ms]":>{field_width}} {"ratio":>{field_width}}')
for input_length, time1, time2 in zip(input_lengths, times1, times2):
print(f'{input_length:>{field_width}} {time1 * 1000:>{field_width}.03f} {time2 * 1000:>{field_width}.03f} {time1 / time2:>{field_width}.03f}')
print()
def bench_sub_plot_print(mode1_inputs: list, mode1_statement: str, mode2_inputs: list, mode2_statement: str, title: str, label1: str, label2: str, all_lengths: list[int], save_fig: bool = True) -> tuple[bool, list[float], list[float]]:
    """Benchmark two statements, save/plot the curves, and print a table.

    Thin wrapper: bench_sub_plot for timing + figures, then
    _print_result_comparison for the console table.

    Returns:
        The (success, times1, times2) tuple from bench_sub_plot.
    """
    success, times1, times2 = bench_sub_plot(
        mode1_inputs,
        mode1_statement,
        mode2_inputs,
        mode2_statement,
        title,
        label1,
        label2,
        save_fig  # fix: was hard-coded to True, silently ignoring the parameter
    )
    _print_result_comparison(success, times1, times2, all_lengths, title, label1, label2)
    return success, times1, times2
def _main():
    """Drive the benchmark: build pandas and numpy copies of the iris dataset
    at several sizes, then time equivalent operations on each pair and report
    tables (stdout) and plots (outputs/ directory).
    """
    start_time = time.perf_counter_ns()
    # In [2]:
    # 150-row iris sample dataset (seaborn downloads and caches it).
    iris = seaborn.load_dataset('iris')
    # In [3]:
    data_pandas: list[pandas.DataFrame] = []
    data_numpy: list[numpy.rec.recarray] = []
    all_lengths = [10_000, 100_000, 500_000, 1_000_000, 5_000_000, 10_000_000, 15_000_000]
    # all_lengths = [10_000, 100_000, 500_000] #, 1_000_000, 5_000_000, 10_000_000, 15_000_000]
    for total_len in all_lengths:
        # Tile the iris frame, then append a partial slice so the frame has
        # exactly total_len rows.
        data_pandas_i = pandas.concat([iris] * (total_len // len(iris)))
        data_pandas_i = pandas.concat([data_pandas_i, iris[:total_len - len(data_pandas_i)]])
        data_pandas.append(data_pandas_i)
        # Record-array twin of the same data for the numpy side.
        data_numpy.append(data_pandas_i.to_records())
    # In [4]:
    print('Input sizes [count]:')
    print(f'{"#":>4} {"pandas":>9} {"numpy":>9}')
    for i, (data_pandas_i, data_numpy_i) in enumerate(zip(data_pandas, data_numpy)):
        print(f'{i:>4} {len(data_pandas_i):>9} {len(data_numpy_i):>9}')
    print()
    # In [5]:
    mb_size_in_bytes = 1024 * 1024
    # NOTE(review): sys.getsizeof relies on each container's __sizeof__ and is
    # shallow for referenced objects — treat these sizes as approximate.
    print('Data sizes [MB]:')
    print(f'{"#":>4} {"pandas":>9} {"numpy":>9}')
    for i, (data_pandas_i, data_numpy_i) in enumerate(zip(data_pandas, data_numpy)):
        print(f'{i:>4} {int(sys.getsizeof(data_pandas_i) / mb_size_in_bytes):>9} {int(sys.getsizeof(data_numpy_i) / mb_size_in_bytes):>9}')
    print()
    # In [6]:
    print(data_pandas[0].head())
    print()
    # In [7]:
    # ...
    # In [8]: mean of a full column.
    success, times_pandas, times_numpy = bench_sub_plot_print(
        data_pandas,
        'res = data.loc[:, "sepal_length"].mean()',
        data_numpy,
        'res = numpy.mean(data.sepal_length)',
        'Mean on Unfiltered Column',
        'pandas',
        'numpy',
        all_lengths,
        True
    )
    # In [9]: element-wise log of a full column.
    success, times_pandas, times_numpy = bench_sub_plot_print(
        data_pandas,
        'res = numpy.log(data.loc[:, "sepal_length"])',
        data_numpy,
        'res = numpy.log(data.sepal_length)',
        'Vectorised log on Unfiltered Column',
        'pandas',
        'numpy',
        all_lengths,
        True
    )
    # In [10]: distinct values of a string column.
    success, times_pandas, times_numpy = bench_sub_plot_print(
        data_pandas,
        'res = data.loc[:, "species"].unique()',
        data_numpy,
        'res = numpy.unique(data.species)',
        'Unique on Unfiltered String Column',
        'pandas',
        'numpy',
        all_lengths,
        True
    )
    # In [11]: mean after a numeric boolean filter.
    success, times_pandas, times_numpy = bench_sub_plot_print(
        data_pandas,
        'res = data.loc[(data.sepal_width > 3) & (data.petal_length < 1.5), "sepal_length"].mean()',
        data_numpy,
        'res = numpy.mean(data[(data.sepal_width > 3) & (data.petal_length < 1.5)].sepal_length)',
        'Mean on Filtered Column',
        'pandas',
        'numpy',
        all_lengths,
        True
    )
    # In [12]: log after the same numeric filter.
    success, times_pandas, times_numpy = bench_sub_plot_print(
        data_pandas,
        'res = numpy.log(data.loc[(data.sepal_width > 3) & (data.petal_length < 1.5), "sepal_length"])',
        data_numpy,
        'res = numpy.log(data[(data.sepal_width > 3) & (data.petal_length < 1.5)].sepal_length)',
        'Vectorised log on Filtered Column',
        'pandas',
        'numpy',
        all_lengths,
        True
    )
    # In [13]: mean after filtering on a string column.
    success, times_pandas, times_numpy = bench_sub_plot_print(
        data_pandas,
        'res = data[data.species == "setosa"].sepal_length.mean()',
        data_numpy,
        'res = numpy.mean(data[data.species == "setosa"].sepal_length)',
        'Mean on (String) Filtered Column',
        'pandas',
        'numpy',
        all_lengths,
        True
    )
    # In [14]: arithmetic combining four columns.
    success, times_pandas, times_numpy = bench_sub_plot_print(
        data_pandas,
        'res = data.petal_length * data.sepal_length + data.petal_width * data.sepal_width',
        data_numpy,
        'res = data.petal_length * data.sepal_length + data.petal_width * data.sepal_width',
        'Vectorized Math on Unfiltered Column',
        'pandas',
        'numpy',
        all_lengths,
        True
    )
    # In [16]: arithmetic used inside the filter predicate itself.
    success, times_pandas, times_numpy = bench_sub_plot_print(
        data_pandas,
        'res = data.loc[data.sepal_width * data.petal_length > data.sepal_length, "sepal_length"].mean()',
        data_numpy,
        'res = numpy.mean(data[data.sepal_width * data.petal_length > data.sepal_length].sepal_length)',
        'Vectorized Math in Filtering Column',
        'pandas',
        'numpy',
        all_lengths,
        True
    )
    end_time = time.perf_counter_ns()
    print(f'Total run time: {(end_time - start_time) / 10 ** 9:.3f} s')
# Run the benchmark only when executed as a script, not when imported.
if __name__ == '__main__':
    _main()
Here is the console output it generates:
Input sizes [count]:
# pandas numpy
0 10000 10000
1 100000 100000
2 500000 500000
3 1000000 1000000
4 5000000 5000000
5 10000000 10000000
6 15000000 15000000
Data sizes [MB]:
# pandas numpy
0 0 0
1 9 4
2 46 22
3 92 45
4 464 228
5 928 457
6 1392 686
sepal_length sepal_width petal_length petal_width species
0 5.1 3.5 1.4 0.2 setosa
1 4.9 3.0 1.4 0.2 setosa
2 4.7 3.2 1.3 0.2 setosa
3 4.6 3.1 1.5 0.2 setosa
4 5.0 3.6 1.4 0.2 setosa
Mean on Unfiltered Column
Test result: PASS
# of records pandas [ms] numpy [ms] ratio
10000 0.061 0.033 1.855
100000 0.160 0.148 1.081
500000 0.653 1.074 0.608
1000000 1.512 2.440 0.620
5000000 11.633 12.558 0.926
10000000 23.954 25.360 0.945
15000000 35.362 40.108 0.882
Vectorised log on Unfiltered Column
Test result: PASS
# of records pandas [ms] numpy [ms] ratio
10000 0.124 0.056 2.190
100000 0.507 0.493 1.029
500000 3.399 3.441 0.988
1000000 5.396 6.867 0.786
5000000 27.187 38.121 0.713
10000000 55.497 72.609 0.764
15000000 88.406 112.199 0.788
Unique on Unfiltered String Column
Test result: PASS
# of records pandas [ms] numpy [ms] ratio
10000 0.332 1.742 0.191
100000 2.885 21.833 0.132
500000 14.769 125.961 0.117
1000000 29.687 264.521 0.112
5000000 147.359 1501.378 0.098
10000000 295.118 3132.478 0.094
15000000 444.365 4882.316 0.091
Mean on Filtered Column
Test result: PASS
# of records pandas [ms] numpy [ms] ratio
10000 0.355 0.130 2.719
100000 0.522 0.672 0.777
500000 1.797 4.824 0.372
1000000 4.602 10.827 0.425
5000000 22.116 57.945 0.382
10000000 43.076 116.028 0.371
15000000 68.893 177.658 0.388
Vectorised log on Filtered Column
Test result: PASS
# of records pandas [ms] numpy [ms] ratio
10000 0.361 0.128 2.821
100000 0.576 0.758 0.760
500000 2.066 5.199 0.397
1000000 5.259 11.523 0.456
5000000 22.785 59.581 0.382
10000000 47.527 121.882 0.390
15000000 75.080 187.954 0.399
Mean on (String) Filtered Column
Test result: PASS
# of records pandas [ms] numpy [ms] ratio
10000 0.636 0.192 3.304
100000 4.068 1.743 2.334
500000 20.954 9.306 2.252
1000000 41.938 18.522 2.264
5000000 217.254 97.929 2.218
10000000 434.242 197.289 2.201
15000000 657.205 297.919 2.206
Vectorized Math on Unfiltered Column
Test result: PASS
# of records pandas [ms] numpy [ms] ratio
10000 0.168 0.049 3.415
100000 0.385 0.338 1.140
500000 3.193 5.018 0.636
1000000 6.028 9.539 0.632
5000000 32.640 48.235 0.677
10000000 69.748 99.893 0.698
15000000 107.528 159.040 0.676
Vectorized Math in Filtering Column
Test result: PASS
# of records pandas [ms] numpy [ms] ratio
10000 0.350 0.234 1.500
100000 0.926 2.494 0.371
500000 6.093 15.007 0.406
1000000 12.641 30.021 0.421
5000000 71.714 163.060 0.440
10000000 145.373 326.206 0.446
15000000 227.817 490.991 0.464
Total run time: 183.198 s
And here are the plots it generated:
These results were generated on Windows 10 with Python 3.13 on an i9-10900K; the process never came close to running out of memory, so swap should not be a factor.