I have a LazyFrame with multiple columns of hourly data for a few periods. For each period, I want to find the x value for a function involving mathematical operations of multiple columns that minimizes the result.
I used scipy.optimize.minimize to accomplish this, and I actually obtain the desired result. The problem is that this process runs extremely slowly, so I'm just looking for anything that accomplishes the same, but faster.
def minimization_target(x, period_start):
return hourly_data.filter(pl.col('period_start') == period_start).select((((pl.col('price').median() * pl.col('quantity').median() - (pl.col('estimated_quantity') * (pl.col('estimated_price') + x)).sum()) / (pl.col('key_product') * (pl.col('estimated_price') + x)).sum())).abs() - 1).abs()).collect().item()
results = hourly_data.group_by('period_start', maintain_order=True).map_groups(lambda group: pl.DataFrame({'x_values': scipy.optimize.minimize(minimization_target, group.get_column('initial_guess').median(), args=group.get_column('period_start').median()).x}), schema=None)
Minimal example:
import scipy
import polars as pl
from datetime import datetime
hourly_data = pl.DataFrame({'period': [0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 3, 3, 3, 3, 3, 3], 'price': [4, 4, 4, 4, 4, 4, 5, 5, 5, 5, 5, 5, 6, 6, 6, 6, 6, 6, 7, 7, 7, 7, 7, 7], 'quantity': [7, 7, 7, 7, 7, 7, 6, 6, 6, 6, 6, 6, 5, 5, 5, 5, 5, 5, 4, 4, 4, 4, 4, 4], 'estimated_price': [5, 5, 5, 5, 5, 5, 6, 6, 6, 6, 6, 6, 7, 7, 7, 7, 7, 7, 8, 8, 8, 8, 8, 8], 'estimated_quantity': [6, 6, 6, 6, 6, 6, 5, 5, 5, 5, 5, 5, 4, 4, 4, 4, 4, 4, 3, 3, 3, 3, 3, 3], 'key_product': [0.9, 0.8, 0.7, 0.8, 0.9, 0.8, 0.7, 0.8, 0.9, 0.8, 0.7, 0.8, 0.9, 0.8, 0.7, 0.8, 0.9, 0.8, 0.7, 0.8, 0.9, 0.8, 0.7, 0.8], 'initial_guess': [10, 10, 10, 10, 10, 10, 20, 20, 20, 20, 20, 20, 30, 30, 30, 30, 30, 30, 40, 40, 40, 40, 40, 40]}).lazy()
hourly_data = hourly_data.with_columns(pl.datetime_range(datetime(2024, 1, 1), datetime(2024, 1, 1, 23), '1h').alias('hour'))
hourly_data = hourly_data.with_columns(pl.col('hour').min().over('period').alias('period_start'))
def minimization_target(x, period_start):
return hourly_data.filter(pl.col('period_start') == period_start).select((((pl.col('price').median() * pl.col('quantity').median() - (pl.col('estimated_quantity') * (pl.col('estimated_price') + x)).sum()) / (pl.col('key_product') * (pl.col('estimated_price') + x)).sum()).abs() - 1).abs()).collect().item()
results = hourly_data.group_by('period_start', maintain_order=True).map_groups(lambda group: pl.DataFrame({'x_values': scipy.optimize.minimize(minimization_target, group.get_column('initial_guess').median(), args=group.get_column('period_start').median()).x}), schema=None)
Input:
shape: (24, 9)
┌────────┬───────┬──────────┬─────────────┬───┬─────────────┬────────────┬────────────┬────────────┐
│ period ┆ price ┆ quantity ┆ estimated_p ┆ … ┆ key_product ┆ initial_gu ┆ hour ┆ period_sta │
│ --- ┆ --- ┆ --- ┆ rice ┆ ┆ --- ┆ ess ┆ --- ┆ rt │
│ i64 ┆ i64 ┆ i64 ┆ --- ┆ ┆ f64 ┆ --- ┆ datetime[μ ┆ --- │
│ ┆ ┆ ┆ i64 ┆ ┆ ┆ i64 ┆ s] ┆ datetime[μ │
│ ┆ ┆ ┆ ┆ ┆ ┆ ┆ ┆ s] │
╞════════╪═══════╪══════════╪═════════════╪═══╪═════════════╪════════════╪════════════╪════════════╡
│ 0 ┆ 4 ┆ 7 ┆ 5 ┆ … ┆ 0.9 ┆ 10 ┆ 2024-01-01 ┆ 2024-01-01 │
│ ┆ ┆ ┆ ┆ ┆ ┆ ┆ 00:00:00 ┆ 00:00:00 │
│ 0 ┆ 4 ┆ 7 ┆ 5 ┆ … ┆ 0.8 ┆ 10 ┆ 2024-01-01 ┆ 2024-01-01 │
│ ┆ ┆ ┆ ┆ ┆ ┆ ┆ 01:00:00 ┆ 00:00:00 │
│ 0 ┆ 4 ┆ 7 ┆ 5 ┆ … ┆ 0.7 ┆ 10 ┆ 2024-01-01 ┆ 2024-01-01 │
│ ┆ ┆ ┆ ┆ ┆ ┆ ┆ 02:00:00 ┆ 00:00:00 │
│ 0 ┆ 4 ┆ 7 ┆ 5 ┆ … ┆ 0.8 ┆ 10 ┆ 2024-01-01 ┆ 2024-01-01 │
│ ┆ ┆ ┆ ┆ ┆ ┆ ┆ 03:00:00 ┆ 00:00:00 │
│ 0 ┆ 4 ┆ 7 ┆ 5 ┆ … ┆ 0.9 ┆ 10 ┆ 2024-01-01 ┆ 2024-01-01 │
│ ┆ ┆ ┆ ┆ ┆ ┆ ┆ 04:00:00 ┆ 00:00:00 │
│ … ┆ … ┆ … ┆ … ┆ … ┆ … ┆ … ┆ … ┆ … │
│ 3 ┆ 7 ┆ 4 ┆ 8 ┆ … ┆ 0.8 ┆ 40 ┆ 2024-01-01 ┆ 2024-01-01 │
│ ┆ ┆ ┆ ┆ ┆ ┆ ┆ 19:00:00 ┆ 18:00:00 │
│ 3 ┆ 7 ┆ 4 ┆ 8 ┆ … ┆ 0.9 ┆ 40 ┆ 2024-01-01 ┆ 2024-01-01 │
│ ┆ ┆ ┆ ┆ ┆ ┆ ┆ 20:00:00 ┆ 18:00:00 │
│ 3 ┆ 7 ┆ 4 ┆ 8 ┆ … ┆ 0.8 ┆ 40 ┆ 2024-01-01 ┆ 2024-01-01 │
│ ┆ ┆ ┆ ┆ ┆ ┆ ┆ 21:00:00 ┆ 18:00:00 │
│ 3 ┆ 7 ┆ 4 ┆ 8 ┆ … ┆ 0.7 ┆ 40 ┆ 2024-01-01 ┆ 2024-01-01 │
│ ┆ ┆ ┆ ┆ ┆ ┆ ┆ 22:00:00 ┆ 18:00:00 │
│ 3 ┆ 7 ┆ 4 ┆ 8 ┆ … ┆ 0.8 ┆ 40 ┆ 2024-01-01 ┆ 2024-01-01 │
│ ┆ ┆ ┆ ┆ ┆ ┆ ┆ 23:00:00 ┆ 18:00:00 │
└────────┴───────┴──────────┴─────────────┴───┴─────────────┴────────────┴────────────┴────────────┘
Desired output:
shape: (4, 1)
┌────────────┐
│ x_values │
│ --- │
│ f64 │
╞════════════╡
│ -16.006287 │
│ 10.331055 │
│ 25.420471 │
│ 37.352234 │
└────────────┘
Avoid working with entire dataframe.
As a start, you can cut the runtime by a factor of ~2 by only using the current group in the optimization target instead of filtering the entire dataframe first.
This can be achieved as follows.
import functools
def minimization_target(x, group: pl.DataFrame):
numerator_expr = pl.col('price').median() * pl.col('quantity').median() - (pl.col('estimated_quantity') * (pl.col('estimated_price') + x)).sum()
denominator_expr = (pl.col('key_product') * (pl.col('estimated_price') + x)).sum()
return (
group.select(
(
(numerator_expr / denominator_expr).abs() - 1
).abs()
)
.item()
)
(
hourly_data
.group_by('period_start', maintain_order=True)
.map_groups(
lambda group: pl.DataFrame({"x": scipy.optimize.minimize(functools.partial(minimization_target, group=group), group.get_column('initial_guess').median()).x}),
schema=None,
)
.collect()
)
Simplifying minimization target.
We'd like the minimization target to be as simple as possible as scipy.optimize.minimize
will repeatedly evaluate the function. The current implementation works with "heavy" DataFrame objects and computes many terms common to all evaluations.
Using a simpler minimization target while precomuting these terms yields another ~4x speedup on top.
import numpy as np
def minimization_target(x: float, s1: float, f1: float, s2: float, f2: float):
return np.abs(np.abs((s1 - f1 * x) / (s2 + f2 * x)) - 1)
(
hourly_data
.group_by('period_start', maintain_order=True)
.map_groups(
lambda group: pl.DataFrame({
"x": scipy.optimize.minimize(
fun=partial(
minimization_target,
# precomputing terms common to all evaluations of the minimization target
s1=group.select(pl.col('price').median() * pl.col('quantity').median() - (pl.col('estimated_quantity') * pl.col('estimated_price')).sum()).item(),
f1=group.select(pl.col("estimated_quantity").sum()).item(),
s2=group.select((pl.col('key_product') * pl.col('estimated_price')).sum()).item(),
f2=group.select(pl.col("key_product").sum()).item(),
),
x0=group.get_column('initial_guess').median()
).x
}),
schema=None,
)
.collect()
)