I have below pandas snippet which I want to convert to polars to try, Expected output for polars is same as pandas but failing as cumsum is missing, how to achieve similar output?:
import pandas as pd
import numpy as np
data = {
'date': pd.date_range(start='2024-09-01', periods=12),
'reserved_before': [0, 1, 2, np.nan, np.nan, 1, 2, 3, np.nan, 3, 4, 5]
}
df = pd.DataFrame(data)
df['reserved'] = df['reserved_before'].notna()
def assign_group_id(series):
return (series != series.shift()).cumsum()
df['group'] = assign_group_id(df['reserved'])
def min_max(group):
non_nan = group['reserved_before'].dropna()
if len(non_nan) > 0:
return pd.Series({'min': non_nan.min(), 'max': non_nan.max()})
return pd.Series({'min': np.nan, 'max': np.nan})
result = df.groupby('group').apply(min_max).reset_index()
df = df.merge(result, on='group', how='left')
df = df.drop(columns=['group', 'reserved'])
df = df.rename(columns={'min': 'block_min', 'max': 'block_max'})
Expected output for polars is same as pandas but failing as cumsum is missing, how to achieve similar output?:
import polars as pl
from datetime import date
df = pl.DataFrame({
'date': pl.date_range(start=date(2024, 9, 1), end=date(2024, 9, 12), interval='1d', eager=True),
'reserved_before': [0, 1, 2, None, None, 1, 2, 3, None, 3, 4, 5]
})
df = df.with_columns(
(df['reserved_before'].is_not_null()).alias('reserved')
)
df = df.with_columns(
(df['reserved'] != df['reserved'].shift(1)).cumsum().alias('group')
)
min_max_df = (
df.groupby('group')
.agg(
pl.col('reserved_before').min().alias('block_min'),
pl.col('reserved_before').max().alias('block_max')
)
)
df = df.join(min_max_df, on='group', how='left')
df = df.drop('group', 'reserved')
Resulting error:
AttributeError: 'Series' object has no attribute 'cumsum'
Here's one approach:
import polars as pl
from datetime import date
pl_df = pl.DataFrame({
'date': pl.date_range(start=date(2024, 9, 1), end=date(2024, 9, 12),
interval='1d', eager=True),
'reserved_before': [0, 1, 2, None, None, 1, 2, 3, None, 3, 4, 5]
})
groups = pl.col('reserved_before').is_not_null().rle_id()
pl_df = pl_df.with_columns(
pl.col('reserved_before').min().over(groups).alias('block_min'),
pl.col('reserved_before').max().over(groups).alias('block_max'),
)
Explanation
pl.Expr.is_not_null
and pl.Expr.rle_id
to define your groups. IDs will start with 0, not with 1 (as with OP's use of pd.Series.cumsum
); irrelevant difference for the purpose.pl.Expr.over
to add pl.Expr.min
and max
.Equality check original pandas
method:
pl_df.to_pandas().equals(df.astype(pl_df.to_pandas().dtypes))
# True
Update for a slightly different logic
Instead of: a group ends where the next value is None
We use: a group ends where the next value is None
or where the next value is smaller
pl_df = pl.DataFrame({
'date': pl.date_range(start=date(2024, 9, 1), end=date(2024, 9, 12),
interval='1d', eager=True),
'reserved_before': [0, 1, 2, None, 4, 1, 2, 10, None, 3, 4, 5]
})
# desired groups:
# [[0, 1, 2], [4], [1, 2, 10], [3, 4, 5]]
To get groups
here, we can use pl.Expr.diff
+ pl.Expr.fill_null
, check < 0
, and apply pl.Expr.cum_sum
to the result.
groups = (pl.col('reserved_before').diff().fill_null(-1) < 0).cum_sum()
Adjusted output:
shape: (12, 4)
┌────────────┬─────────────────┬───────────┬───────────┐
│ date ┆ reserved_before ┆ block_min ┆ block_max │
│ --- ┆ --- ┆ --- ┆ --- │
│ date ┆ i64 ┆ i64 ┆ i64 │
╞════════════╪═════════════════╪═══════════╪═══════════╡
│ 2024-09-01 ┆ 0 ┆ 0 ┆ 2 │
│ 2024-09-02 ┆ 1 ┆ 0 ┆ 2 │
│ 2024-09-03 ┆ 2 ┆ 0 ┆ 2 │
│ 2024-09-04 ┆ null ┆ null ┆ null │
│ 2024-09-05 ┆ 4 ┆ 4 ┆ 4 │
│ 2024-09-06 ┆ 1 ┆ 1 ┆ 10 │
│ 2024-09-07 ┆ 2 ┆ 1 ┆ 10 │
│ 2024-09-08 ┆ 10 ┆ 1 ┆ 10 │
│ 2024-09-09 ┆ null ┆ null ┆ null │
│ 2024-09-10 ┆ 3 ┆ 3 ┆ 5 │
│ 2024-09-11 ┆ 4 ┆ 3 ┆ 5 │
│ 2024-09-12 ┆ 5 ┆ 3 ┆ 5 │
└────────────┴─────────────────┴───────────┴───────────┘