Search code examples
pythonpandaspython-polars

Polars: cumsum alternatives


I have below pandas snippet which I want to convert to polars to try, Expected output for polars is same as pandas but failing as cumsum is missing, how to achieve similar output?:

import pandas as pd
import numpy as np

data = {
    'date': pd.date_range(start='2024-09-01', periods=12),
    'reserved_before': [0, 1, 2, np.nan, np.nan, 1, 2, 3, np.nan, 3, 4, 5]
}
df = pd.DataFrame(data)
df['reserved'] = df['reserved_before'].notna()

def assign_group_id(series):
    return (series != series.shift()).cumsum()

df['group'] = assign_group_id(df['reserved'])

def min_max(group):
    non_nan = group['reserved_before'].dropna()
    if len(non_nan) > 0:
        return pd.Series({'min': non_nan.min(), 'max': non_nan.max()})
    return pd.Series({'min': np.nan, 'max': np.nan})

result = df.groupby('group').apply(min_max).reset_index()

df = df.merge(result, on='group', how='left')

df = df.drop(columns=['group', 'reserved'])
df = df.rename(columns={'min': 'block_min', 'max': 'block_max'})

Expected output for polars is same as pandas but failing as cumsum is missing, how to achieve similar output?:

import polars as pl
from datetime import date

df = pl.DataFrame({
    'date': pl.date_range(start=date(2024, 9, 1), end=date(2024, 9, 12), interval='1d', eager=True),
    'reserved_before': [0, 1, 2, None, None, 1, 2, 3, None, 3, 4, 5]
})

df = df.with_columns(
    (df['reserved_before'].is_not_null()).alias('reserved')
)

df = df.with_columns(
    (df['reserved'] != df['reserved'].shift(1)).cumsum().alias('group')
)

min_max_df = (
    df.groupby('group')
    .agg([
        pl.col('reserved_before').min().alias('block_min'),
        pl.col('reserved_before').max().alias('block_max')
    ])
)

df = df.join(min_max_df, on='group', how='left')
df = df.drop(['group', 'reserved'])

Solution

  • Here's one approach:

    import polars as pl
    from datetime import date
    
    pl_df = pl.DataFrame({
        'date': pl.date_range(start=date(2024, 9, 1), end=date(2024, 9, 12), 
                              interval='1d', eager=True),
        'reserved_before': [0, 1, 2, None, None, 1, 2, 3, None, 3, 4, 5]
    })
    
    groups = pl.col('reserved_before').is_not_null().rle_id()
    
    pl_df = pl_df.with_columns(
        pl.col('reserved_before').min().over(groups).alias('block_min'),
        pl.col('reserved_before').max().over(groups).alias('block_max'),
    )
    

    Explanation

    Equality check original pandas method:

    pl_df.to_pandas().equals(df.astype(pl_df.to_pandas().dtypes))
    # True
    

    Update for a slightly different logic

    Instead of: a group ends where the next value is None

    We use: a group ends where the next value is None or where the next value is smaller

    pl_df = pl.DataFrame({
        'date': pl.date_range(start=date(2024, 9, 1), end=date(2024, 9, 12), 
                              interval='1d', eager=True),
        'reserved_before': [0, 1, 2, None, 4, 1, 2, 10, None, 3, 4, 5]
    })
    
    # desired groups:
    # [[0, 1, 2], [4], [1, 2, 10], [3, 4, 5]]
    

    To get groups here, we can use pl.Expr.diff + pl.Expr.fill_null, check < 0, and apply pl.Expr.cum_sum to the result.

    groups = (pl.col('reserved_before').diff().fill_null(-1) < 0).cum_sum()
    

    Adjusted output:

    shape: (12, 4)
    ┌────────────┬─────────────────┬───────────┬───────────┐
    │ date       ┆ reserved_before ┆ block_min ┆ block_max │
    │ ---        ┆ ---             ┆ ---       ┆ ---       │
    │ date       ┆ i64             ┆ i64       ┆ i64       │
    ╞════════════╪═════════════════╪═══════════╪═══════════╡
    │ 2024-09-01 ┆ 0               ┆ 0         ┆ 2         │
    │ 2024-09-02 ┆ 1               ┆ 0         ┆ 2         │
    │ 2024-09-03 ┆ 2               ┆ 0         ┆ 2         │
    │ 2024-09-04 ┆ null            ┆ null      ┆ null      │
    │ 2024-09-05 ┆ 4               ┆ 4         ┆ 4         │
    │ 2024-09-06 ┆ 1               ┆ 1         ┆ 10        │
    │ 2024-09-07 ┆ 2               ┆ 1         ┆ 10        │
    │ 2024-09-08 ┆ 10              ┆ 1         ┆ 10        │
    │ 2024-09-09 ┆ null            ┆ null      ┆ null      │
    │ 2024-09-10 ┆ 3               ┆ 3         ┆ 5         │
    │ 2024-09-11 ┆ 4               ┆ 3         ┆ 5         │
    │ 2024-09-12 ┆ 5               ┆ 3         ┆ 5         │
    └────────────┴─────────────────┴───────────┴───────────┘