Transforming the data using polars: Have difficulty vis-a-vis pandas

I work in an edtech firm and need to generate a report everyweek comparing one week with the week prior to it. I am trying to port my code to polars.

My final output is a dict of following form:

{ "Admission/Retention": { "Category": { "1": { "X": [ "2023-07-01", "2023-07-02", "2023-07-03", "2023-07-04", "2023-07-05", "2023-07-06", "2023-07-07" ], "Y": [ [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0], [0.5, 0.0, 0.0, 0.0, 0.0, 0.5, 0.0], [1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0] ] }, "2": { "X": [ "2023-07-01", "2023-07-02", "2023-07-03", "2023-07-04", "2023-07-05", "2023-07-06", "2023-07-07" ], "Y": [ [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0], [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0], [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0] ] } }, "Labels": ["Converted", "Converted_lastweek", "Converted_beginn_lastweek"] }

I am stuck in polars. Below is where I am presently stuck:

import polars as pl
import numpy as np
from datetime import date, datetime, timedelta

date = [
    "2023-06-24", "2023-06-25", "2023-06-26", "2023-06-27", "2023-06-28", "2023-06-29",
    "2023-06-30", "2023-07-01", "2023-07-02", "2023-07-03", "2023-07-04", "2023-07-05",
    "2023-07-06", "2023-07-07", "2023-06-24", "2023-06-25", "2023-06-26", "2023-06-27",
    "2023-06-28", "2023-06-29", "2023-06-30", "2023-07-01", "2023-07-02", "2023-07-03",
    "2023-07-04", "2023-07-05", "2023-07-06", "2023-07-07", "2023-06-24", "2023-06-25",
    "2023-06-26", "2023-06-27", "2023-06-28", "2023-06-29", "2023-06-30", "2023-07-01",
    "2023-07-02", "2023-07-03", "2023-07-04", "2023-07-05", "2023-07-06", "2023-07-07"
]

categories = [1] * 14 + [1] * 14 + [2] * 14
students = ["A"] * 14 + ["B"] * 14 + ["B"] * 14
fakes = [0] * 42
actives = [1] * 42

conversion_dates = ["6/23/2023", None, None, None, None, None, None, None, None, None, None, None, None, None,
                    None, None, None, None, "6/26/2023", None, None, None, None, None, None, None, None, None, None,
                    None, None, None, None, None, None, None, None, None, None, "7/3/2023", None, None]

converted = [1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0]

df = pl.DataFrame({
    'date': date,
    'category': categories,
    'student': students,
    'fake': fakes,
    'active_student': actives,
    'conversion_date': conversion_dates,
    'converted': converted
})

def kpi_tracker(df):

    start_date = date(2023, 7, 1)
    dates = [start_date + timedelta(days=x) for x in range(0, 7)]
    past_week_start = start_date - timedelta(days=7)
    dates_last_week = [start_date + timedelta(days=x) for x in range(-7, 7)]

    bookings = np.zeros((1, len(dates)))
    bookings_lastweek = np.zeros((1, len(dates)))
    bookings_lastweek_beginning = np.zeros((1, len(dates)))

    mask = ((pl.col('converted') == 1) & (pl.col('conversion_date') < past_week_start))

    df = df.with_columns(
           pl.when(mask)
             .then(1)
             .alias('converted_stlw')
       )

#    temp = df[['date', 'converted', 'converted_stlw']][(df['active_student'])&(df['fake']==0)].groupby(['date']).mean()

#    temp = temp.reindex(dates_last_week)
#    temp['converted_stlw_shifted'] = temp['converted_stlw'].shift((7))
#    temp['converted_last_week'] = temp['converted'].shift((7))

#    temp = temp.reindex(dates)
#    converted = temp['converted'].fillna(0).tolist()
#    converted_lastweek = temp['converted_last_week'].fillna(0).tolist()
#    converted_beginn_lastweek = temp['converted_stlw_shifted'].fillna(0).tolist()

    result = {'X':dates,'Converted':converted, 'Converted_lastweek': converted_lastweek, 'Converted_beginn_lastweek': converted_beginn_lastweek}    
    return df

df_dataframe = pl.read_csv('data.csv')
df_dataframe = df_dataframe.with_columns(pl.col('conversion_date').str.strptime(pl.Date(), "%m/%d/%Y"))
df = df_dataframe.groupby('Category').apply(kpi_tracker)

Solution

Hopefully I've understood correctly.

It looks like you're doing:

start_date = pl.date(2023, 7, 1)

past_week_start = start_date.dt.offset_by('-7d')
this_week_end = start_date.dt.offset_by('7d')

past_week_dates = (
   pl.col('date').is_between(
      past_week_start, this_week_end, closed='left'
   )
)

this_week_dates = (
   pl.col('date').is_between(
      start_date, this_week_end, closed='left'
   )
)

converted = pl.all(
   pl.col('converted') == 1, 
   pl.col('conversion_date') < past_week_start
)

active_student = pl.all(
   pl.col('active_student') == 1,
   pl.col('fake') == 0
)

groups = (
   df.with_columns(
      pl.when(converted).then(1).alias('converted_stlw'),
   )
   .filter(
      past_week_dates
   )
   .with_columns(
      pl.col('converted', 'converted_stlw')
        .filter(active_student)
        .mean()
        .over('date', 'category')
   )
   .with_columns(
      pl.col('converted', 'converted_stlw')
        .shift(7)
        .over('category')
        .suffix('_shift')
   )
   .filter(
       this_week_dates
   )
   .groupby('category')
)

groups.head(7)

shape: (14, 10)
┌──────────┬────────────┬─────────┬──────┬───┬───────────┬────────────────┬─────────────────┬──────────────────────┐
│ category ┆ date       ┆ student ┆ fake ┆ … ┆ converted ┆ converted_stlw ┆ converted_shift ┆ converted_stlw_shift │
│ ---      ┆ ---        ┆ ---     ┆ ---  ┆   ┆ ---       ┆ ---            ┆ ---             ┆ ---                  │
│ i64      ┆ date       ┆ str     ┆ i64  ┆   ┆ f64       ┆ f64            ┆ f64             ┆ f64                  │
╞══════════╪════════════╪═════════╪══════╪═══╪═══════════╪════════════════╪═════════════════╪══════════════════════╡
│ 2        ┆ 2023-07-01 ┆ B       ┆ 0    ┆ … ┆ 0.0       ┆ null           ┆ 0.0             ┆ null                 │
│ 2        ┆ 2023-07-02 ┆ B       ┆ 0    ┆ … ┆ 0.0       ┆ null           ┆ 0.0             ┆ null                 │
│ 2        ┆ 2023-07-03 ┆ B       ┆ 0    ┆ … ┆ 0.0       ┆ null           ┆ 0.0             ┆ null                 │
│ 2        ┆ 2023-07-04 ┆ B       ┆ 0    ┆ … ┆ 0.0       ┆ null           ┆ 0.0             ┆ null                 │
│ 2        ┆ 2023-07-05 ┆ B       ┆ 0    ┆ … ┆ 1.0       ┆ null           ┆ 0.0             ┆ null                 │
│ 2        ┆ 2023-07-06 ┆ B       ┆ 0    ┆ … ┆ 0.0       ┆ null           ┆ 0.0             ┆ null                 │
│ 2        ┆ 2023-07-07 ┆ B       ┆ 0    ┆ … ┆ 0.0       ┆ null           ┆ 0.0             ┆ null                 │
│ 1        ┆ 2023-07-01 ┆ A       ┆ 0    ┆ … ┆ 0.0       ┆ null           ┆ 0.5             ┆ 1.0                  │
│ 1        ┆ 2023-07-02 ┆ A       ┆ 0    ┆ … ┆ 0.0       ┆ null           ┆ 0.0             ┆ null                 │
│ 1        ┆ 2023-07-03 ┆ A       ┆ 0    ┆ … ┆ 0.0       ┆ null           ┆ 0.0             ┆ null                 │
│ 1        ┆ 2023-07-04 ┆ A       ┆ 0    ┆ … ┆ 0.0       ┆ null           ┆ 0.0             ┆ null                 │
│ 1        ┆ 2023-07-05 ┆ A       ┆ 0    ┆ … ┆ 0.0       ┆ null           ┆ 0.0             ┆ null                 │
│ 1        ┆ 2023-07-06 ┆ A       ┆ 0    ┆ … ┆ 0.0       ┆ null           ┆ 0.5             ┆ null                 │
│ 1        ┆ 2023-07-07 ┆ A       ┆ 0    ┆ … ┆ 0.0       ┆ null           ┆ 0.0             ┆ null                 │
└──────────┴────────────┴─────────┴──────┴───┴───────────┴────────────────┴─────────────────┴──────────────────────┘

As for the dict output, you could perhaps use .partition_by() or iterate over the groups and use .to_dict()

for category, group in groups:
   values = (
      group.head(7)
        .select(pl.col('date').dt.to_string('%Y-%m-%d'), 'converted_shift', 'converted_stlw_shift')
        .to_dict(as_series=False)
   )
   { category: values }

{1: {'date': ['2023-07-01',
   '2023-07-02',
   '2023-07-03',
   '2023-07-04',
   '2023-07-05',
   '2023-07-06',
   '2023-07-07'],
  'converted_shift': [0.5, 0.0, 0.0, 0.0, 0.0, 0.5, 0.0],
  'converted_stlw_shift': [1.0, None, None, None, None, None, None]}}
{2: {'date': ['2023-07-01',
   '2023-07-02',
   '2023-07-03',
   '2023-07-04',
   '2023-07-05',
   '2023-07-06',
   '2023-07-07'],
  'converted_shift': [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0],
  'converted_stlw_shift': [None, None, None, None, None, None, None]}}

You could modify the dict to fit your exact needs.