I work in an edtech firm and need to generate a report everyweek comparing one week with the week prior to it. I am trying to port my code to polars.
I work in an edtech firm and need to generate a report everyweek comparing one week with the week prior to it. I am trying to port my code to polars.
My final output is a dict of following form:
{ "Admission/Retention": { "Category": { "1": { "X": [ "2023-07-01", "2023-07-02", "2023-07-03", "2023-07-04", "2023-07-05", "2023-07-06", "2023-07-07" ], "Y": [ [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0], [0.5, 0.0, 0.0, 0.0, 0.0, 0.5, 0.0], [1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0] ] }, "2": { "X": [ "2023-07-01", "2023-07-02", "2023-07-03", "2023-07-04", "2023-07-05", "2023-07-06", "2023-07-07" ], "Y": [ [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0], [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0], [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0] ] } }, "Labels": ["Converted", "Converted_lastweek", "Converted_beginn_lastweek"] }
I am stuck in polars. Below is where I am presently stuck:
import polars as pl
import numpy as np
from datetime import date, datetime, timedelta
date = [
"2023-06-24", "2023-06-25", "2023-06-26", "2023-06-27", "2023-06-28", "2023-06-29",
"2023-06-30", "2023-07-01", "2023-07-02", "2023-07-03", "2023-07-04", "2023-07-05",
"2023-07-06", "2023-07-07", "2023-06-24", "2023-06-25", "2023-06-26", "2023-06-27",
"2023-06-28", "2023-06-29", "2023-06-30", "2023-07-01", "2023-07-02", "2023-07-03",
"2023-07-04", "2023-07-05", "2023-07-06", "2023-07-07", "2023-06-24", "2023-06-25",
"2023-06-26", "2023-06-27", "2023-06-28", "2023-06-29", "2023-06-30", "2023-07-01",
"2023-07-02", "2023-07-03", "2023-07-04", "2023-07-05", "2023-07-06", "2023-07-07"
]
categories = [1] * 14 + [1] * 14 + [2] * 14
students = ["A"] * 14 + ["B"] * 14 + ["B"] * 14
fakes = [0] * 42
actives = [1] * 42
conversion_dates = ["6/23/2023", None, None, None, None, None, None, None, None, None, None, None, None, None,
None, None, None, None, "6/26/2023", None, None, None, None, None, None, None, None, None, None,
None, None, None, None, None, None, None, None, None, None, "7/3/2023", None, None]
converted = [1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0]
df = pl.DataFrame({
'date': date,
'category': categories,
'student': students,
'fake': fakes,
'active_student': actives,
'conversion_date': conversion_dates,
'converted': converted
})
def kpi_tracker(df):
start_date = date(2023, 7, 1)
dates = [start_date + timedelta(days=x) for x in range(0, 7)]
past_week_start = start_date - timedelta(days=7)
dates_last_week = [start_date + timedelta(days=x) for x in range(-7, 7)]
bookings = np.zeros((1, len(dates)))
bookings_lastweek = np.zeros((1, len(dates)))
bookings_lastweek_beginning = np.zeros((1, len(dates)))
mask = ((pl.col('converted') == 1) & (pl.col('conversion_date') < past_week_start))
df = df.with_columns(
pl.when(mask)
.then(1)
.alias('converted_stlw')
)
# temp = df[['date', 'converted', 'converted_stlw']][(df['active_student'])&(df['fake']==0)].groupby(['date']).mean()
# temp = temp.reindex(dates_last_week)
# temp['converted_stlw_shifted'] = temp['converted_stlw'].shift((7))
# temp['converted_last_week'] = temp['converted'].shift((7))
# temp = temp.reindex(dates)
# converted = temp['converted'].fillna(0).tolist()
# converted_lastweek = temp['converted_last_week'].fillna(0).tolist()
# converted_beginn_lastweek = temp['converted_stlw_shifted'].fillna(0).tolist()
result = {'X':dates,'Converted':converted, 'Converted_lastweek': converted_lastweek, 'Converted_beginn_lastweek': converted_beginn_lastweek}
return df
df_dataframe = pl.read_csv('data.csv')
df_dataframe = df_dataframe.with_columns(pl.col('conversion_date').str.strptime(pl.Date(), "%m/%d/%Y"))
df = df_dataframe.groupby('Category').apply(kpi_tracker)
Hopefully I've understood correctly.
It looks like you're doing:
start_date = pl.date(2023, 7, 1)
past_week_start = start_date.dt.offset_by('-7d')
this_week_end = start_date.dt.offset_by('7d')
past_week_dates = (
pl.col('date').is_between(
past_week_start, this_week_end, closed='left'
)
)
this_week_dates = (
pl.col('date').is_between(
start_date, this_week_end, closed='left'
)
)
converted = pl.all(
pl.col('converted') == 1,
pl.col('conversion_date') < past_week_start
)
active_student = pl.all(
pl.col('active_student') == 1,
pl.col('fake') == 0
)
groups = (
df.with_columns(
pl.when(converted).then(1).alias('converted_stlw'),
)
.filter(
past_week_dates
)
.with_columns(
pl.col('converted', 'converted_stlw')
.filter(active_student)
.mean()
.over('date', 'category')
)
.with_columns(
pl.col('converted', 'converted_stlw')
.shift(7)
.over('category')
.suffix('_shift')
)
.filter(
this_week_dates
)
.groupby('category')
)
groups.head(7)
shape: (14, 10)
┌──────────┬────────────┬─────────┬──────┬───┬───────────┬────────────────┬─────────────────┬──────────────────────┐
│ category ┆ date ┆ student ┆ fake ┆ … ┆ converted ┆ converted_stlw ┆ converted_shift ┆ converted_stlw_shift │
│ --- ┆ --- ┆ --- ┆ --- ┆ ┆ --- ┆ --- ┆ --- ┆ --- │
│ i64 ┆ date ┆ str ┆ i64 ┆ ┆ f64 ┆ f64 ┆ f64 ┆ f64 │
╞══════════╪════════════╪═════════╪══════╪═══╪═══════════╪════════════════╪═════════════════╪══════════════════════╡
│ 2 ┆ 2023-07-01 ┆ B ┆ 0 ┆ … ┆ 0.0 ┆ null ┆ 0.0 ┆ null │
│ 2 ┆ 2023-07-02 ┆ B ┆ 0 ┆ … ┆ 0.0 ┆ null ┆ 0.0 ┆ null │
│ 2 ┆ 2023-07-03 ┆ B ┆ 0 ┆ … ┆ 0.0 ┆ null ┆ 0.0 ┆ null │
│ 2 ┆ 2023-07-04 ┆ B ┆ 0 ┆ … ┆ 0.0 ┆ null ┆ 0.0 ┆ null │
│ 2 ┆ 2023-07-05 ┆ B ┆ 0 ┆ … ┆ 1.0 ┆ null ┆ 0.0 ┆ null │
│ 2 ┆ 2023-07-06 ┆ B ┆ 0 ┆ … ┆ 0.0 ┆ null ┆ 0.0 ┆ null │
│ 2 ┆ 2023-07-07 ┆ B ┆ 0 ┆ … ┆ 0.0 ┆ null ┆ 0.0 ┆ null │
│ 1 ┆ 2023-07-01 ┆ A ┆ 0 ┆ … ┆ 0.0 ┆ null ┆ 0.5 ┆ 1.0 │
│ 1 ┆ 2023-07-02 ┆ A ┆ 0 ┆ … ┆ 0.0 ┆ null ┆ 0.0 ┆ null │
│ 1 ┆ 2023-07-03 ┆ A ┆ 0 ┆ … ┆ 0.0 ┆ null ┆ 0.0 ┆ null │
│ 1 ┆ 2023-07-04 ┆ A ┆ 0 ┆ … ┆ 0.0 ┆ null ┆ 0.0 ┆ null │
│ 1 ┆ 2023-07-05 ┆ A ┆ 0 ┆ … ┆ 0.0 ┆ null ┆ 0.0 ┆ null │
│ 1 ┆ 2023-07-06 ┆ A ┆ 0 ┆ … ┆ 0.0 ┆ null ┆ 0.5 ┆ null │
│ 1 ┆ 2023-07-07 ┆ A ┆ 0 ┆ … ┆ 0.0 ┆ null ┆ 0.0 ┆ null │
└──────────┴────────────┴─────────┴──────┴───┴───────────┴────────────────┴─────────────────┴──────────────────────┘
As for the dict output, you could perhaps use .partition_by()
or iterate over the groups and use .to_dict()
for category, group in groups:
values = (
group.head(7)
.select(pl.col('date').dt.to_string('%Y-%m-%d'), 'converted_shift', 'converted_stlw_shift')
.to_dict(as_series=False)
)
{ category: values }
{1: {'date': ['2023-07-01',
'2023-07-02',
'2023-07-03',
'2023-07-04',
'2023-07-05',
'2023-07-06',
'2023-07-07'],
'converted_shift': [0.5, 0.0, 0.0, 0.0, 0.0, 0.5, 0.0],
'converted_stlw_shift': [1.0, None, None, None, None, None, None]}}
{2: {'date': ['2023-07-01',
'2023-07-02',
'2023-07-03',
'2023-07-04',
'2023-07-05',
'2023-07-06',
'2023-07-07'],
'converted_shift': [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0],
'converted_stlw_shift': [None, None, None, None, None, None, None]}}
You could modify the dict to fit your exact needs.