Search code examples
python-polars

Transforming the data using polars: Have difficulty vis-a-vis pandas


I work in an edtech firm and need to generate a report everyweek comparing one week with the week prior to it. I am trying to port my code to polars.

I work in an edtech firm and need to generate a report everyweek comparing one week with the week prior to it. I am trying to port my code to polars.

My final output is a dict of following form:

{ "Admission/Retention": { "Category": { "1": { "X": [ "2023-07-01", "2023-07-02", "2023-07-03", "2023-07-04", "2023-07-05", "2023-07-06", "2023-07-07" ], "Y": [ [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0], [0.5, 0.0, 0.0, 0.0, 0.0, 0.5, 0.0], [1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0] ] }, "2": { "X": [ "2023-07-01", "2023-07-02", "2023-07-03", "2023-07-04", "2023-07-05", "2023-07-06", "2023-07-07" ], "Y": [ [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0], [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0], [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0] ] } }, "Labels": ["Converted", "Converted_lastweek", "Converted_beginn_lastweek"] }

I am stuck in polars. Below is where I am presently stuck:

import polars as pl
import numpy as np
from datetime import date, datetime, timedelta

date = [
    "2023-06-24", "2023-06-25", "2023-06-26", "2023-06-27", "2023-06-28", "2023-06-29",
    "2023-06-30", "2023-07-01", "2023-07-02", "2023-07-03", "2023-07-04", "2023-07-05",
    "2023-07-06", "2023-07-07", "2023-06-24", "2023-06-25", "2023-06-26", "2023-06-27",
    "2023-06-28", "2023-06-29", "2023-06-30", "2023-07-01", "2023-07-02", "2023-07-03",
    "2023-07-04", "2023-07-05", "2023-07-06", "2023-07-07", "2023-06-24", "2023-06-25",
    "2023-06-26", "2023-06-27", "2023-06-28", "2023-06-29", "2023-06-30", "2023-07-01",
    "2023-07-02", "2023-07-03", "2023-07-04", "2023-07-05", "2023-07-06", "2023-07-07"
]

categories = [1] * 14 + [1] * 14 + [2] * 14
students = ["A"] * 14 + ["B"] * 14 + ["B"] * 14
fakes = [0] * 42
actives = [1] * 42

conversion_dates = ["6/23/2023", None, None, None, None, None, None, None, None, None, None, None, None, None,
                    None, None, None, None, "6/26/2023", None, None, None, None, None, None, None, None, None, None,
                    None, None, None, None, None, None, None, None, None, None, "7/3/2023", None, None]

converted = [1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0]

df = pl.DataFrame({
    'date': date,
    'category': categories,
    'student': students,
    'fake': fakes,
    'active_student': actives,
    'conversion_date': conversion_dates,
    'converted': converted
})

def kpi_tracker(df):

    start_date = date(2023, 7, 1)
    dates = [start_date + timedelta(days=x) for x in range(0, 7)]
    past_week_start = start_date - timedelta(days=7)
    dates_last_week = [start_date + timedelta(days=x) for x in range(-7, 7)]

    bookings = np.zeros((1, len(dates)))
    bookings_lastweek = np.zeros((1, len(dates)))
    bookings_lastweek_beginning = np.zeros((1, len(dates)))

    mask = ((pl.col('converted') == 1) & (pl.col('conversion_date') < past_week_start))

    df = df.with_columns(
           pl.when(mask)
             .then(1)
             .alias('converted_stlw')
       )

#    temp = df[['date', 'converted', 'converted_stlw']][(df['active_student'])&(df['fake']==0)].groupby(['date']).mean()

#    temp = temp.reindex(dates_last_week)
#    temp['converted_stlw_shifted'] = temp['converted_stlw'].shift((7))
#    temp['converted_last_week'] = temp['converted'].shift((7))

#    temp = temp.reindex(dates)
#    converted = temp['converted'].fillna(0).tolist()
#    converted_lastweek = temp['converted_last_week'].fillna(0).tolist()
#    converted_beginn_lastweek = temp['converted_stlw_shifted'].fillna(0).tolist()

    result = {'X':dates,'Converted':converted, 'Converted_lastweek': converted_lastweek, 'Converted_beginn_lastweek': converted_beginn_lastweek}    
    return df

df_dataframe = pl.read_csv('data.csv')
df_dataframe = df_dataframe.with_columns(pl.col('conversion_date').str.strptime(pl.Date(), "%m/%d/%Y"))
df = df_dataframe.groupby('Category').apply(kpi_tracker)

Solution

  • Hopefully I've understood correctly.

    It looks like you're doing:

    start_date = pl.date(2023, 7, 1)
    
    past_week_start = start_date.dt.offset_by('-7d')
    this_week_end = start_date.dt.offset_by('7d')
    
    past_week_dates = (
       pl.col('date').is_between(
          past_week_start, this_week_end, closed='left'
       )
    )
    
    this_week_dates = (
       pl.col('date').is_between(
          start_date, this_week_end, closed='left'
       )
    )
    
    converted = pl.all(
       pl.col('converted') == 1, 
       pl.col('conversion_date') < past_week_start
    )
    
    active_student = pl.all(
       pl.col('active_student') == 1,
       pl.col('fake') == 0
    )
    
    groups = (
       df.with_columns(
          pl.when(converted).then(1).alias('converted_stlw'),
       )
       .filter(
          past_week_dates
       )
       .with_columns(
          pl.col('converted', 'converted_stlw')
            .filter(active_student)
            .mean()
            .over('date', 'category')
       )
       .with_columns(
          pl.col('converted', 'converted_stlw')
            .shift(7)
            .over('category')
            .suffix('_shift')
       )
       .filter(
           this_week_dates
       )
       .groupby('category')
    )
    
    groups.head(7)
    
    shape: (14, 10)
    ┌──────────┬────────────┬─────────┬──────┬───┬───────────┬────────────────┬─────────────────┬──────────────────────┐
    │ category ┆ date       ┆ student ┆ fake ┆ … ┆ converted ┆ converted_stlw ┆ converted_shift ┆ converted_stlw_shift │
    │ ---      ┆ ---        ┆ ---     ┆ ---  ┆   ┆ ---       ┆ ---            ┆ ---             ┆ ---                  │
    │ i64      ┆ date       ┆ str     ┆ i64  ┆   ┆ f64       ┆ f64            ┆ f64             ┆ f64                  │
    ╞══════════╪════════════╪═════════╪══════╪═══╪═══════════╪════════════════╪═════════════════╪══════════════════════╡
    │ 2        ┆ 2023-07-01 ┆ B       ┆ 0    ┆ … ┆ 0.0       ┆ null           ┆ 0.0             ┆ null                 │
    │ 2        ┆ 2023-07-02 ┆ B       ┆ 0    ┆ … ┆ 0.0       ┆ null           ┆ 0.0             ┆ null                 │
    │ 2        ┆ 2023-07-03 ┆ B       ┆ 0    ┆ … ┆ 0.0       ┆ null           ┆ 0.0             ┆ null                 │
    │ 2        ┆ 2023-07-04 ┆ B       ┆ 0    ┆ … ┆ 0.0       ┆ null           ┆ 0.0             ┆ null                 │
    │ 2        ┆ 2023-07-05 ┆ B       ┆ 0    ┆ … ┆ 1.0       ┆ null           ┆ 0.0             ┆ null                 │
    │ 2        ┆ 2023-07-06 ┆ B       ┆ 0    ┆ … ┆ 0.0       ┆ null           ┆ 0.0             ┆ null                 │
    │ 2        ┆ 2023-07-07 ┆ B       ┆ 0    ┆ … ┆ 0.0       ┆ null           ┆ 0.0             ┆ null                 │
    │ 1        ┆ 2023-07-01 ┆ A       ┆ 0    ┆ … ┆ 0.0       ┆ null           ┆ 0.5             ┆ 1.0                  │
    │ 1        ┆ 2023-07-02 ┆ A       ┆ 0    ┆ … ┆ 0.0       ┆ null           ┆ 0.0             ┆ null                 │
    │ 1        ┆ 2023-07-03 ┆ A       ┆ 0    ┆ … ┆ 0.0       ┆ null           ┆ 0.0             ┆ null                 │
    │ 1        ┆ 2023-07-04 ┆ A       ┆ 0    ┆ … ┆ 0.0       ┆ null           ┆ 0.0             ┆ null                 │
    │ 1        ┆ 2023-07-05 ┆ A       ┆ 0    ┆ … ┆ 0.0       ┆ null           ┆ 0.0             ┆ null                 │
    │ 1        ┆ 2023-07-06 ┆ A       ┆ 0    ┆ … ┆ 0.0       ┆ null           ┆ 0.5             ┆ null                 │
    │ 1        ┆ 2023-07-07 ┆ A       ┆ 0    ┆ … ┆ 0.0       ┆ null           ┆ 0.0             ┆ null                 │
    └──────────┴────────────┴─────────┴──────┴───┴───────────┴────────────────┴─────────────────┴──────────────────────┘
    

    As for the dict output, you could perhaps use .partition_by() or iterate over the groups and use .to_dict()

    for category, group in groups:
       values = (
          group.head(7)
            .select(pl.col('date').dt.to_string('%Y-%m-%d'), 'converted_shift', 'converted_stlw_shift')
            .to_dict(as_series=False)
       )
       { category: values }
    
    {1: {'date': ['2023-07-01',
       '2023-07-02',
       '2023-07-03',
       '2023-07-04',
       '2023-07-05',
       '2023-07-06',
       '2023-07-07'],
      'converted_shift': [0.5, 0.0, 0.0, 0.0, 0.0, 0.5, 0.0],
      'converted_stlw_shift': [1.0, None, None, None, None, None, None]}}
    {2: {'date': ['2023-07-01',
       '2023-07-02',
       '2023-07-03',
       '2023-07-04',
       '2023-07-05',
       '2023-07-06',
       '2023-07-07'],
      'converted_shift': [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0],
      'converted_stlw_shift': [None, None, None, None, None, None, None]}}
    

    You could modify the dict to fit your exact needs.