Search code examples
pythonpython-polars

Dynamically populating column in Polars


I am trying to translate a function I use to calculate skew over each feature of a dataframe. I calculate 2 skews, one for the feature when date == d, and one over the data when date <= d. I was able to translate the first line of the following code but not the second. The code to recreate a dataframe :

import random
date = [random.choice([1,2,3]) for x in range(0,100)]
feature1 = [random.gauss(0,1) for x in range(0,100)]
feature2 = [random.gauss(0,2) for x in range(0,100)]
df = pd.DataFrame({"date":date,'feature1':feature1,'feature2':feature2})

Here is the pandas function:

def features_augment(df):
    dff = df.copy()
    for col,d in itertools.product(dff.columns[2:],dff.date.unique()):
        dff.loc[dff.date==d,'sk_'+col] = scipy.stats.skew(dff.loc[dff.date==d,col]) #translated in polars
        dff.loc[dff.date==d,'rsk_'+col] = scipy.stats.skew(dff.loc[dff.date<=d,col]) #couldn't translate
    return dff

Here is my polars function so far:

def pl_feature_augment(df):
    pl_df = pl.from_pandas(df)
    sk = pl_df.groupby("date").agg(pl.all().exclude("id","volvol").skew().prefix("sk_")) #calculate skew on for each feature on each date 
    pl_df = pl_df.join(sk,"date")
    for d in pl_df.select(pl.col("date")).unique():
        pl_df.with_columns(pl.when(pl.col("date")<=d).then(pl.all().exclude("id","vol","volvol").skew()).prefix("rsk_")) #doesn't work 

return pl_df.to_pandas()

I managed to avoid the for loop for the first expression but can't figure out how to translate the second one even with a for loop...


Solution

  • Adding an extra feature column for example purposes:

    pl_df = pl.from_repr("""
    ┌──────┬───────────┬───────────┬───────────┐
    │ date ┆ feature1  ┆ feature2  ┆ feature3  │
    │ ---  ┆ ---       ┆ ---       ┆ ---       │
    │ i64  ┆ f64       ┆ f64       ┆ f64       │
    ╞══════╪═══════════╪═══════════╪═══════════╡
    │ 3    ┆ -0.71728  ┆ 0.149132  ┆ 1.001409  │
    │ 1    ┆ 1.303821  ┆ -1.621259 ┆ -0.261638 │
    │ 1    ┆ -0.180873 ┆ -0.777132 ┆ 0.967881  │
    │ 1    ┆ -0.438287 ┆ 0.304117  ┆ 0.736307  │
    │ 2    ┆ -0.489215 ┆ -1.239215 ┆ 2.767833  │
    │ 3    ┆ 0.209194  ┆ -1.106649 ┆ -0.73973  │
    │ 1    ┆ 0.516778  ┆ 0.26425   ┆ 2.80841   │
    │ 1    ┆ 0.482522  ┆ 1.146356  ┆ -0.752033 │
    │ 3    ┆ -0.760755 ┆ 0.211106  ┆ 0.00278   │
    │ 2    ┆ 0.665081  ┆ -1.140646 ┆ 3.009091  │
    │ 3    ┆ 0.027294  ┆ 1.385364  ┆ 1.848935  │
    │ 1    ┆ -0.109353 ┆ 2.698282  ┆ -1.080152 │
    │ 3    ┆ -0.50849  ┆ -3.916603 ┆ -4.422061 │
    │ 1    ┆ -1.273324 ┆ -0.0369   ┆ -3.589582 │
    │ 2    ┆ -0.926717 ┆ -2.338863 ┆ 1.815631  │
    │ 1    ┆ -0.906546 ┆ 1.224383  ┆ 0.145775  │
    │ 3    ┆ -0.459142 ┆ 1.01782   ┆ -1.900232 │
    │ 1    ┆ 0.371648  ┆ 3.509538  ┆ -0.654882 │
    │ 2    ┆ 0.958224  ┆ -2.368278 ┆ 0.618381  │
    │ 1    ┆ -0.235959 ┆ 1.326076  ┆ 5.704977  │
    └──────┴───────────┴───────────┴───────────┘
    """)
    

    I'm not sure if it's possible to do this in a single pass using existing groupby/join methods.

    (It seems like it can be expressed as a non-equi join which are on the TODO list.)

    We can build a sequence of LazyFrames which allows Polars to run the computations in parallel:

    pl_df = pl.from_pandas(df).lazy()
    
    dates = pl_df.select("date").unique().collect().iter_rows()
    
    skews = (
       pl_df.filter(pl.col("date") <= date)
            .select(
               pl.lit(date, int).alias("date"),
               pl.exclude("date", "id", "volvol").skew().prefix("rsk_")
            )
       for date in dates
    )
    
    (pl_df
       .with_columns(
          pl.exclude("date", "id", "volvol").skew().over("date").prefix("sk_")
       )
       .join(
          pl.concat(skews), 
          on = "date", 
          how = "left"
       )
       .collect()
    )
    
    shape: (20, 10)
    ┌──────┬───────────┬───────────┬───────────┬─────────────┬─────────────┬─────────────┬──────────────┬──────────────┬──────────────┐
    │ date ┆ feature1  ┆ feature2  ┆ feature3  ┆ sk_feature1 ┆ sk_feature2 ┆ sk_feature3 ┆ rsk_feature1 ┆ rsk_feature2 ┆ rsk_feature3 │
    │ ---  ┆ ---       ┆ ---       ┆ ---       ┆ ---         ┆ ---         ┆ ---         ┆ ---          ┆ ---          ┆ ---          │
    │ i64  ┆ f64       ┆ f64       ┆ f64       ┆ f64         ┆ f64         ┆ f64         ┆ f64          ┆ f64          ┆ f64          │
    ╞══════╪═══════════╪═══════════╪═══════════╪═════════════╪═════════════╪═════════════╪══════════════╪══════════════╪══════════════╡
    │ 3    ┆ -0.71728  ┆ 0.149132  ┆ 1.001409  ┆ 0.511999    ┆ -1.094016   ┆ -0.607669   ┆ 0.366071     ┆ -0.106414    ┆ 0.019972     │
    │ 1    ┆ 1.303821  ┆ -1.621259 ┆ -0.261638 ┆ 0.088205    ┆ 0.25369     ┆ 0.715095    ┆ 0.088205     ┆ 0.25369      ┆ 0.715095     │
    │ 1    ┆ -0.180873 ┆ -0.777132 ┆ 0.967881  ┆ 0.088205    ┆ 0.25369     ┆ 0.715095    ┆ 0.088205     ┆ 0.25369      ┆ 0.715095     │
    │ 1    ┆ -0.438287 ┆ 0.304117  ┆ 0.736307  ┆ 0.088205    ┆ 0.25369     ┆ 0.715095    ┆ 0.088205     ┆ 0.25369      ┆ 0.715095     │
    │ 2    ┆ -0.489215 ┆ -1.239215 ┆ 2.767833  ┆ -0.062774   ┆ 0.009747    ┆ -0.517757   ┆ 0.052201     ┆ 0.364357     ┆ 0.214142     │
    │ 3    ┆ 0.209194  ┆ -1.106649 ┆ -0.73973  ┆ 0.511999    ┆ -1.094016   ┆ -0.607669   ┆ 0.366071     ┆ -0.106414    ┆ 0.019972     │
    │ 1    ┆ 0.516778  ┆ 0.26425   ┆ 2.80841   ┆ 0.088205    ┆ 0.25369     ┆ 0.715095    ┆ 0.088205     ┆ 0.25369      ┆ 0.715095     │
    │ 1    ┆ 0.482522  ┆ 1.146356  ┆ -0.752033 ┆ 0.088205    ┆ 0.25369     ┆ 0.715095    ┆ 0.088205     ┆ 0.25369      ┆ 0.715095     │
    │ 3    ┆ -0.760755 ┆ 0.211106  ┆ 0.00278   ┆ 0.511999    ┆ -1.094016   ┆ -0.607669   ┆ 0.366071     ┆ -0.106414    ┆ 0.019972     │
    │ 2    ┆ 0.665081  ┆ -1.140646 ┆ 3.009091  ┆ -0.062774   ┆ 0.009747    ┆ -0.517757   ┆ 0.052201     ┆ 0.364357     ┆ 0.214142     │
    │ 3    ┆ 0.027294  ┆ 1.385364  ┆ 1.848935  ┆ 0.511999    ┆ -1.094016   ┆ -0.607669   ┆ 0.366071     ┆ -0.106414    ┆ 0.019972     │
    │ 1    ┆ -0.109353 ┆ 2.698282  ┆ -1.080152 ┆ 0.088205    ┆ 0.25369     ┆ 0.715095    ┆ 0.088205     ┆ 0.25369      ┆ 0.715095     │
    │ 3    ┆ -0.50849  ┆ -3.916603 ┆ -4.422061 ┆ 0.511999    ┆ -1.094016   ┆ -0.607669   ┆ 0.366071     ┆ -0.106414    ┆ 0.019972     │
    │ 1    ┆ -1.273324 ┆ -0.0369   ┆ -3.589582 ┆ 0.088205    ┆ 0.25369     ┆ 0.715095    ┆ 0.088205     ┆ 0.25369      ┆ 0.715095     │
    │ 2    ┆ -0.926717 ┆ -2.338863 ┆ 1.815631  ┆ -0.062774   ┆ 0.009747    ┆ -0.517757   ┆ 0.052201     ┆ 0.364357     ┆ 0.214142     │
    │ 1    ┆ -0.906546 ┆ 1.224383  ┆ 0.145775  ┆ 0.088205    ┆ 0.25369     ┆ 0.715095    ┆ 0.088205     ┆ 0.25369      ┆ 0.715095     │
    │ 3    ┆ -0.459142 ┆ 1.01782   ┆ -1.900232 ┆ 0.511999    ┆ -1.094016   ┆ -0.607669   ┆ 0.366071     ┆ -0.106414    ┆ 0.019972     │
    │ 1    ┆ 0.371648  ┆ 3.509538  ┆ -0.654882 ┆ 0.088205    ┆ 0.25369     ┆ 0.715095    ┆ 0.088205     ┆ 0.25369      ┆ 0.715095     │
    │ 2    ┆ 0.958224  ┆ -2.368278 ┆ 0.618381  ┆ -0.062774   ┆ 0.009747    ┆ -0.517757   ┆ 0.052201     ┆ 0.364357     ┆ 0.214142     │
    │ 1    ┆ -0.235959 ┆ 1.326076  ┆ 5.704977  ┆ 0.088205    ┆ 0.25369     ┆ 0.715095    ┆ 0.088205     ┆ 0.25369      ┆ 0.715095     │
    └──────┴───────────┴───────────┴───────────┴─────────────┴─────────────┴─────────────┴──────────────┴──────────────┴──────────────┘
    

    As we're using the Lazy API here, pl.concat(skews) is parallelized.