I am trying to translate a function I use to calculate skew over each feature of a dataframe.
I calculate 2 skews, one for the feature when date == d
, and one over the data when date <= d
.
I was able to translate the first line of the following code but not the second.
The code to recreate a dataframe :
import random
date = [random.choice([1,2,3]) for x in range(0,100)]
feature1 = [random.gauss(0,1) for x in range(0,100)]
feature2 = [random.gauss(0,2) for x in range(0,100)]
df = pd.DataFrame({"date":date,'feature1':feature1,'feature2':feature2})
Here is the pandas function:
def features_augment(df):
dff = df.copy()
for col,d in itertools.product(dff.columns[2:],dff.date.unique()):
dff.loc[dff.date==d,'sk_'+col] = scipy.stats.skew(dff.loc[dff.date==d,col]) #translated in polars
dff.loc[dff.date==d,'rsk_'+col] = scipy.stats.skew(dff.loc[dff.date<=d,col]) #couldn't translate
return dff
Here is my polars function so far:
def pl_feature_augment(df):
pl_df = pl.from_pandas(df)
sk = pl_df.groupby("date").agg(pl.all().exclude("id","volvol").skew().prefix("sk_")) #calculate skew on for each feature on each date
pl_df = pl_df.join(sk,"date")
for d in pl_df.select(pl.col("date")).unique():
pl_df.with_columns(pl.when(pl.col("date")<=d).then(pl.all().exclude("id","vol","volvol").skew()).prefix("rsk_")) #doesn't work
return pl_df.to_pandas()
I managed to avoid the for
loop for the first expression but can't figure out how to translate the second one even with a for
loop...
Adding an extra feature column for example purposes:
pl_df = pl.from_repr("""
┌──────┬───────────┬───────────┬───────────┐
│ date ┆ feature1 ┆ feature2 ┆ feature3 │
│ --- ┆ --- ┆ --- ┆ --- │
│ i64 ┆ f64 ┆ f64 ┆ f64 │
╞══════╪═══════════╪═══════════╪═══════════╡
│ 3 ┆ -0.71728 ┆ 0.149132 ┆ 1.001409 │
│ 1 ┆ 1.303821 ┆ -1.621259 ┆ -0.261638 │
│ 1 ┆ -0.180873 ┆ -0.777132 ┆ 0.967881 │
│ 1 ┆ -0.438287 ┆ 0.304117 ┆ 0.736307 │
│ 2 ┆ -0.489215 ┆ -1.239215 ┆ 2.767833 │
│ 3 ┆ 0.209194 ┆ -1.106649 ┆ -0.73973 │
│ 1 ┆ 0.516778 ┆ 0.26425 ┆ 2.80841 │
│ 1 ┆ 0.482522 ┆ 1.146356 ┆ -0.752033 │
│ 3 ┆ -0.760755 ┆ 0.211106 ┆ 0.00278 │
│ 2 ┆ 0.665081 ┆ -1.140646 ┆ 3.009091 │
│ 3 ┆ 0.027294 ┆ 1.385364 ┆ 1.848935 │
│ 1 ┆ -0.109353 ┆ 2.698282 ┆ -1.080152 │
│ 3 ┆ -0.50849 ┆ -3.916603 ┆ -4.422061 │
│ 1 ┆ -1.273324 ┆ -0.0369 ┆ -3.589582 │
│ 2 ┆ -0.926717 ┆ -2.338863 ┆ 1.815631 │
│ 1 ┆ -0.906546 ┆ 1.224383 ┆ 0.145775 │
│ 3 ┆ -0.459142 ┆ 1.01782 ┆ -1.900232 │
│ 1 ┆ 0.371648 ┆ 3.509538 ┆ -0.654882 │
│ 2 ┆ 0.958224 ┆ -2.368278 ┆ 0.618381 │
│ 1 ┆ -0.235959 ┆ 1.326076 ┆ 5.704977 │
└──────┴───────────┴───────────┴───────────┘
""")
I'm not sure if it's possible to do this in a single pass using existing groupby/join methods.
(It seems like it can be expressed as a non-equi join which are on the TODO list.)
We can build a sequence of LazyFrames which allows Polars to run the computations in parallel:
pl_df = pl.from_pandas(df).lazy()
dates = pl_df.select("date").unique().collect().iter_rows()
skews = (
pl_df.filter(pl.col("date") <= date)
.select(
pl.lit(date, int).alias("date"),
pl.exclude("date", "id", "volvol").skew().prefix("rsk_")
)
for date in dates
)
(pl_df
.with_columns(
pl.exclude("date", "id", "volvol").skew().over("date").prefix("sk_")
)
.join(
pl.concat(skews),
on = "date",
how = "left"
)
.collect()
)
shape: (20, 10)
┌──────┬───────────┬───────────┬───────────┬─────────────┬─────────────┬─────────────┬──────────────┬──────────────┬──────────────┐
│ date ┆ feature1 ┆ feature2 ┆ feature3 ┆ sk_feature1 ┆ sk_feature2 ┆ sk_feature3 ┆ rsk_feature1 ┆ rsk_feature2 ┆ rsk_feature3 │
│ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- │
│ i64 ┆ f64 ┆ f64 ┆ f64 ┆ f64 ┆ f64 ┆ f64 ┆ f64 ┆ f64 ┆ f64 │
╞══════╪═══════════╪═══════════╪═══════════╪═════════════╪═════════════╪═════════════╪══════════════╪══════════════╪══════════════╡
│ 3 ┆ -0.71728 ┆ 0.149132 ┆ 1.001409 ┆ 0.511999 ┆ -1.094016 ┆ -0.607669 ┆ 0.366071 ┆ -0.106414 ┆ 0.019972 │
│ 1 ┆ 1.303821 ┆ -1.621259 ┆ -0.261638 ┆ 0.088205 ┆ 0.25369 ┆ 0.715095 ┆ 0.088205 ┆ 0.25369 ┆ 0.715095 │
│ 1 ┆ -0.180873 ┆ -0.777132 ┆ 0.967881 ┆ 0.088205 ┆ 0.25369 ┆ 0.715095 ┆ 0.088205 ┆ 0.25369 ┆ 0.715095 │
│ 1 ┆ -0.438287 ┆ 0.304117 ┆ 0.736307 ┆ 0.088205 ┆ 0.25369 ┆ 0.715095 ┆ 0.088205 ┆ 0.25369 ┆ 0.715095 │
│ 2 ┆ -0.489215 ┆ -1.239215 ┆ 2.767833 ┆ -0.062774 ┆ 0.009747 ┆ -0.517757 ┆ 0.052201 ┆ 0.364357 ┆ 0.214142 │
│ 3 ┆ 0.209194 ┆ -1.106649 ┆ -0.73973 ┆ 0.511999 ┆ -1.094016 ┆ -0.607669 ┆ 0.366071 ┆ -0.106414 ┆ 0.019972 │
│ 1 ┆ 0.516778 ┆ 0.26425 ┆ 2.80841 ┆ 0.088205 ┆ 0.25369 ┆ 0.715095 ┆ 0.088205 ┆ 0.25369 ┆ 0.715095 │
│ 1 ┆ 0.482522 ┆ 1.146356 ┆ -0.752033 ┆ 0.088205 ┆ 0.25369 ┆ 0.715095 ┆ 0.088205 ┆ 0.25369 ┆ 0.715095 │
│ 3 ┆ -0.760755 ┆ 0.211106 ┆ 0.00278 ┆ 0.511999 ┆ -1.094016 ┆ -0.607669 ┆ 0.366071 ┆ -0.106414 ┆ 0.019972 │
│ 2 ┆ 0.665081 ┆ -1.140646 ┆ 3.009091 ┆ -0.062774 ┆ 0.009747 ┆ -0.517757 ┆ 0.052201 ┆ 0.364357 ┆ 0.214142 │
│ 3 ┆ 0.027294 ┆ 1.385364 ┆ 1.848935 ┆ 0.511999 ┆ -1.094016 ┆ -0.607669 ┆ 0.366071 ┆ -0.106414 ┆ 0.019972 │
│ 1 ┆ -0.109353 ┆ 2.698282 ┆ -1.080152 ┆ 0.088205 ┆ 0.25369 ┆ 0.715095 ┆ 0.088205 ┆ 0.25369 ┆ 0.715095 │
│ 3 ┆ -0.50849 ┆ -3.916603 ┆ -4.422061 ┆ 0.511999 ┆ -1.094016 ┆ -0.607669 ┆ 0.366071 ┆ -0.106414 ┆ 0.019972 │
│ 1 ┆ -1.273324 ┆ -0.0369 ┆ -3.589582 ┆ 0.088205 ┆ 0.25369 ┆ 0.715095 ┆ 0.088205 ┆ 0.25369 ┆ 0.715095 │
│ 2 ┆ -0.926717 ┆ -2.338863 ┆ 1.815631 ┆ -0.062774 ┆ 0.009747 ┆ -0.517757 ┆ 0.052201 ┆ 0.364357 ┆ 0.214142 │
│ 1 ┆ -0.906546 ┆ 1.224383 ┆ 0.145775 ┆ 0.088205 ┆ 0.25369 ┆ 0.715095 ┆ 0.088205 ┆ 0.25369 ┆ 0.715095 │
│ 3 ┆ -0.459142 ┆ 1.01782 ┆ -1.900232 ┆ 0.511999 ┆ -1.094016 ┆ -0.607669 ┆ 0.366071 ┆ -0.106414 ┆ 0.019972 │
│ 1 ┆ 0.371648 ┆ 3.509538 ┆ -0.654882 ┆ 0.088205 ┆ 0.25369 ┆ 0.715095 ┆ 0.088205 ┆ 0.25369 ┆ 0.715095 │
│ 2 ┆ 0.958224 ┆ -2.368278 ┆ 0.618381 ┆ -0.062774 ┆ 0.009747 ┆ -0.517757 ┆ 0.052201 ┆ 0.364357 ┆ 0.214142 │
│ 1 ┆ -0.235959 ┆ 1.326076 ┆ 5.704977 ┆ 0.088205 ┆ 0.25369 ┆ 0.715095 ┆ 0.088205 ┆ 0.25369 ┆ 0.715095 │
└──────┴───────────┴───────────┴───────────┴─────────────┴─────────────┴─────────────┴──────────────┴──────────────┴──────────────┘
As we're using the Lazy API here, pl.concat(skews)
is parallelized.