If I have a polars Dataframe and want to perform masked operations, I currently see two options:
# create data
df = pl.DataFrame([[1, 2, 3, 4], [5, 6, 7, 8]], schema=["a", "b"]).lazy()
# create a second dataframe for added fun
df2 = pl.DataFrame([[8, 6, 7, 5], [15, 16, 17, 18]], schema=["b", "d"]).lazy()
# define mask
mask = pl.col("a").is_between(2, 3)

# Option 1: filter first, compute on the surviving rows only, then
# left-join the result back onto the full frame so unmasked rows get nulls.
filtered = (
    df.filter(mask)
    .with_columns(  # calculate some columns
        pl.col("a").sin().alias("new_1"),
        pl.col("a").cos().alias("new_2"),
        (pl.col("a") / pl.col("b")).alias("new_3"),
    )
    .join(df2, on="b", how="left")  # throw a join into the mix
)
res = df.join(filtered, how="left", on=["a", "b"])
print(res.collect())
# Option 2: keep every row and gate each expression individually with
# pl.when(mask).then(...) — verbose, since the mask repeats per column.
gated = df.with_columns(
    pl.when(mask).then(pl.col("a").sin()).alias("new_1"),
    pl.when(mask).then(pl.col("a").cos()).alias("new_2"),
    pl.when(mask).then(pl.col("a") / pl.col("b")).alias("new_3"),
)
# The mask must also be pushed into the join, which takes a convoluted
# back-and-forth: semi-join df2 against the masked rows first.
res = gated.join(df2.join(df.filter(mask), on="b", how="semi"), on="b", how="left")
print(res.collect())
shape: (4, 6)
┌─────┬─────┬──────────┬───────────┬──────────┬──────┐
│ a ┆ b ┆ new_1 ┆ new_2 ┆ new_3 ┆ d │
│ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- │
│ i64 ┆ i64 ┆ f64 ┆ f64 ┆ f64 ┆ i64 │
╞═════╪═════╪══════════╪═══════════╪══════════╪══════╡
│ 1 ┆ 5 ┆ null ┆ null ┆ null ┆ null │
│ 2 ┆ 6 ┆ 0.909297 ┆ -0.416147 ┆ 0.333333 ┆ 16 │
│ 3 ┆ 7 ┆ 0.14112 ┆ -0.989992 ┆ 0.428571 ┆ 17 │
│ 4 ┆ 8 ┆ null ┆ null ┆ null ┆ null │
└─────┴─────┴──────────┴───────────┴──────────┴──────┘
Most of the time, option 2 will be faster, but it gets pretty verbose and is generally harder to read than option 1 when any sort of complexity is involved.
Is there a way to apply a mask more generically to cover multiple subsequent operations?
You can use a struct together with an unnest.
Your dfs weren't consistent between being lazy and eager, so I'm going to make them both lazy.
(
    df
    .join(df2, on='b')
    # Pack all masked outputs into one struct so a single when/then
    # gates every derived column at once.
    .with_columns(pl.when(mask).then(
        pl.struct(
            pl.col("a").sin().alias("new_1"),
            pl.col("a").cos().alias("new_2"),
            (pl.col("a") / pl.col("b").cast(pl.Float64()))
            .alias("new_3")
        )
        .alias('allcols')
    ))
    .unnest('allcols')
    # Null out the columns that came from df2 wherever the mask is false
    # (the quasi-semi-join replacement behavior).
    # FIX: compare against df.columns, not the LazyFrame itself —
    # `x not in df` only does column-name membership on eager DataFrames.
    .with_columns(
        [pl.when(mask).then(pl.col(x))
         for x in df2.columns if x not in df.columns]
    )
    .collect()
)
I think the heart of your question is how to write a when/then with multiple column outputs, which is covered by the first with_columns; the second with_columns covers the quasi-semi-join value-replacement behavior.
Another way to write it is to first create a list of the columns in df2 that you want to be subject to the mask and then put those in the struct. The unsightly part is that you then have to exclude those columns before you do the unnest.
# Columns contributed by df2 that should also be subject to the mask.
df2_mask_cols = [c for c in df2.columns if c not in df.columns]
(
    df
    .join(df2, on="b")
    .with_columns(
        pl.when(mask).then(
            # One struct holds both the freshly computed columns and the
            # masked pass-through columns from df2.
            pl.struct(
                [
                    pl.col("a").sin().alias("new_1"),
                    pl.col("a").cos().alias("new_2"),
                    (pl.col("a") / pl.col("b").cast(pl.Float64())).alias("new_3"),
                ]
                + df2_mask_cols
            ).alias("allcols")
        )
    )
    # Drop the pre-mask originals so the unnest doesn't collide with them.
    .select(pl.exclude(df2_mask_cols))
    .unnest("allcols")
    .collect()
)
Surprisingly, this approach was fastest:
(
    df
    .join(df2, on="b")
    # Compute everything unconditionally first...
    .with_columns(
        pl.col("a").sin().alias("new_1"),
        pl.col("a").cos().alias("new_2"),
        (pl.col("a") / pl.col("b").cast(pl.Float64())).alias("new_3"),
    )
    # ...then null out every non-df column in one shot where the mask fails.
    .with_columns(pl.when(mask).then(pl.exclude(df.columns)))
    .collect()
)