I have a task at hand where I want to regress transformed columns on sets of specified columns in a polars DataFrame. The transformation and the set of independent columns are both controlled by a `specs` dict.
Below is a simplified mini example for illustrating purposes.
from functools import partial
import polars as pl
import numpy as np
def ols_fitted(s: pl.Series, yvar: str, xvars: list[str]) -> pl.Series:
    """Fit OLS of *yvar* on *xvars* within one struct series and return the fitted values.

    The struct series is unnested into a frame; the least-squares coefficients
    are computed with numpy and applied back to the design matrix. NaNs in the
    fitted values are converted to polars nulls.
    """
    frame = s.struct.unnest()
    target = frame[yvar].to_numpy()
    design = frame[xvars].to_numpy()
    # lstsq returns (coefficients, residuals, rank, singular values); we only
    # need the coefficients to produce in-sample fitted values.
    coefs, *_ = np.linalg.lstsq(design, target, rcond=None)
    return pl.Series(values=design @ coefs, nan_to_null=True)
# Toy data: rows are grouped by ("date", "id") for per-window regressions,
# with candidate grouping columns g1-g3 and regressors x1-x3.
df = pl.DataFrame(
    {
        "date": [1, 1, 1, 1, 2, 2, 2, 2, 2, 2],
        "id": [1, 1, 1, 2, 2, 2, 2, 3, 3, 3],
        "y": [1, 2, 3, 4, 5, 6, 7, 8, 9, 10],
        "g1": [1, 1, 1, 1, 1, 2, 2, 2, 2, 2],
        "g2": [1, 1, 1, 2, 2, 2, 3, 3, 3, 3],
        "g3": [1, 1, 2, 2, 2, 3, 3, 4, 4, 4],
        "x1": [2, 5, 4, 7, 3, 2, 5, 6, 7, 2],
        "x2": [1, 5, 3, 4, 5, 6, 4, 3, 2, 1],
        "x3": [3, 6, 8, 6, 4, 7, 5, 4, 8, 1],
    }
)
# Each spec names the dependent variable ("yvar"), the columns used to
# group-demean it ("gvars"), and the regressors ("xvars").
specs = {
    "first": {"yvar": "y", "gvars": ["g1"], "xvars": ["x1"]},
    "second": {"yvar": "y", "gvars": ["g1", "g2"], "xvars": ["x1", "x2"]},
    "third": {"yvar": "y", "gvars": ["g2", "g3"], "xvars": ["x2", "x3"]},
}
# NOTE: this snippet RAISES InvalidOperationError (see the error below).
# The inner `.mean().over(gvars)` window ends up being evaluated inside the
# outer `.over("date", "id")` aggregation context, and polars does not allow
# a window expression within an aggregation.
df.with_columns(
    # For each spec: pack |y - group-mean(y)| plus the regressors into a
    # struct, then try to fit OLS per ("date", "id") window.
    pl.struct(
        (
            pl.col(specs[specnm]["yvar"])
            - pl.col(specs[specnm]["yvar"]).mean().over(specs[specnm]["gvars"])
        ).abs(),
        *specs[specnm]["xvars"],
    )
    .map_elements(
        partial(
            ols_fitted, yvar=specs[specnm]["yvar"], xvars=specs[specnm]["xvars"]
        )
    )
    .over("date", "id")
    .alias(f"fitted_{specnm}")
    for specnm in list(specs.keys())
)
However, I got the error below:
InvalidOperationError: window expression not allowed in aggregation
I am not sure why `over` is not supported within an aggregation context; it would be very convenient if it were, as in my example.
So my real question is: how do I handle this in my particular case? And if it cannot be handled, is there an alternative way to make my code work systematically?
I think I figured it out. When you can't use window functions (`over`), you just have to be explicit and do a `group_by`, `map_elements`, and `explode` by hand:
# Workaround: replace the disallowed `.over("date", "id")` window around
# map_elements with an explicit group_by/agg, then explode the per-group
# fitted lists back into one row per observation.
df.lazy().with_columns(
    # Stage 1: one struct column per spec. The inner `.mean().over(gvars)`
    # window is fine here because this with_columns is a plain
    # (non-aggregation) context.
    pl.struct(
        (
            pl.col(specs[specnm]["yvar"])
            - pl.col(specs[specnm]["yvar"])
            .mean()
            .over(specs[specnm]["gvars"])
        ).abs(),
        *specs[specnm]["xvars"],
    ).alias(f"pre_{specnm}")
    for specnm in list(specs.keys())
).group_by("date", "id", maintain_order=True).agg(
    # Stage 2: fit OLS once per ("date", "id") group on each struct column;
    # each aggregated value is a list of fitted values for that group.
    *[
        pl.col(f"pre_{specnm}").map_elements(
            partial(
                ols_fitted,
                yvar=specs[specnm]["yvar"],
                xvars=specs[specnm]["xvars"],
            )
        )
        for specnm in list(specs.keys())
    ],
    # Carry every other column through the aggregation (as lists) so the
    # explode below can restore the original row layout.
    pl.exclude([f"pre_{specnm}" for specnm in list(specs.keys())]),
).explode(
    # Stage 3: flatten all per-group lists back to one row per observation.
    pl.exclude("date", "id")
).collect()
Note that I made it lazy, in line with best practices.