This is a continuation of a question I asked earlier, but I thought it deserved its own question.
Is it possible to use a ufunc in polars with 3 or more expressions, where the expressions are of different lengths?
This can be currently done with 2 expressions.
g(pl.col('x'), pl.col('y'))
For 3 or more expressions, the current workaround is to wrap the 3 columns in a struct, then map using a lambda. However, this is only possible when x, y and z have the same lengths; otherwise the struct cannot be created.
pl.struct('x', 'y', 'z').map(lambda t: g(t.struct['x'], t.struct['y'], t.struct['z']))
For illustrative purposes, I have replicated the functionality of join_asof with 2 expressions.
X: shape: (5, 1)
┌─────┐
│ x │
│ --- │
│ i64 │
╞═════╡
│ 1 │
│ 3 │
│ 5 │
│ 7 │
│ 9 │
└─────┘
Y: shape: (3, 1)
┌─────┐
│ y │
│ --- │
│ i64 │
╞═════╡
│ 2 │
│ 6 │
│ 10 │
└─────┘
shape: (5, 2)
┌─────┬───────────────┐
│ x ┆ x_join_asof_y │
│ --- ┆ --- │
│ i64 ┆ i64 │
╞═════╪═══════════════╡
│ 1 ┆ 2 │
│ 3 ┆ 6 │
│ 5 ┆ 6 │
│ 7 ┆ 10 │
│ 9 ┆ 10 │
└─────┴───────────────┘
import numba as nb
import polars as pl
@nb.guvectorize([(nb.int64[:], nb.int64[:], nb.int64[:])], '(n),(m)->(n)', nopython=True)
def join_asof(x, y, results):
    # Replicates an asof join with the "forward" strategy: for each x[i],
    # results[i] is the first y value strictly greater than x[i].
    # Assumes x and y are both sorted ascending, and that y's last element
    # exceeds every x value.
    # NOTE(review): if some x[i] >= y[-1], j walks past the end of y;
    # numba's bounds checking is off by default, so this would silently
    # read out of bounds rather than raise -- confirm inputs guarantee
    # y[-1] > max(x).
    j = 0
    for i in range(len(x)):
        while y[j] <= x[i]:
            j += 1
        results[i] = y[j]
# Two frames of different lengths: 'x' has 5 rows, 'y' has 3.
xdf = pl.DataFrame({ 'x': [1, 3, 5, 7, 9] })
ydf = pl.DataFrame({ 'y': [2, 6, 10] })
(
    xdf
    .lazy()
    # with_context makes ydf's columns resolvable inside xdf's query, so
    # pl.col('y') below refers to the other frame's (shorter) column.
    .with_context(
        ydf
        .lazy()
    )
    .with_columns(
        # The gufunc signature '(n),(m)->(n)' lets the two expressions
        # have different lengths; the result has x's length.
        join_asof(pl.col('x'), pl.col('y')).alias('x_join_asof_y')
    )
    .collect()
)
It's basically the same trick, but with implode on the inside of the struct creation and explode in the lambda.
Say we had
@nb.guvectorize([(nb.int64[:], nb.int64[:], nb.int64[:], nb.int64[:])],
                '(n),(m),(o)->(n)', nopython=True)
def join_asof(x, y, o, results):
    # Asof-joins x against y (forward strategy) and adds the o value at
    # index i, clamped to o's last valid index.
    # Assumes x and y are sorted ascending and y's last element exceeds
    # every x value (otherwise j would walk past the end of y).
    j = 0
    for i in range(len(x)):
        # advance to the first y strictly greater than x[i]
        while y[j] <= x[i]:
            j += 1
        # Clamp i so we never index past the end of o. This must be
        # min(), not max(): max(len(o)-1, i) evaluates to i whenever
        # i >= len(o)-1, so for i >= len(o) it reads out of bounds --
        # and numba's bounds checking is off by default, so the read
        # would silently return garbage instead of raising.
        z = min(len(o) - 1, i)
        results[i] = y[j] + o[z]
xdf = pl.DataFrame({ 'x': [1, 3, 5, 7, 9] })
# 'y' and 'z' live in a second frame with a different (shorter) length,
# so they cannot be combined with 'x' in a plain struct.
ydf = pl.DataFrame({ 'y': [2, 6, 10],
                     'z': [2, 5, 7]})
Then we can do this:
(
    xdf
    .lazy()
    # expose ydf's columns ('y', 'z') inside the query on xdf
    .with_context(
        ydf
        .lazy()
    )
    .with_columns(
        # implode() wraps each column into a single-element list Series,
        # so the struct can be built even though 'x' has 5 rows while
        # 'y' and 'z' have 3.
        newcol=pl.struct(
            pl.col('x').implode(),
            pl.col('y').implode(),
            pl.col('z').implode()
        )
        # explode() restores each original Series before handing them to
        # the ufunc. NOTE(review): in recent polars versions Expr.map was
        # renamed to Expr.map_batches -- confirm against the installed
        # polars version.
        .map(lambda t: join_asof(
            t.struct['x'].explode(),
            t.struct['y'].explode(),
            t.struct['z'].explode()
        ))
    )
    .collect()
)
Or better yet, you can make a helper function in conjunction with your ufunc to make using it more ergonomic:
def join_asof_expr(x, y, z):
    """Build a polars expression applying the 3-input ``join_asof`` ufunc
    to inputs of (possibly) different lengths.

    Each argument may be a column name (string) or a polars expression.
    Every input is imploded into a one-element list so a struct can be
    formed from unequal lengths, then exploded back inside the mapped
    function before the ufunc is called.
    """
    exprs = [pl.col(arg) if isinstance(arg, str) else arg
             for arg in (x, y, z)]
    # Alias the struct fields to fixed names here so the lambda below
    # works no matter what the inputs are called in the parent frame.
    fields = [expr.implode().alias(name)
              for expr, name in zip(exprs, ('x', 'y', 'z'))]
    return pl.struct(*fields).map(
        lambda t: join_asof(
            t.struct['x'].explode(),
            t.struct['y'].explode(),
            t.struct['z'].explode()
        )
    )
Having that allows you to do this:
(
    xdf
    .lazy()
    # expose ydf's columns ('y', 'z') to the query on xdf
    .with_context(
        ydf
        .lazy()
    )
    .with_columns(
        # the helper hides the implode/struct/map/explode plumbing
        newcol=join_asof_expr('x','y','z')
    )
    .collect()
)
or, since our helper function converts string inputs to columns but otherwise assumes expression inputs, you can monkey patch it and do:
# Monkey patch: since join_asof_expr's first parameter accepts an
# expression, attaching it to pl.Expr lets it be called method-style,
# with the receiver expression bound as the first argument.
pl.Expr.my_expr=join_asof_expr
(
    xdf
    .lazy()
    .with_context(
        ydf
        .lazy()
    )
    .with_columns(
        # equivalent to join_asof_expr(pl.col('x'), 'y', 'z')
        newcol=pl.col('x').my_expr('y','z')
    )
    .collect()
)
Since the helper function aliases the column in the struct, it still works with other column names. Only the order matters.
(
    xdf
    .lazy()
    .with_context(
        ydf
        .lazy()
    )
    .with_columns(
        # Arbitrary expressions work too: the helper re-aliases each
        # struct field internally, so only the argument order matters,
        # not the expressions' output names.
        newcol=pl.col('x').my_expr((pl.col('y')+2).alias('blah'),'z')
    )
    .collect()
)