ufunc with 3 or more expressions of different lengths

This is a continuation of a question I asked earlier, but I thought it deserved its own question.

Is it possible to use a ufunc in polars with 3 or more expressions, where the expressions are of different lengths?

This can be currently done with 2 expressions.

g(pl.col('x'), pl.col('y'))

For 3 or more expressions, the current workaround is to wrap the 3 columns in a struct, then map using a lambda. However, this is only possible when x, y and z have the same lengths, otherwise the struct cannot be created.

pl.struct('x', 'y', 'z').map(lambda t: g(t.struct['x'], t.struct['y'], t.struct['z']))

For illustrative purposes, I have replicated the functionality of join_asof with 2 expressions.

X: shape: (5, 1)
┌─────┐
│ x   │
│ --- │
│ i64 │
╞═════╡
│ 1   │
│ 3   │
│ 5   │
│ 7   │
│ 9   │
└─────┘
Y: shape: (3, 1)
┌─────┐
│ y   │
│ --- │
│ i64 │
╞═════╡
│ 2   │
│ 6   │
│ 10  │
└─────┘
shape: (5, 2)
┌─────┬───────────────┐
│ x   ┆ x_join_asof_y │
│ --- ┆ ---           │
│ i64 ┆ i64           │
╞═════╪═══════════════╡
│ 1   ┆ 2             │
│ 3   ┆ 6             │
│ 5   ┆ 6             │
│ 7   ┆ 10            │
│ 9   ┆ 10            │
└─────┴───────────────┘

import numba as nb
import polars as pl

@nb.guvectorize([(nb.int64[:], nb.int64[:], nb.int64[:])], '(n),(m)->(n)', nopython=True)
def join_asof(x, y, results):
    """Forward as-of lookup: for each x[i], write the first y value > x[i].

    A single cursor ``j`` walks ``y`` and is never reset, so both ``x`` and
    ``y`` are expected to be sorted ascending.

    Parameters
    ----------
    x : int64[:]   lookup keys, length n
    y : int64[:]   candidate values, length m (must be non-empty)
    results : int64[:]   output buffer, length n, filled in place

    When no element of ``y`` is strictly greater than ``x[i]`` the last
    element of ``y`` is written instead of reading past the end of ``y``.
    """
    last = len(y) - 1
    j = 0
    for i in range(len(x)):
        # Stop at the final element: without the bound, x[i] >= max(y)
        # advances j beyond y, which nopython mode does not bounds-check.
        while j < last and y[j] <= x[i]:
            j += 1
        results[i] = y[j]

# Two single-column frames of different heights: 5 rows vs 3 rows.
xdf = pl.DataFrame({'x': [1, 3, 5, 7, 9]})
ydf = pl.DataFrame({'y': [2, 6, 10]})

# ydf is attached as context so that pl.col('y') can be referenced from an
# expression evaluated on xdf.
(
    xdf.lazy()
    .with_context(ydf.lazy())
    .with_columns(join_asof(pl.col('x'), pl.col('y')).alias('x_join_asof_y'))
    .collect()
)

Solution

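It's basically the same trick, but with implode inside the struct creation and explode inside the lambda. implode collapses each column into a single list value, so every struct field has length 1 and the struct can be built even though x, y and z have different lengths; explode then restores each full-length series before it is passed to the ufunc.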

Say we had

@nb.guvectorize([(nb.int64[:], nb.int64[:], nb.int64[:], nb.int64[:])], 
               '(n),(m),(o)->(n)', nopython=True)
def join_asof(x, y, o, results):
    """Forward as-of lookup of x in y, plus an offset taken from o.

    For each x[i], finds the first y value > x[i] (one forward-only cursor,
    so x and y are expected to be sorted ascending) and adds ``o[k]``,
    where ``k`` is ``i`` clamped to the last valid index of ``o``.

    Parameters
    ----------
    x : int64[:]   lookup keys, length n
    y : int64[:]   candidate values, length m (must be non-empty)
    o : int64[:]   offsets, any length >= 1
    results : int64[:]   output buffer, length n, filled in place

    When no element of ``y`` is strictly greater than ``x[i]`` the last
    element of ``y`` is used instead of reading past the end of ``y``.
    """
    last_y = len(y) - 1
    last_o = len(o) - 1
    j = 0
    for i in range(len(x)):
        # Bound the scan so x[i] >= max(y) cannot push j beyond y.
        while j < last_y and y[j] <= x[i]:
            j += 1
        # Clamp with min, not max: max(len(o)-1, i) is never below the last
        # index and runs off the end of o as soon as i >= len(o).
        k = min(last_o, i)
        results[i] = y[j] + o[k]

# x has 5 rows; y and z share a frame and have 3 rows each.
xdf = pl.DataFrame({'x': [1, 3, 5, 7, 9]})
ydf = pl.DataFrame(
    {
        'y': [2, 6, 10],
        'z': [2, 5, 7],
    }
)

Then we can do this:

(
    xdf.lazy()
    .with_context(ydf.lazy())
    .with_columns(
        # Implode each column before packing it into the struct, then
        # explode each field again right before calling the ufunc.
        newcol=pl.struct(
            *[pl.col(name).implode() for name in ('x', 'y', 'z')]
        ).map(
            lambda s: join_asof(
                *[s.struct[name].explode() for name in ('x', 'y', 'z')]
            )
        )
    )
    .collect()
)

Or, better yet, you can write a helper function alongside your ufunc to make it more ergonomic to use:

def join_asof_expr(x, y, z):
    """Build an expression that applies ``join_asof`` to x, y and z.

    Each argument may be a column name (str), which is converted with
    ``pl.col``, or anything else, which is used as-is and assumed to be an
    expression. The inputs are imploded and packed into one struct, then
    exploded again inside the mapped function before ``join_asof`` is called.
    """
    x_expr, y_expr, z_expr = (
        pl.col(arg) if isinstance(arg, str) else arg for arg in (x, y, z)
    )

    # The fields are aliased to fixed names so the unpacking below does not
    # depend on what the inputs are called in the parent frame.
    packed = pl.struct(
        x_expr.implode().alias('x'),
        y_expr.implode().alias('y'),
        z_expr.implode().alias('z'),
    )

    def _unpack_and_call(s):
        fields = s.struct
        return join_asof(
            fields['x'].explode(),
            fields['y'].explode(),
            fields['z'].explode(),
        )

    return packed.map(_unpack_and_call)

Having that allows you to do this:

# The helper accepts plain column names.
(
    xdf.lazy()
    .with_context(ydf.lazy())
    .with_columns(newcol=join_asof_expr('x', 'y', 'z'))
    .collect()
)

Or, since our helper function converts string inputs to columns but otherwise assumes expression inputs, you can monkey patch it onto pl.Expr (the expression the method is called on is then passed as the first argument, x) and do:

# Attach the helper as a method on pl.Expr; when called as a method, the
# expression it is called on is passed as the helper's first parameter (x).
pl.Expr.my_expr = join_asof_expr
(
    xdf.lazy()
    .with_context(ydf.lazy())
    .with_columns(newcol=pl.col('x').my_expr('y', 'z'))
    .collect()
)

Since the helper function aliases the column in the struct, it still works with other column names. Only the order matters.

# The second argument is an expression with an unrelated alias ('blah').
(
    xdf.lazy()
    .with_context(ydf.lazy())
    .with_columns(
        newcol=pl.col('x').my_expr((pl.col('y') + 2).alias('blah'), 'z')
    )
    .collect()
)