Search code examples
python, python-polars

rolling_apply with Null as return value in polars


I want to use a custom function with rolling_apply in polars. However, I get a TypeError when I run the code below.

def ts_rank(expr: pl.Expr, window: int) -> pl.Expr:  # rolling rank of each window's last value, divided by the window's non-null count
    res = expr.cast(pl.Float64).rolling_apply(
        lambda s: s.rank(method='average', descending=False)[-1]/s.is_not_null().sum(),   # the reported TypeError shows the [-1] rank can be None, so None / int fails here
        window_size = window,
        min_periods = window//2).over('a')  # .over('a') applies the rolling expression within each group of column 'a'
    return res
df = pl.DataFrame({"a": [1, 1, 1, 1, 2, 2, 2, 2],   # "a": the grouping column that ts_rank references via .over('a')
                   "b": [None, None, None, 1, 4, 2, 3, 8]})  # "b": values to rank; the first three rows are null
df.with_columns(ts_rank(pl.col('b'),4).alias('rank'))  # window=4, so min_periods = 4//2 = 2

I got this error:

thread '<unnamed>' panicked at 'python function failed: PyErr { type: <class 'TypeError'>, value: TypeError("unsupported operand type(s) for /: 'NoneType' and 'int'"), traceback: Some(<traceback object at 0x7fe3407814c0>) }', src/expr/general.rs:641:22

---------------------------------------------------------------------------
PanicException                            Traceback (most recent call last)
Cell In[37], line 9
      6     return res
      7 df = pl.DataFrame({"a": [1, 1, 1, 1, 2, 2, 2, 2], 
      8                    "b": [None, None, None, 1, 4, 2, 3, 8]})
----> 9 df.with_columns(ts_rank(pl.col('b'),4).alias('rank'))

File ~/test/lib/python3.9/site-packages/polars/dataframe/frame.py:7631, in DataFrame.with_columns(self, *exprs, **named_exprs)
   7482 def with_columns(
   7483     self,
   7484     *exprs: IntoExpr | Iterable[IntoExpr],
   7485     **named_exprs: IntoExpr,
   7486 ) -> DataFrame:
   7487     """
   7488     Add columns to this DataFrame.
   7489 
   (...)
   7628 
   7629     """
   7630     return (
-> 7631         self.lazy()
   7632         .with_columns(*exprs, **named_exprs)
   7633         .collect(no_optimization=True)
   7634     )

File ~/test/lib/python3.9/site-packages/polars/utils/deprecation.py:93, in deprecate_renamed_parameter.<locals>.decorate.<locals>.wrapper(*args, **kwargs)
     88 @wraps(function)
     89 def wrapper(*args: P.args, **kwargs: P.kwargs) -> T:
     90     _rename_keyword_argument(
     91         old_name, new_name, kwargs, function.__name__, version
     92     )
---> 93     return function(*args, **kwargs)

File ~/test/lib/python3.9/site-packages/polars/lazyframe/frame.py:1695, in LazyFrame.collect(self, type_coercion, predicate_pushdown, projection_pushdown, simplify_expression, no_optimization, slice_pushdown, comm_subplan_elim, comm_subexpr_elim, streaming)
   1683     comm_subplan_elim = False
   1685 ldf = self._ldf.optimization_toggle(
   1686     type_coercion,
   1687     predicate_pushdown,
   (...)
   1693     streaming,
   1694 )
-> 1695 return wrap_df(ldf.collect())

PanicException: python function failed: PyErr { type: <class 'TypeError'>, value: TypeError("unsupported operand type(s) for /: 'NoneType' and 'int'"), traceback: Some(<traceback object at 0x7fe3407814c0>) }

What I have done:

I changed to this function:

def ts_rank(expr: pl.Expr, window: int) -> pl.Expr:  # second attempt: guard the division against a missing rank
    def rank(s):  # passed to rolling_apply below; `s` is one window of values
        tmp = s.rank(method='average', descending=False)[-1]  # rank of the window's last element
        if not tmp:  # truthiness test: true for None (and would also be true for 0)
            return None  # returning a bare None is what raises "must be real number, not NoneType"
        return tmp/s.is_not_null().sum()  # divide by the number of non-null values in the window
    res = expr.cast(pl.Float64).rolling_apply(
        rank, 
        window_size = window,
        min_periods = window//2).over('a')  # .over('a') applies the rolling expression within each group of column 'a'
    return res
df = pl.DataFrame({"a": [1, 1, 1, 1, 2, 2, 2, 2], "b": [None, None, None, 1, 4, 2, 3, 8]})  # "a": grouping column, "b": values to rank (first three are null)
df.with_columns(ts_rank(pl.col('b'),4).alias('rank'))  # window=4, so min_periods = 4//2 = 2

And I got this error:

thread '<unnamed>' panicked at 'PyErr { type: <class 'TypeError'>, value: TypeError('must be real number, not NoneType'), traceback: None }', src/expr/general.rs:739:33
---------------------------------------------------------------------------
PanicException                            Traceback (most recent call last)
Cell In[35], line 14
     12     return res
     13 df = pl.DataFrame({"a": [1, 1, 1, 1, 2, 2, 2, 2], "b": [None, None, None, 1, 4, 2, 3, 8]})
---> 14 df.with_columns(ts_rank(pl.col('b'),4).alias('rank'))

File ~/test/lib/python3.9/site-packages/polars/dataframe/frame.py:7631, in DataFrame.with_columns(self, *exprs, **named_exprs)
   7482 def with_columns(
   7483     self,
   7484     *exprs: IntoExpr | Iterable[IntoExpr],
   7485     **named_exprs: IntoExpr,
   7486 ) -> DataFrame:
   7487     """
   7488     Add columns to this DataFrame.
   7489 
   (...)
   7628 
   7629     """
   7630     return (
-> 7631         self.lazy()
   7632         .with_columns(*exprs, **named_exprs)
   7633         .collect(no_optimization=True)
   7634     )

File ~/test/lib/python3.9/site-packages/polars/utils/deprecation.py:93, in deprecate_renamed_parameter.<locals>.decorate.<locals>.wrapper(*args, **kwargs)
     88 @wraps(function)
     89 def wrapper(*args: P.args, **kwargs: P.kwargs) -> T:
     90     _rename_keyword_argument(
     91         old_name, new_name, kwargs, function.__name__, version
     92     )
---> 93     return function(*args, **kwargs)

File ~/test/lib/python3.9/site-packages/polars/lazyframe/frame.py:1695, in LazyFrame.collect(self, type_coercion, predicate_pushdown, projection_pushdown, simplify_expression, no_optimization, slice_pushdown, comm_subplan_elim, comm_subexpr_elim, streaming)
   1683     comm_subplan_elim = False
   1685 ldf = self._ldf.optimization_toggle(
   1686     type_coercion,
   1687     predicate_pushdown,
   (...)
   1693     streaming,
   1694 )
-> 1695 return wrap_df(ldf.collect())

PanicException: PyErr { type: <class 'TypeError'>, value: TypeError('must be real number, not NoneType'), traceback: None }

The problem is the `return None` statement.

So my first and foremost question is how to deal with the NoneType return value in the custom apply function.

And is this a correct 'polars' way to do rolling_rank? (For my own purpose, I have to write it as an Expr, not using groupby_rolling.)

--------Version info---------
Polars:              0.18.15
Index type:          UInt32
Platform:            Linux-5.3.0-28-generic-x86_64-with-glibc2.27
Python:              3.9.16 (main, Mar  8 2023, 14:00:05) 
[GCC 11.2.0]

----Optional dependencies----
adbc_driver_sqlite:  <not installed>
cloudpickle:         <not installed>
connectorx:          <not installed>
deltalake:           <not installed>
fsspec:              <not installed>
matplotlib:          3.7.2
numpy:               1.24.4
pandas:              2.0.3
pyarrow:             12.0.1
pydantic:            <not installed>
sqlalchemy:          <not installed>
xlsx2csv:            <not installed>
xlsxwriter:          <not installed>

Solution

  • Returning None (or any other non-real value) directly from the function won't work in this case; however, returning a pl.Series with a dtype of pl.Float64 will.

    You can wrap the needed None in a new pl.Series.

    pl.Series(values=[None], dtype=pl.Float64)
    

    Here it is applied in your ts_rank function.

    import polars as pl
    
    
    def ts_rank(expr: pl.Expr, window: int) -> pl.Expr:
        """Build a rolling "time-series rank" expression, computed per group of column "a".

        For each rolling window, the rank of the window's last value (average
        method, ascending) is divided by the number of non-null values in that
        window.

        Args:
            expr: Expression to rank; it is cast to ``pl.Float64`` first.
            window: Rolling window size. ``min_periods`` is set to ``window // 2``.

        Returns:
            The rolling expression, evaluated within each group of column "a".
        """

        def rank(s):
            # Rank of the most recent element in the window; this is None when
            # no rank is available for it.
            last_rank = s.rank(method="average", descending=False)[-1]
            # Test for None explicitly instead of relying on truthiness, so a
            # missing rank is never conflated with a falsy numeric value.
            if last_rank is None:
                # A bare None is rejected by rolling_apply ("must be real
                # number, not NoneType"), so wrap the null in a Float64 Series.
                return pl.Series(values=[None], dtype=pl.Float64)
            return last_rank / s.is_not_null().sum()

        res = (
            expr.cast(pl.Float64)
            .rolling_apply(rank, window_size=window, min_periods=window // 2)
            .over("a")
        )
        return res