I want to use a custom function with rolling_apply in Polars. However, I ran into a TypeError when running the code below.
def ts_rank(expr: pl.Expr, window: int) -> pl.Expr:
    # Rolling rank of each window's last value divided by the number of
    # non-null values in that window, evaluated per group of column 'a'.
    res = expr.cast(pl.Float64).rolling_apply(
        # NOTE: the indexed rank can come back as None, and None / int is what
        # raises the TypeError reported for this snippet.
        lambda s: s.rank(method='average', descending=False)[-1]/s.is_not_null().sum(),
        window_size = window,
        # Half the window size (integer division) is passed as min_periods.
        min_periods = window//2).over('a')
    return res
# Sample data: grouping column "a" (two groups of four rows) and value column
# "b", whose first three entries are null.
df = pl.DataFrame({"a": [1, 1, 1, 1, 2, 2, 2, 2],
                   "b": [None, None, None, 1, 4, 2, 3, 8]})
# Apply the rolling rank to column 'b' with a window of 4 rows.
df.with_columns(ts_rank(pl.col('b'),4).alias('rank'))
I got this error:
thread '<unnamed>' panicked at 'python function failed: PyErr { type: <class 'TypeError'>, value: TypeError("unsupported operand type(s) for /: 'NoneType' and 'int'"), traceback: Some(<traceback object at 0x7fe3407814c0>) }', src/expr/general.rs:641:22
---------------------------------------------------------------------------
PanicException Traceback (most recent call last)
Cell In[37], line 9
6 return res
7 df = pl.DataFrame({"a": [1, 1, 1, 1, 2, 2, 2, 2],
8 "b": [None, None, None, 1, 4, 2, 3, 8]})
----> 9 df.with_columns(ts_rank(pl.col('b'),4).alias('rank'))
File ~/test/lib/python3.9/site-packages/polars/dataframe/frame.py:7631, in DataFrame.with_columns(self, *exprs, **named_exprs)
7482 def with_columns(
7483 self,
7484 *exprs: IntoExpr | Iterable[IntoExpr],
7485 **named_exprs: IntoExpr,
7486 ) -> DataFrame:
7487 """
7488 Add columns to this DataFrame.
7489
(...)
7628
7629 """
7630 return (
-> 7631 self.lazy()
7632 .with_columns(*exprs, **named_exprs)
7633 .collect(no_optimization=True)
7634 )
File ~/test/lib/python3.9/site-packages/polars/utils/deprecation.py:93, in deprecate_renamed_parameter.<locals>.decorate.<locals>.wrapper(*args, **kwargs)
88 @wraps(function)
89 def wrapper(*args: P.args, **kwargs: P.kwargs) -> T:
90 _rename_keyword_argument(
91 old_name, new_name, kwargs, function.__name__, version
92 )
---> 93 return function(*args, **kwargs)
File ~/test/lib/python3.9/site-packages/polars/lazyframe/frame.py:1695, in LazyFrame.collect(self, type_coercion, predicate_pushdown, projection_pushdown, simplify_expression, no_optimization, slice_pushdown, comm_subplan_elim, comm_subexpr_elim, streaming)
1683 comm_subplan_elim = False
1685 ldf = self._ldf.optimization_toggle(
1686 type_coercion,
1687 predicate_pushdown,
(...)
1693 streaming,
1694 )
-> 1695 return wrap_df(ldf.collect())
PanicException: python function failed: PyErr { type: <class 'TypeError'>, value: TypeError("unsupported operand type(s) for /: 'NoneType' and 'int'"), traceback: Some(<traceback object at 0x7fe3407814c0>) }
What I have done:
I changed to this function:
def ts_rank(expr: pl.Expr, window: int) -> pl.Expr:
    # Rolling rank of each window's last value divided by the number of
    # non-null values in that window, evaluated per group of column 'a'.
    def rank(s):
        # Rank of the window's last element; this can be None.
        tmp = s.rank(method='average', descending=False)[-1]
        if not tmp:
            # NOTE: returning a bare None is what rolling_apply rejects with
            # "must be real number, not NoneType".
            return None
        return tmp/s.is_not_null().sum()
    res = expr.cast(pl.Float64).rolling_apply(
        rank,
        window_size = window,
        # Half the window size (integer division) is passed as min_periods.
        min_periods = window//2).over('a')
    return res
# Sample data: grouping column "a" and value column "b" with three leading nulls.
df = pl.DataFrame({"a": [1, 1, 1, 1, 2, 2, 2, 2], "b": [None, None, None, 1, 4, 2, 3, 8]})
# Apply the rolling rank to column 'b' with a window of 4 rows.
df.with_columns(ts_rank(pl.col('b'),4).alias('rank'))
And I got this error:
thread '<unnamed>' panicked at 'PyErr { type: <class 'TypeError'>, value: TypeError('must be real number, not NoneType'), traceback: None }', src/expr/general.rs:739:33
---------------------------------------------------------------------------
PanicException Traceback (most recent call last)
Cell In[35], line 14
12 return res
13 df = pl.DataFrame({"a": [1, 1, 1, 1, 2, 2, 2, 2], "b": [None, None, None, 1, 4, 2, 3, 8]})
---> 14 df.with_columns(ts_rank(pl.col('b'),4).alias('rank'))
File ~/test/lib/python3.9/site-packages/polars/dataframe/frame.py:7631, in DataFrame.with_columns(self, *exprs, **named_exprs)
7482 def with_columns(
7483 self,
7484 *exprs: IntoExpr | Iterable[IntoExpr],
7485 **named_exprs: IntoExpr,
7486 ) -> DataFrame:
7487 """
7488 Add columns to this DataFrame.
7489
(...)
7628
7629 """
7630 return (
-> 7631 self.lazy()
7632 .with_columns(*exprs, **named_exprs)
7633 .collect(no_optimization=True)
7634 )
File ~/test/lib/python3.9/site-packages/polars/utils/deprecation.py:93, in deprecate_renamed_parameter.<locals>.decorate.<locals>.wrapper(*args, **kwargs)
88 @wraps(function)
89 def wrapper(*args: P.args, **kwargs: P.kwargs) -> T:
90 _rename_keyword_argument(
91 old_name, new_name, kwargs, function.__name__, version
92 )
---> 93 return function(*args, **kwargs)
File ~/test/lib/python3.9/site-packages/polars/lazyframe/frame.py:1695, in LazyFrame.collect(self, type_coercion, predicate_pushdown, projection_pushdown, simplify_expression, no_optimization, slice_pushdown, comm_subplan_elim, comm_subexpr_elim, streaming)
1683 comm_subplan_elim = False
1685 ldf = self._ldf.optimization_toggle(
1686 type_coercion,
1687 predicate_pushdown,
(...)
1693 streaming,
1694 )
-> 1695 return wrap_df(ldf.collect())
PanicException: PyErr { type: <class 'TypeError'>, value: TypeError('must be real number, not NoneType'), traceback: None }
The problem is the `return None` statement.
So my first and foremost question is: how do I deal with a None return value in the custom function passed to rolling_apply?
Also, is this the correct, idiomatic Polars way to compute a rolling rank? (For my own purposes, I have to write it as an Expr rather than using groupby_rolling.)
--------Version info---------
Polars: 0.18.15
Index type: UInt32
Platform: Linux-5.3.0-28-generic-x86_64-with-glibc2.27
Python: 3.9.16 (main, Mar 8 2023, 14:00:05)
[GCC 11.2.0]
----Optional dependencies----
adbc_driver_sqlite: <not installed>
cloudpickle: <not installed>
connectorx: <not installed>
deltalake: <not installed>
fsspec: <not installed>
matplotlib: 3.7.2
numpy: 1.24.4
pandas: 2.0.3
pyarrow: 12.0.1
pydantic: <not installed>
sqlalchemy: <not installed>
xlsx2csv: <not installed>
xlsxwriter: <not installed>
Directly returning a `None` or any other non-real type won't work in this case; however, returning a `pl.Series` with a dtype of `pl.Float64` will work.
You can wrap the needed `None` in a new `pl.Series`:
pl.Series(values=[None], dtype=pl.Float64)
Here it is applied in your `ts_rank` function:
import polars as pl
def ts_rank(expr: pl.Expr, window: int) -> pl.Expr:
    """Return a rolling fractional rank of ``expr`` within each group of column "a".

    For every rolling window, the average-method ascending rank of the
    window's last value is divided by the number of non-null values in the
    window. When that rank is missing (``None``), the result for the window
    is null.

    Args:
        expr: Expression to rank; it is cast to ``pl.Float64`` first.
        window: Rolling window size. ``window // 2`` is passed as
            ``min_periods``.

    Returns:
        An expression that applies the rolling rank over column ``"a"``.
    """

    def rank(s: pl.Series):
        # Rank of the most recent element in the window.
        last_rank = s.rank(method="average", descending=False)[-1]
        # Test for None explicitly rather than relying on truthiness, so the
        # null branch is taken only when the rank is actually missing.
        if last_rank is None:
            # A bare None is rejected by rolling_apply ("must be real number,
            # not NoneType"), so the null is wrapped in a one-element Float64
            # Series instead.
            return pl.Series(values=[None], dtype=pl.Float64)
        return last_rank / s.is_not_null().sum()

    return (
        expr.cast(pl.Float64)
        .rolling_apply(rank, window_size=window, min_periods=window // 2)
        .over("a")
    )