I'm a new polars user. Pandas has df.dropna. I need to replace this functionality, but I haven't found a dropna in polars. Searching for dropna currently yields no results in the Polars User Guide.
My specific problem: convert the following statement from pandas to polars
df.dropna(subset=list_of_vars, thresh=1)
I think I should use df.filter. I do not know a priori what will be in list_of_vars, so constructing a set of |-combined (OR) filters is a bit tricky. All vars in list_of_vars are columns in the dataframe.
Input:
import polars as pl

df = pl.DataFrame(
    {
        'col1': [0, float('nan'), 2, 3],
        'col2': [0, float('nan'), float('nan'), 3],
        'col3': [0, 1, 2, 3],
        'col4': [float('nan'), float('nan'), float('nan'), float('nan')],
    },
    strict=False,
)
df
list_of_vars = ['col1', 'col2']
Desired output:
Keep only rows in the dataframe where the value of at least one of the columns in list_of_vars is not NaN/null.
┌──────┬──────┬──────┬──────┐
│ col1 ┆ col2 ┆ col3 ┆ col4 │
│ --- ┆ --- ┆ --- ┆ --- │
│ f64 ┆ f64 ┆ i64 ┆ f64 │
╞══════╪══════╪══════╪══════╡
│ 0.0 ┆ 0.0 ┆ 0 ┆ NaN │
│ 2.0 ┆ NaN ┆ 2 ┆ NaN │
│ 3.0 ┆ 3.0 ┆ 3 ┆ NaN │
└──────┴──────┴──────┴──────┘
In this case, rows with a good value in col1 or col2 are retained. The row with a good value only in col3 (the second row, where col1 and col2 are both NaN) is dropped.
First, let's expand the data to include null values, so that we can show how to filter both null and NaN values if you need to. (You can always change the logic below if you don't need it.)
df = pl.DataFrame(
    {
        'col1': [0, float('nan'), 2, 3, None],
        'col2': [0, float('nan'), float('nan'), 3, None],
        'col3': [0.0, 1, 2, 3, 4],
        'col4': [float('nan'), float('nan'), float('nan'), float('nan'), 5],
    },
    strict=False,
)
df
shape: (5, 4)
┌──────┬──────┬──────┬──────┐
│ col1 ┆ col2 ┆ col3 ┆ col4 │
│ --- ┆ --- ┆ --- ┆ --- │
│ f64 ┆ f64 ┆ f64 ┆ f64 │
╞══════╪══════╪══════╪══════╡
│ 0.0 ┆ 0.0 ┆ 0.0 ┆ NaN │
│ NaN ┆ NaN ┆ 1.0 ┆ NaN │
│ 2.0 ┆ NaN ┆ 2.0 ┆ NaN │
│ 3.0 ┆ 3.0 ┆ 3.0 ┆ NaN │
│ null ┆ null ┆ 4.0 ┆ 5.0 │
└──────┴──────┴──────┴──────┘
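Note that null and NaN are distinct in Polars: null marks a missing value, while NaN is a floating-point value. If you want to confirm how many of each the expanded data contains, you can count them separately; this is just a quick check and isn't required for the filter below (all columns here are floats, so is_nan is valid everywhere):
df.null_count()                      # counts nulls only, per column
df.select(pl.all().is_nan().sum())   # counts NaNs, per column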
The goal, restated: df.dropna(subset=list_of_vars, thresh=1), i.e. keep only rows in the dataframe where the value of at least one of the columns in list_of_vars is not NaN/null.
If we're looking to drop both null values and NaN values in Polars, we can do this in a single filter step. (For comparison, I'll also include the result from Pandas.)
list_of_vars = ['col1', 'col2']
thresh = 1
df.filter(
    pl.sum_horizontal(
        pl.col(list_of_vars).is_not_null() & pl.col(list_of_vars).is_not_nan()
    ) >= thresh
)
df.to_pandas().dropna(subset=list_of_vars, thresh=thresh)
shape: (3, 4)
┌──────┬──────┬──────┬──────┐
│ col1 ┆ col2 ┆ col3 ┆ col4 │
│ --- ┆ --- ┆ --- ┆ --- │
│ f64 ┆ f64 ┆ f64 ┆ f64 │
╞══════╪══════╪══════╪══════╡
│ 0.0 ┆ 0.0 ┆ 0.0 ┆ NaN │
│ 2.0 ┆ NaN ┆ 2.0 ┆ NaN │
│ 3.0 ┆ 3.0 ┆ 3.0 ┆ NaN │
└──────┴──────┴──────┴──────┘
>>> df.to_pandas().dropna(subset=list_of_vars, thresh=thresh)
col1 col2 col3 col4
0 0.0 0.0 0.0 NaN
2 2.0 NaN 2.0 NaN
3 3.0 3.0 3.0 NaN
And for threshold of 2:
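This is the same filter as above with only the thresh value changed; I'll show it again for completeness:
thresh = 2
df.filter(
    pl.sum_horizontal(
        pl.col(list_of_vars).is_not_null() & pl.col(list_of_vars).is_not_nan()
    ) >= thresh
)
df.to_pandas().dropna(subset=list_of_vars, thresh=thresh)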
shape: (2, 4)
┌──────┬──────┬──────┬──────┐
│ col1 ┆ col2 ┆ col3 ┆ col4 │
│ --- ┆ --- ┆ --- ┆ --- │
│ f64 ┆ f64 ┆ f64 ┆ f64 │
╞══════╪══════╪══════╪══════╡
│ 0.0 ┆ 0.0 ┆ 0.0 ┆ NaN │
│ 3.0 ┆ 3.0 ┆ 3.0 ┆ NaN │
└──────┴──────┴──────┴──────┘
>>> df.to_pandas().dropna(subset=list_of_vars, thresh=thresh)
col1 col2 col3 col4
0 0.0 0.0 0.0 NaN
3 3.0 3.0 3.0 NaN
We're using polars.sum_horizontal to sum up, row-wise, the boolean values produced by the is_not_null and is_not_nan expressions (True counts as 1, False as 0).
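If you'd like to inspect those row-wise counts before filtering, you can select the summed expression on its own. A quick illustration (the good_values alias is just an arbitrary name):
df.select(
    pl.sum_horizontal(
        pl.col(list_of_vars).is_not_null() & pl.col(list_of_vars).is_not_nan()
    ).alias('good_values')
)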
dropna method
Pandas has df.dropna. I need to replace this functionality, but I haven't found a dropna in polars.
If you need to, we can create a dropna function that will work on DataFrames and LazyFrames directly. If this is more than you want/need, you can just skip this and use the code in the section above.
Here's some prototype code for a dropna for LazyFrames. It's a bit complex, but you should be able to copy/paste this into a module and call it directly on a LazyFrame, as well as tweak the logic to your needs.
from __future__ import annotations

from typing import Sequence

import polars as pl


def dropna(
    self: pl.LazyFrame,
    how: str = 'any',
    thresh: int | None = None,
    subset: str | Sequence[str] | None = None,
) -> pl.LazyFrame:
    """
    Remove rows containing null and NaN values, mimicking pandas' dropna.
    """
    if subset is None:
        subset = pl.all()
    else:
        subset = pl.col(subset)
    # A value is "good" only if it is neither null nor NaN
    good = subset.is_not_null() & subset.is_not_nan()
    if thresh is not None:
        # Keep rows with at least `thresh` good values (pandas' thresh semantics)
        result = self.filter(pl.sum_horizontal(good) >= thresh)
    elif how == 'any':
        # Drop rows containing *any* null/NaN, i.e. keep rows where every value is good
        result = self.filter(pl.all_horizontal(good))
    elif how == 'all':
        # Drop rows only if *all* values are null/NaN, i.e. keep rows where any value is good
        result = self.filter(pl.any_horizontal(good))
    else:
        raise ValueError(f"how must be 'any' or 'all', got {how!r}")
    return result


pl.LazyFrame.dropna = dropna
For example, if we have the following data:
df = pl.DataFrame(
    {
        "col1": [None, float("nan"), 2, 3],
        "col2": [None, float("nan"), float("nan"), 3],
        "col3": [0.0, float("nan"), 2, 3],
        "col4": [float("nan"), float("nan"), float("nan"), 3],
    },
    strict=False,  # allow mixed int/float/None entries, as in the earlier examples
)
df
shape: (4, 4)
┌──────┬──────┬──────┬──────┐
│ col1 ┆ col2 ┆ col3 ┆ col4 │
│ --- ┆ --- ┆ --- ┆ --- │
│ f64 ┆ f64 ┆ f64 ┆ f64 │
╞══════╪══════╪══════╪══════╡
│ null ┆ null ┆ 0.0 ┆ NaN │
│ NaN ┆ NaN ┆ NaN ┆ NaN │
│ 2.0 ┆ NaN ┆ 2.0 ┆ NaN │
│ 3.0 ┆ 3.0 ┆ 3.0 ┆ 3.0 │
└──────┴──────┴──────┴──────┘
We can use the dropna method as follows:
list_of_vars = ["col1", "col2"]
threshold = 1
(
    df
    .lazy()
    .dropna(subset=list_of_vars, thresh=threshold)
    .collect()
)
df.to_pandas().dropna(subset=list_of_vars, thresh=threshold)
shape: (2, 4)
┌──────┬──────┬──────┬──────┐
│ col1 ┆ col2 ┆ col3 ┆ col4 │
│ --- ┆ --- ┆ --- ┆ --- │
│ f64 ┆ f64 ┆ f64 ┆ f64 │
╞══════╪══════╪══════╪══════╡
│ 2.0 ┆ NaN ┆ 2.0 ┆ NaN │
│ 3.0 ┆ 3.0 ┆ 3.0 ┆ 3.0 │
└──────┴──────┴──────┴──────┘
>>> df.to_pandas().dropna(subset=list_of_vars, thresh=threshold)
col1 col2 col3 col4
2 2.0 NaN 2.0 NaN
3 3.0 3.0 3.0 3.0
And using the default how="any" parameter on the entire LazyFrame:
(
    df
    .lazy()
    .dropna()
    .collect()
)
df.to_pandas().dropna()
shape: (1, 4)
┌──────┬──────┬──────┬──────┐
│ col1 ┆ col2 ┆ col3 ┆ col4 │
│ --- ┆ --- ┆ --- ┆ --- │
│ f64 ┆ f64 ┆ f64 ┆ f64 │
╞══════╪══════╪══════╪══════╡
│ 3.0 ┆ 3.0 ┆ 3.0 ┆ 3.0 │
└──────┴──────┴──────┴──────┘
>>> df.to_pandas().dropna()
col1 col2 col3 col4
3 3.0 3.0 3.0 3.0
And here's the result for how="all" on the entire LazyFrame:
(
    df
    .lazy()
    .dropna(how='all')
    .collect()
)
df.to_pandas().dropna(how='all')
shape: (3, 4)
┌──────┬──────┬──────┬──────┐
│ col1 ┆ col2 ┆ col3 ┆ col4 │
│ --- ┆ --- ┆ --- ┆ --- │
│ f64 ┆ f64 ┆ f64 ┆ f64 │
╞══════╪══════╪══════╪══════╡
│ null ┆ null ┆ 0.0 ┆ NaN │
│ 2.0 ┆ NaN ┆ 2.0 ┆ NaN │
│ 3.0 ┆ 3.0 ┆ 3.0 ┆ 3.0 │
└──────┴──────┴──────┴──────┘
>>> df.to_pandas().dropna(how='all')
col1 col2 col3 col4
0 NaN NaN 0.0 NaN
2 2.0 NaN 2.0 NaN
3 3.0 3.0 3.0 3.0
For DataFrames, the code simply calls the LazyFrame method.
def dropna_eager(
    self: pl.DataFrame,
    how: str = 'any',
    thresh: int | None = None,
    subset: str | Sequence[str] | None = None,
) -> pl.DataFrame:
    # Defer to the LazyFrame implementation above, then collect back to a DataFrame
    return (
        self
        .lazy()
        .dropna(how, thresh, subset)
        .collect()
    )


pl.DataFrame.dropna = dropna_eager
So, for example, using non-lazy (eager) mode:
list_of_vars = ["col1", "col2"]
(
    df
    .dropna(subset=list_of_vars, how='all')
)
df.to_pandas().dropna(subset=list_of_vars, how='all')
shape: (2, 4)
┌──────┬──────┬──────┬──────┐
│ col1 ┆ col2 ┆ col3 ┆ col4 │
│ --- ┆ --- ┆ --- ┆ --- │
│ f64 ┆ f64 ┆ f64 ┆ f64 │
╞══════╪══════╪══════╪══════╡
│ 2.0 ┆ NaN ┆ 2.0 ┆ NaN │
│ 3.0 ┆ 3.0 ┆ 3.0 ┆ 3.0 │
└──────┴──────┴──────┴──────┘
>>> df.to_pandas().dropna(subset=list_of_vars, how='all')
col1 col2 col3 col4
2 2.0 NaN 2.0 NaN
3 3.0 3.0 3.0 3.0