I want to remove one field from a struct. Currently, I have it set up like this, but is there a simpler way to achieve this?
import polars as pl
import polars.selectors as cs
def remove_one_field(df: pl.DataFrame) -> pl.DataFrame:
    """Return ``df`` with the ``system_data`` field removed from ``meta_data``.

    The names of the fields to keep are read from the frame's schema, and the
    ``meta_data`` struct is rebuilt from just those fields. Because the struct
    is never unnested into the top-level frame, struct fields cannot collide
    with other columns, unrelated columns are never touched, and ``meta_data``
    keeps its position in the column order.

    If the struct has no ``system_data`` field, every field is kept.

    Args:
        df: Frame containing a struct column named ``meta_data``.

    Returns:
        A new frame whose ``meta_data`` column holds every original field
        except ``system_data``.
    """
    # Field names come straight from the struct dtype; no data is scanned.
    meta_data_columns = [
        field.name
        for field in df.schema['meta_data'].fields
        if field.name != 'system_data'
    ]
    # Echo the surviving field names, as the original implementation did.
    print(meta_data_columns)
    # Select the kept fields out of the struct and pack them back into a
    # struct under the same column name, replacing the column in place.
    return df.with_columns(
        meta_data=pl.struct(pl.col('meta_data').struct.field(meta_data_columns))
    )
# Example usage: two rows of struct data; the second row omits
# "system_data" entirely.
meta_rows = [
    {"system_data": "to_remove", "user_data": "keep"},
    {"user_data": "keep_"},
]
input_df = pl.DataFrame({"id": [1, 2], "meta_data": meta_rows})

# Strip the unwanted field and show the resulting frame.
output_df = remove_one_field(input_df)
print(output_df)
['user_data']
shape: (2, 2)
┌─────┬───────────┐
│ id ┆ meta_data │
│ --- ┆ --- │
│ i64 ┆ struct[1] │
╞═════╪═══════════╡
│ 1 ┆ {"keep"} │
│ 2 ┆ {"keep_"} │
└─────┴───────────┘
Is there something like `select` that works on the fields within a struct?
You can use `struct.field()`, which accepts either a list of strings or multiple string arguments.
You already know your DataFrame's `schema`, so you can easily build the list of fields you want to keep:
# Iterating the struct dtype yields (name, dtype) pairs; keep every field
# name except the one being removed.
fields = [
    name
    for name, _dtype in input_df.schema["meta_data"]
    if name != "system_data"
]

# Rebuild the struct column in place from the surviving fields only.
input_df.with_columns(
    meta_data=pl.struct(pl.col("meta_data").struct.field(fields)),
)
┌─────┬───────────┐
│ id ┆ meta_data │
│ --- ┆ --- │
│ i64 ┆ struct[1] │
╞═════╪═══════════╡
│ 1 ┆ {"keep"} │
│ 2 ┆ {"keep_"} │
└─────┴───────────┘