Search code examples
dataframe, python-polars

Column descriptions in Polars


I have a Polars dataframe. I would like to add some column descriptions to the file the dataframe is stored in, without changing the names of the columns. For pandas, the following solution works pretty well. I have not found anything equivalent that works with Polars.

import pandas as pd
import numpy as np

# Human-readable description for every column, keyed by column name
column_desc = {
    'Column1': 'Description for Column 1',
    'Column2': 'Description for Column 2',
    'Column3': 'Description for Column 3'
}

# Build the frame: five random values for each described column
data = {name: np.random.rand(5) for name in column_desc}
df = pd.DataFrame(data)

# Attach the descriptions as a second header level, so every column label
# becomes a (name, description) pair; names without a description get None
df.columns = pd.MultiIndex.from_tuples(
    list(zip(df.columns, map(column_desc.get, df.columns)))
)

# Show the resulting two-level column index
print(df.columns)

Solution

  • You can't do this with Polars, but you can do it with pyarrow. Here are a couple of custom functions you can use:

    import polars as pl
    import pyarrow as pa
    import pyarrow.parquet as pq
    import numpy as np
    
    def write_w_meta(df, where, meta):
        """Write a polars DataFrame to Parquet with ``meta`` as schema metadata.

        The frame is converted to an Arrow table, the table's schema metadata
        is replaced by ``meta``, and the result is written to ``where`` with
        ``pq.write_table``.
        """
        annotated = df.to_arrow().replace_schema_metadata(meta)
        pq.write_table(annotated, where)
    
    def read_w_meta(where) -> "tuple[pl.DataFrame, dict[str, str]]":
        """Read a Parquet file together with its schema-level metadata.

        Counterpart of ``write_w_meta``: the metadata stored on the Arrow
        schema is returned alongside the data.

        Args:
            where: Source to read; passed straight to ``pq.read_table``.

        Returns:
            A ``(df, meta)`` tuple. ``df`` is the table as a polars DataFrame.
            ``meta`` is the schema metadata with keys and values decoded from
            UTF-8 bytes to ``str``; it is an empty dict when the file carries
            no schema metadata.
        """
        df_tab = pq.read_table(where)
        # Schema.metadata is None (not an empty dict) when nothing was stored,
        # so fall back to an empty mapping instead of failing on .items().
        raw_meta = df_tab.schema.metadata or {}
        # pyarrow hands metadata back as bytes -> bytes; decode both sides.
        meta = {
            key.decode('utf-8'): value.decode('utf-8')
            for key, value in raw_meta.items()
        }
        return (pl.from_arrow(df_tab), meta)
    

    With those you can now do:

    # Descriptions to embed in the file, keyed by column name
    column_desc = {
        'Column1': 'Description for Column 1',
        'Column2': 'Description for Column 2',
        'Column3': 'Description for Column 3'
    }

    # Build the frame: five random values for each described column
    data = {name: np.random.rand(5) for name in column_desc}
    df = pl.DataFrame(data)

    # Store the frame and its descriptions in a single Parquet file ...
    write_w_meta(df, "some_file.parquet", column_desc)

    # ... and load both back from that file
    new_df, new_desc = read_w_meta("some_file.parquet")
    
    print(new_df)
    shape: (5, 3)
    ┌──────────┬──────────┬──────────┐
    │ Column1  ┆ Column2  ┆ Column3  │
    │ ---      ┆ ---      ┆ ---      │
    │ f64      ┆ f64      ┆ f64      │
    ╞══════════╪══════════╪══════════╡
    │ 0.782398 ┆ 0.496273 ┆ 0.531805 │
    │ 0.025615 ┆ 0.106782 ┆ 0.157827 │
    │ 0.69775  ┆ 0.75612  ┆ 0.20301  │
    │ 0.004942 ┆ 0.112314 ┆ 0.005737 │
    │ 0.253238 ┆ 0.111861 ┆ 0.056216 │
    └──────────┴──────────┴──────────┘
    
    print(new_desc)
    {'Column1': 'Description for Column 1',
     'Column2': 'Description for Column 2',
     'Column3': 'Description for Column 3'}