I use Pandas wide to long to stack survey data and it works beautifully with regex and stub names, is this possible to do in Polars ?
e.g. in Pandas -
import pandas as pd
df = pd.DataFrame({
'famid': [1, 1, 1, 2, 2, 2, 3, 3, 3],
'birth': [1, 2, 3, 1, 2, 3, 1, 2, 3],
'ht_one': [2.8, 2.9, 2.2, 2, 1.8, 1.9, 2.2, 2.3, 2.1],
'ht_two': [3.4, 3.8, 2.9, 3.2, 2.8, 2.4, 3.3, 3.4, 2.9]
})
changed_df = pd.wide_to_long(df,
stubnames='ht',
i=['famid', 'birth'],
j='age',
sep='_',
suffix=r'\w+')
stubnames can take a list as well.
Edit- Added code after taking inspiration from Jqurious -
import pandas as pd
import numpy as np
import polars as pl
import re
# Create age group data
age_groups = np.random.choice(['0-18', '19-35', '36-50', '51-65', '65+'], size=10)
# Create gender data
genders = np.random.choice(['Male', 'Female', 'Other'], size=10)
# Create familiarity and affinity data
fam_aff = np.random.rand(10, 4)
# Create column names
cols = ['Age_group', 'Gender', 'Familiarity_loop1', 'Familiarity_loop2', 'Affinity_loop1', 'Affinity_loop2']
# Combine data into dataframe
data = np.column_stack([age_groups, genders, fam_aff])
df = pd.DataFrame(data=data, columns=cols)
df["unique_records"] = np.arange(len(df))
regex_pattern = '^.*_loop\d'
# get polars DF
pl_df = pl.from_pandas(df)
# get all columns list
col_list = pl_df.columns
loop_list = [] # list of columns which contains _loop
sans_loop_list = [] # list of columns which do not contain _loop
for col in col_list:
if re.search(regex_pattern, col):
loop_list.append(col)
else:
sans_loop_list.append(col)
pl_long_df = (pl_df
.unpivot(
index = pl_df.select(sans_loop_list).columns,
variable_name = "master_stack")
.with_columns(pl.col("master_stack").str.replace(r"_loop\d",""))
)
pl_long_df.pivot(on="master_stack", index=sans_loop_list, values="value", aggregate_function=pl.element())
I want to see Affinity and Familiarity as their own columns, but I am not able to achieve it.
Edit 2 - Added Polars output and Pandas output
If we start with .unpivot()
df.unpivot(index = ["famid", "birth"], variable_name = "age").head(1)
shape: (1, 4)
┌───────┬───────┬────────┬───────┐
│ famid ┆ birth ┆ age ┆ value │
│ --- ┆ --- ┆ --- ┆ --- │
│ i64 ┆ i64 ┆ str ┆ f64 │
╞═══════╪═══════╪════════╪═══════╡
│ 1 ┆ 1 ┆ ht_one ┆ 2.8 │
└───────┴───────┴────────┴───────┘
The sep="_"
and suffix=r"\w+"
params used in wide_to_long
are just extracting one
from ht_one
.
One way to do this in Polars could be .str.extract()
df.unpivot(
index = ["famid", "birth"],
variable_name = "age"
).with_columns(
pl.col("age").str.extract(r"_(\w+)$")
)
shape: (18, 4)
┌───────┬───────┬─────┬───────┐
│ famid ┆ birth ┆ age ┆ value │
│ --- ┆ --- ┆ --- ┆ --- │
│ i64 ┆ i64 ┆ str ┆ f64 │
╞═══════╪═══════╪═════╪═══════╡
│ 1 ┆ 1 ┆ one ┆ 2.8 │
│ 1 ┆ 2 ┆ one ┆ 2.9 │
│ 1 ┆ 3 ┆ one ┆ 2.2 │
│ 2 ┆ 1 ┆ one ┆ 2.0 │
│ 2 ┆ 2 ┆ one ┆ 1.8 │
│ … ┆ … ┆ … ┆ … │
│ 2 ┆ 2 ┆ two ┆ 2.8 │
│ 2 ┆ 3 ┆ two ┆ 2.4 │
│ 3 ┆ 1 ┆ two ┆ 3.3 │
│ 3 ┆ 2 ┆ two ┆ 3.4 │
│ 3 ┆ 3 ┆ two ┆ 2.9 │
└───────┴───────┴─────┴───────┘
EDIT: As per the updated example:
The pattern I have been using for this is to .unpivot()
and then .pivot()
back.
Find the columns names not ending in the suffix to use as id_vars / index:
suffix = r"_loop\d+$"
id_vars = df.select(pl.exclude("^.+" + suffix)).columns
['Age_group', 'Gender', 'unique_records']
(df.unpivot(index=id_vars)
.with_columns(pl.col("variable").str.replace(suffix, ""))
)
shape: (40, 5)
┌───────────┬────────┬────────────────┬─────────────┬─────────────────────┐
│ Age_group ┆ Gender ┆ unique_records ┆ variable ┆ value │
│ --- ┆ --- ┆ --- ┆ --- ┆ --- │
│ str ┆ str ┆ i64 ┆ str ┆ str │
╞═══════════╪════════╪════════════════╪═════════════╪═════════════════════╡
│ 19-35 ┆ Female ┆ 0 ┆ Familiarity ┆ 0.9458448571805742 │
│ 65+ ┆ Other ┆ 1 ┆ Familiarity ┆ 0.29898349718902584 │
│ 36-50 ┆ Other ┆ 2 ┆ Familiarity ┆ 0.6698438749905085 │
│ 0-18 ┆ Female ┆ 3 ┆ Familiarity ┆ 0.9589949988835984 │
│ 36-50 ┆ Female ┆ 4 ┆ Familiarity ┆ 0.8738576462244922 │
│ … ┆ … ┆ … ┆ … ┆ … │
│ 0-18 ┆ Female ┆ 5 ┆ Affinity ┆ 0.13593940132707893 │
│ 36-50 ┆ Female ┆ 6 ┆ Affinity ┆ 0.37172205023277705 │
│ 19-35 ┆ Other ┆ 7 ┆ Affinity ┆ 0.5024658713377818 │
│ 51-65 ┆ Other ┆ 8 ┆ Affinity ┆ 0.00582736048275978 │
│ 36-50 ┆ Female ┆ 9 ┆ Affinity ┆ 0.34380158652767634 │
└───────────┴────────┴────────────────┴─────────────┴─────────────────────┘
We end up with 40 rows and 2 variables (Familiarity, Affinity).
In order to pivot into 20 rows, you can add a "row number" per variable and use it as part of the index.
(df.unpivot(index=id_vars)
.with_columns(pl.col("variable").str.replace(suffix, ""))
.with_columns(index = pl.int_range(pl.len()).over("variable"))
.pivot(on="variable", index=id_vars + ["index"])
)
shape: (20, 6)
┌───────────┬────────┬────────────────┬───────┬─────────────────────┬─────────────────────┐
│ Age_group ┆ Gender ┆ unique_records ┆ index ┆ Familiarity ┆ Affinity │
│ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- │
│ str ┆ str ┆ i64 ┆ i64 ┆ str ┆ str │
╞═══════════╪════════╪════════════════╪═══════╪═════════════════════╪═════════════════════╡
│ 19-35 ┆ Female ┆ 0 ┆ 0 ┆ 0.9458448571805742 ┆ 0.8318885018762573 │
│ 65+ ┆ Other ┆ 1 ┆ 1 ┆ 0.29898349718902584 ┆ 0.5932787653850062 │
│ 36-50 ┆ Other ┆ 2 ┆ 2 ┆ 0.6698438749905085 ┆ 0.3322678195709319 │
│ 0-18 ┆ Female ┆ 3 ┆ 3 ┆ 0.9589949988835984 ┆ 0.2252757821730993 │
│ 36-50 ┆ Female ┆ 4 ┆ 4 ┆ 0.8738576462244922 ┆ 0.42281089740408706 │
│ … ┆ … ┆ … ┆ … ┆ … ┆ … │
│ 0-18 ┆ Female ┆ 5 ┆ 15 ┆ 0.17803848283413837 ┆ 0.13593940132707893 │
│ 36-50 ┆ Female ┆ 6 ┆ 16 ┆ 0.5390844456218246 ┆ 0.37172205023277705 │
│ 19-35 ┆ Other ┆ 7 ┆ 17 ┆ 0.7692067698388259 ┆ 0.5024658713377818 │
│ 51-65 ┆ Other ┆ 8 ┆ 18 ┆ 0.6569518159892904 ┆ 0.00582736048275978 │
│ 36-50 ┆ Female ┆ 9 ┆ 19 ┆ 0.6946040879238368 ┆ 0.34380158652767634 │
└───────────┴────────┴────────────────┴───────┴─────────────────────┴─────────────────────┘