Tags: dataframe, python-polars

Polars dataframe: create new columns while pivoting a dataframe based on values in a column


Say, I have a polars dataframe as below:

import polars as pl

df = pl.DataFrame(
    {
        "date": ["d1", "d1", "d1", "d1", "d1", "d1"],
        "ticker": ["t1", "t1", "t1", "t1", "t1", "t1"],
        "positionType": ["Long", "Long", "Short", "Short", "Long", "Short"],
        "predictionType": ["p1", "s1", "p1", "s1", "p1", "p1"],
        "holdingBars": [1, 1, 1, 1, 2, 2],
        "res": [1, 2, 3, 4, 5, 6],
    }
)
   date  ticker  positionType  predictionType  holdingBars  res
0  d1    t1      Long          p1              1            1
1  d1    t1      Long          s1              1            2
2  d1    t1      Short         p1              1            3
3  d1    t1      Short         s1              1            4
4  d1    t1      Long          p1              2            5
5  d1    t1      Short         p1              2            6

I have tried pivoting it as below:

df.pivot(
    values="res",
    index=["date", "ticker", "predictionType"],
    columns=["positionType", "holdingBars"],
    aggregate_function="first",
)

Output:

date  ticker  predictionType  Long  Short  1  2
d1    t1      p1              1     3      1  5
d1    t1      s1              2     4      2  null
  1. I would, however, like to see it as below: basically, a concatenation of the positionType and holdingBars columns, with the corresponding res value:
date  ticker  predictionType  Long_1  Short_1  Long_2  Short_2
d1    t1      p1              1       3        5       6
d1    t1      s1              2       4        null    null
  2. I would also like to add additional ratio columns, Long_1/Short_1 and Long_2/Short_2:
date  ticker  predictionType  Long_1  Short_1  Long_2  Short_2  Long_1/Short_1  Long_2/Short_2
d1    t1      p1              1       3        5       6        0.3             0.83
d1    t1      s1              2       4        null    null     0.5             NaN

Please note that this is a subset of the data; it can have multiple tickers across multiple dates, and res can span multiple holding periods.

Any idea how to achieve these, please?

ta!


Solution

  • You can create the column name before pivoting:

    (
        df.with_columns(
            columns=pl.col('positionType') + '_' + pl.col('holdingBars').cast(str)
        ).pivot(
            values='res',
            index=['date', 'ticker', 'predictionType'],
            columns='columns',
            aggregate_function='first',
        )
    )
    
    shape: (2, 7)
    ┌──────┬────────┬────────────────┬────────┬─────────┬────────┬─────────┐
    │ date ┆ ticker ┆ predictionType ┆ Long_1 ┆ Short_1 ┆ Long_2 ┆ Short_2 │
    │ ---  ┆ ---    ┆ ---            ┆ ---    ┆ ---     ┆ ---    ┆ ---     │
    │ str  ┆ str    ┆ str            ┆ i64    ┆ i64     ┆ i64    ┆ i64     │
    ╞══════╪════════╪════════════════╪════════╪═════════╪════════╪═════════╡
    │ d1   ┆ t1     ┆ p1             ┆ 1      ┆ 3       ┆ 5      ┆ 6       │
    │ d1   ┆ t1     ┆ s1             ┆ 2      ┆ 4       ┆ null   ┆ null    │
    └──────┴────────┴────────────────┴────────┴─────────┴────────┴─────────┘
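
    If this first pivot already gives the shape you want, one way (a sketch of my own, not part of the original answer) to get the requested ratio columns directly is to divide each Long_n column by its Short_n counterpart in a with_columns, deriving the holding-period values from the data:

    # assumes `pivoted` is bound to the result of the pivot shown above
    holding_bars = df['holdingBars'].unique().sort()  # e.g. [1, 2]

    pivoted = pivoted.with_columns(
        # one ratio column per holding period, e.g. Long_1/Short_1
        (pl.col(f'Long_{n}') / pl.col(f'Short_{n}')).alias(f'Long_{n}/Short_{n}')
        for n in holding_bars
    )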
    

    Long_1/Short_1

    As your Long/Short items come in ordered pairs, you can groupby and use first/last to isolate them:

    (
       df.with_columns(
          (pl.col('positionType') == 'Long').cumsum().over('ticker', 'predictionType')
             .alias('predId')
       )
       .groupby('date', 'ticker', 'predictionType', 'predId', maintain_order=True)
       .agg(
          pl.first('res').alias('Long'),
          pl.last('res').alias('Short'),
          (pl.first('res') / pl.last('res')).alias('Long/Short'),
       )
    )
    
    shape: (3, 7)
    ┌──────┬────────┬────────────────┬────────┬──────┬───────┬────────────┐
    │ date ┆ ticker ┆ predictionType ┆ predId ┆ Long ┆ Short ┆ Long/Short │
    │ ---  ┆ ---    ┆ ---            ┆ ---    ┆ ---  ┆ ---   ┆ ---        │
    │ str  ┆ str    ┆ str            ┆ u32    ┆ i64  ┆ i64   ┆ f64        │
    ╞══════╪════════╪════════════════╪════════╪══════╪═══════╪════════════╡
    │ d1   ┆ t1     ┆ p1             ┆ 1      ┆ 1    ┆ 3     ┆ 0.333333   │
    │ d1   ┆ t1     ┆ s1             ┆ 1      ┆ 2    ┆ 4     ┆ 0.5        │
    │ d1   ┆ t1     ┆ p1             ┆ 2      ┆ 5    ┆ 6     ┆ 0.833333   │
    └──────┴────────┴────────────────┴────────┴──────┴───────┴────────────┘
    

    You can then .melt(), keeping ['date', 'ticker', 'predictionType', 'predId'] as the id_vars, and pivot:

    (
       df.with_columns(
          (pl.col('positionType') == 'Long').cumsum().over('ticker', 'predictionType')
             .alias('predId')
       )
       .groupby('date', 'ticker', 'predictionType', 'predId', maintain_order=True)
       .agg(
          pl.first('res').alias('Long'),
          pl.last('res').alias('Short'),
          (pl.first('res') / pl.last('res')).alias('Long/Short')
       )
       .melt(
          id_vars = ['date', 'ticker', 'predictionType', 'predId'],
          variable_name = 'column'
       )
       .with_columns(
          pl.col('column') + '_' + pl.col('predId').cast(str)
       ).pivot(
          values = 'value',
          index = ['date', 'ticker', 'predictionType'],
          columns = 'column',
          aggregate_function = None
       )
    )
    
    shape: (2, 9)
    ┌──────┬────────┬────────────────┬────────┬────────┬─────────┬─────────┬──────────────┬──────────────┐
    │ date ┆ ticker ┆ predictionType ┆ Long_1 ┆ Long_2 ┆ Short_1 ┆ Short_2 ┆ Long/Short_1 ┆ Long/Short_2 │
    │ ---  ┆ ---    ┆ ---            ┆ ---    ┆ ---    ┆ ---     ┆ ---     ┆ ---          ┆ ---          │
    │ str  ┆ str    ┆ str            ┆ f64    ┆ f64    ┆ f64     ┆ f64     ┆ f64          ┆ f64          │
    ╞══════╪════════╪════════════════╪════════╪════════╪═════════╪═════════╪══════════════╪══════════════╡
    │ d1   ┆ t1     ┆ p1             ┆ 1.0    ┆ 5.0    ┆ 3.0     ┆ 6.0     ┆ 0.333333     ┆ 0.833333     │
    │ d1   ┆ t1     ┆ s1             ┆ 2.0    ┆ null   ┆ 4.0     ┆ null    ┆ 0.5          ┆ null         │
    └──────┴────────┴────────────────┴────────┴────────┴─────────┴─────────┴──────────────┴──────────────┘
    

    The use of .melt causes upcasting to floats, which may or may not be an issue.
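
    If the integer dtypes matter, one possible workaround (an assumption on my part, not in the original answer) is to cast the Long_n/Short_n value columns back to integers afterwards, leaving the ratio columns as floats:

    # assumes `out` holds the melt-and-pivot result shown above
    out = out.with_columns(
        # regex pl.col selector matches Long_1, Short_2, ... but not Long/Short_1
        pl.col('^(Long|Short)_[0-9]+$').cast(pl.Int64)
    )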

    It's also possible to "pivot" in the following manner:

    pivots = 'Long', 'Short', 'Long/Short'
    
    (
       df.with_columns(
          (pl.col('positionType') == 'Long').cumsum().over('ticker', 'predictionType')
             .alias('predId')
       )
       .groupby('date', 'ticker', 'predictionType', 'predId', maintain_order=True)
       .agg(
          pl.first('res').alias('Long'),
          pl.last('res').alias('Short'),
          (pl.first('res') / pl.last('res')).alias('Long/Short')
       )
       .groupby('date', 'ticker', 'predictionType')
       .agg(pivots)
       .with_columns(
          pl.col(col).list.to_struct(
             n_field_strategy = 'max_width',
             fields = lambda idx, col=col: f'{col}_{idx + 1}'
          )
          for col in pivots
       )
       .unnest(pivots)
    )
    
    shape: (2, 9)
    ┌──────┬────────┬────────────────┬────────┬────────┬─────────┬─────────┬──────────────┬──────────────┐
    │ date ┆ ticker ┆ predictionType ┆ Long_1 ┆ Long_2 ┆ Short_1 ┆ Short_2 ┆ Long/Short_1 ┆ Long/Short_2 │
    │ ---  ┆ ---    ┆ ---            ┆ ---    ┆ ---    ┆ ---     ┆ ---     ┆ ---          ┆ ---          │
    │ str  ┆ str    ┆ str            ┆ i64    ┆ i64    ┆ i64     ┆ i64     ┆ f64          ┆ f64          │
    ╞══════╪════════╪════════════════╪════════╪════════╪═════════╪═════════╪══════════════╪══════════════╡
    │ d1   ┆ t1     ┆ s1             ┆ 2      ┆ null   ┆ 4       ┆ null    ┆ 0.5          ┆ null         │
    │ d1   ┆ t1     ┆ p1             ┆ 1      ┆ 5      ┆ 3       ┆ 6       ┆ 0.333333     ┆ 0.833333     │
    └──────┴────────┴────────────────┴────────┴────────┴─────────┴─────────┴──────────────┴──────────────┘
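
    One caveat: the final .groupby does not pass maintain_order=True, which is why s1 appears before p1 above and why the row order is not guaranteed. Adding maintain_order=True to that call, or sorting afterwards, gives a stable order; a minimal sketch, assuming the frame above is bound to result:

    # assumes `result` is the unnested frame from the snippet above
    result = result.sort('date', 'ticker', 'predictionType')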