Search code examples
dataframepython-polars

Customize column names and add new column based on value during pivot operation in Polars?


Say, I have a polars dataframe as below:

df = pl.from_repr("""
┌──────┬────────┬──────────────┬────────────────┬─────────────┬─────┐
│ date ┆ ticker ┆ positionType ┆ predictionType ┆ holdingBars ┆ res │
│ ---  ┆ ---    ┆ ---          ┆ ---            ┆ ---         ┆ --- │
│ str  ┆ str    ┆ str          ┆ str            ┆ i64         ┆ i64 │
╞══════╪════════╪══════════════╪════════════════╪═════════════╪═════╡
│ d1   ┆ t1     ┆ Long         ┆ p1             ┆ 1           ┆ 1   │
│ d1   ┆ t1     ┆ Long         ┆ s1             ┆ 1           ┆ 2   │
│ d1   ┆ t1     ┆ Short        ┆ p1             ┆ 1           ┆ 3   │
│ d1   ┆ t1     ┆ Short        ┆ s1             ┆ 1           ┆ 4   │
│ d1   ┆ t1     ┆ Long         ┆ p1             ┆ 2           ┆ 5   │
│ d1   ┆ t1     ┆ Short        ┆ p1             ┆ 2           ┆ 6   │
└──────┴────────┴──────────────┴────────────────┴─────────────┴─────┘
""")

I have tried pivoting it as below:

df.pivot(
    on=["positionType", "holdingBars"],
    index=["date", "ticker", "predictionType"],      
    values="res"
)

Output:

shape: (2, 7)
┌──────┬────────┬────────────────┬────────────┬─────────────┬────────────┬─────────────┐
│ date ┆ ticker ┆ predictionType ┆ {"Long",1} ┆ {"Short",1} ┆ {"Long",2} ┆ {"Short",2} │
│ ---  ┆ ---    ┆ ---            ┆ ---        ┆ ---         ┆ ---        ┆ ---         │
│ str  ┆ str    ┆ str            ┆ i64        ┆ i64         ┆ i64        ┆ i64         │
╞══════╪════════╪════════════════╪════════════╪═════════════╪════════════╪═════════════╡
│ d1   ┆ t1     ┆ p1             ┆ 1          ┆ 3           ┆ 5          ┆ 6           │
│ d1   ┆ t1     ┆ s1             ┆ 2          ┆ 4           ┆ null       ┆ null        │
└──────┴────────┴────────────────┴────────────┴─────────────┴────────────┴─────────────┘
  1. I would like the column names to be Long_1, Short_1, etc.
  2. I would also like to add additional columns: Long_1/Short_1 and Long_2/Short_2
shape: (2, 9)
┌──────┬────────┬────────────────┬────────┬─────────┬────────┬─────────┬──────────────────┬──────────────────┐
│ date ┆ ticker ┆ predictionType ┆ Long_1 ┆ Short_1 ┆ Long_2 ┆ Short_2 ┆ Long_1 / Short_1 ┆ Long_2 / Short_2 │
│ ---  ┆ ---    ┆ ---            ┆ ---    ┆ ---     ┆ ---    ┆ ---     ┆ ---              ┆ ---              │
│ str  ┆ str    ┆ str            ┆ i64    ┆ i64     ┆ str    ┆ str     ┆ f64              ┆ f64              │
╞══════╪════════╪════════════════╪════════╪═════════╪════════╪═════════╪══════════════════╪══════════════════╡
│ d1   ┆ t1     ┆ p1             ┆ 1      ┆ 3       ┆ 5      ┆ 6       ┆ 0.3              ┆ 0.83             │
│ d1   ┆ t1     ┆ s1             ┆ 2      ┆ 4       ┆ null   ┆ null    ┆ 0.5              ┆ null             │
└──────┴────────┴────────────────┴────────┴─────────┴────────┴─────────┴──────────────────┴──────────────────┘

Please note that this is a subset of data, it can have multiple tickers across multiple dates and res can be across multiple holding periods.

Any idea how an achieve these, please?


Solution

  • You can create the column names before .pivot() e.g. using pl.format()

    (df.with_columns(pl.format("{}_{}", "positionType", "holdingBars").alias("on"))
       .pivot(
          on = "on",
          index = ["date", "ticker", "predictionType"],
          values = "res"
       )
    )
    
    shape: (2, 7)
    ┌──────┬────────┬────────────────┬────────┬─────────┬────────┬─────────┐
    │ date ┆ ticker ┆ predictionType ┆ Long_1 ┆ Short_1 ┆ Long_2 ┆ Short_2 │
    │ ---  ┆ ---    ┆ ---            ┆ ---    ┆ ---     ┆ ---    ┆ ---     │
    │ str  ┆ str    ┆ str            ┆ i64    ┆ i64     ┆ i64    ┆ i64     │
    ╞══════╪════════╪════════════════╪════════╪═════════╪════════╪═════════╡
    │ d1   ┆ t1     ┆ p1             ┆ 1      ┆ 3       ┆ 5      ┆ 6       │
    │ d1   ┆ t1     ┆ s1             ┆ 2      ┆ 4       ┆ null   ┆ null    │
    └──────┴────────┴────────────────┴────────┴─────────┴────────┴─────────┘
    

    Long_1/Short_1

    As with the names, you could calculate the values before .pivot()

    As the Long/Short items are guaranteed pairs, and in order - you could assign an "id" to each pair, e.g. using .cum_sum()

    df.with_columns(
       (pl.col("positionType") == "Long")
          .cum_sum() 
          .over("ticker", "predictionType")
          .alias("predictionId")
    )
    
    shape: (6, 7)
    ┌──────┬────────┬──────────────┬────────────────┬─────────────┬─────┬──────────────┐
    │ date ┆ ticker ┆ positionType ┆ predictionType ┆ holdingBars ┆ res ┆ predictionId │
    │ ---  ┆ ---    ┆ ---          ┆ ---            ┆ ---         ┆ --- ┆ ---          │
    │ str  ┆ str    ┆ str          ┆ str            ┆ i64         ┆ i64 ┆ u32          │
    ╞══════╪════════╪══════════════╪════════════════╪═════════════╪═════╪══════════════╡
    │ d1   ┆ t1     ┆ Long         ┆ p1             ┆ 1           ┆ 1   ┆ 1            │
    │ d1   ┆ t1     ┆ Long         ┆ s1             ┆ 1           ┆ 2   ┆ 1            │
    │ d1   ┆ t1     ┆ Short        ┆ p1             ┆ 1           ┆ 3   ┆ 1            │
    │ d1   ┆ t1     ┆ Short        ┆ s1             ┆ 1           ┆ 4   ┆ 1            │
    │ d1   ┆ t1     ┆ Long         ┆ p1             ┆ 2           ┆ 5   ┆ 2            │
    │ d1   ┆ t1     ┆ Short        ┆ p1             ┆ 2           ┆ 6   ┆ 2            │
    └──────┴────────┴──────────────┴────────────────┴─────────────┴─────┴──────────────┘
    

    And use this with .group_by() to .agg() the computations.

    (df
      .with_columns(
         (pl.col("positionType") == "Long").cum_sum().over("ticker", "predictionType")
            .alias("predictionId")
      )
      .group_by("date", "ticker", "predictionType", "predictionId", maintain_order=True)
      .agg(
          pl.col("res").first().alias("Long"),
          pl.col("res").last().alias("Short"),
          (pl.col("res").first() / pl.col("res").last()).alias("Long/Short")
      )
    )
    
    shape: (3, 7)
    ┌──────┬────────┬────────────────┬────────┬──────┬───────┬────────────┐
    │ date ┆ ticker ┆ predictionType ┆ predId ┆ Long ┆ Short ┆ Long/Short │
    │ ---  ┆ ---    ┆ ---            ┆ ---    ┆ ---  ┆ ---   ┆ ---        │
    │ str  ┆ str    ┆ str            ┆ u32    ┆ i64  ┆ i64   ┆ f64        │
    ╞══════╪════════╪════════════════╪════════╪══════╪═══════╪════════════╡
    │ d1   ┆ t1     ┆ p1             ┆ 1      ┆ 1    ┆ 3     ┆ 0.333333   │
    │ d1   ┆ t1     ┆ s1             ┆ 1      ┆ 2    ┆ 4     ┆ 0.5        │
    │ d1   ┆ t1     ┆ p1             ┆ 2      ┆ 5    ┆ 6     ┆ 0.833333   │
    └──────┴────────┴────────────────┴────────┴──────┴───────┴────────────┘
    

    If we .unpivot() this result, we end up with the same format/structure as the first example.

    Which means you can add the same .format() + .pivot() steps.

    (df
      .with_columns(
         (pl.col("positionType") == "Long").cum_sum().over("ticker", "predictionType")
            .alias("predictionId")
      )
      .group_by("date", "ticker", "predictionType", "predictionId", maintain_order=True)
      .agg(
          pl.col("res").first().alias("Long"),
          pl.col("res").last().alias("Short"),
          (pl.col("res").first() / pl.col("res").last()).alias("Long/Short")
      )
      .unpivot(index=["date", "ticker", "predictionType", "predictionId"])
      .with_columns(pl.format("{}_{}", "variable", "predictionId"))
      .pivot(
         on = "variable",
         index = ["date", "ticker", "predictionType"],
         values = "value"
      )
    )
    
    shape: (2, 9)
    ┌──────┬────────┬────────────────┬────────┬────────┬─────────┬─────────┬──────────────┬──────────────┐
    │ date ┆ ticker ┆ predictionType ┆ Long_1 ┆ Long_2 ┆ Short_1 ┆ Short_2 ┆ Long/Short_1 ┆ Long/Short_2 │
    │ ---  ┆ ---    ┆ ---            ┆ ---    ┆ ---    ┆ ---     ┆ ---     ┆ ---          ┆ ---          │
    │ str  ┆ str    ┆ str            ┆ f64    ┆ f64    ┆ f64     ┆ f64     ┆ f64          ┆ f64          │
    ╞══════╪════════╪════════════════╪════════╪════════╪═════════╪═════════╪══════════════╪══════════════╡
    │ d1   ┆ t1     ┆ p1             ┆ 1.0    ┆ 5.0    ┆ 3.0     ┆ 6.0     ┆ 0.333333     ┆ 0.833333     │
    │ d1   ┆ t1     ┆ s1             ┆ 2.0    ┆ null   ┆ 4.0     ┆ null    ┆ 0.5          ┆ null         │
    └──────┴────────┴────────────────┴────────┴────────┴─────────┴─────────┴──────────────┴──────────────┘