Search code examples
Tags: python, string, python-polars, camelcasing, snakecasing

Polars - Replace letter in string with uppercase letter


Is there any way in Polars to replace the character just after each `_` with its uppercase form using a regex replace? So far I have achieved it using `polars.Expr.map_elements`.

Is there an alternative that uses the native expression API?

import re
import polars as pl

# Sample data: dotted identifiers whose segments are written in snake_case.
snake_case_ids = [
    "accessible_bidding_strategy.id",
    "accessible_bidding_strategy.name",
    "accessible_bidding_strategy.owner_customer_id",
]
df = pl.DataFrame({"id": snake_case_ids})


def _upper_char_after_underscore(match):
    """Return the character that follows the matched underscore, upper-cased."""
    # The pattern always matches exactly two characters: "_" plus one word
    # character, so index 1 is the character to keep.
    return match.group(0)[1].upper()


def _snake_to_camel(val):
    """Replace each underscore + word character pair in *val* with the
    upper-cased character alone."""
    return re.sub(r"_\w", _upper_char_after_underscore, val)


# Run the Python-level conversion on every value of "id" and store the
# result alongside the original column as "parsed_id".
parsed_id = pl.col("id").map_elements(_snake_to_camel, return_dtype=pl.String)
df = df.with_columns(parsed_id.alias("parsed_id"))

print(df)

Output

shape: (3, 2)
┌───────────────────────────────────────────────┬───────────────────────────────────────────┐
│ id                                            ┆ parsed_id                                 │
│ ---                                           ┆ ---                                       │
│ str                                           ┆ str                                       │
╞═══════════════════════════════════════════════╪═══════════════════════════════════════════╡
│ accessible_bidding_strategy.id                ┆ accessibleBiddingStrategy.id              │
│ accessible_bidding_strategy.name              ┆ accessibleBiddingStrategy.name            │
│ accessible_bidding_strategy.owner_customer_id ┆ accessibleBiddingStrategy.ownerCustomerId │
└───────────────────────────────────────────────┴───────────────────────────────────────────┘

Solution

  • I don't think it's possible to "dynamically" modify the replacement in any of the Polars replace methods.

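    You could create all possible mappings (one per lowercase ASCII letter), use .str.replace_many() to uppercase the letter that follows each underscore, and then strip the underscores with .str.replace_all():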

    import string

    # Raise the display limits so the long strings in the printed frame are
    # not truncated.
    pl.Config(fmt_table_cell_list_len=10, fmt_str_lengths=120)

    # One literal pattern per lowercase ASCII letter ("_a" .. "_z"), paired
    # position-by-position with the same text upper-cased ("_A" .. "_Z").
    lowercase_pairs = [f"_{c}" for c in string.ascii_lowercase]
    uppercase_pairs = [f"_{c}" for c in string.ascii_uppercase]

    # Step 1: upper-case the letter after each underscore.
    # Step 2: remove the underscores themselves.
    uppercased = pl.col("id").str.replace_many(lowercase_pairs, uppercase_pairs)
    df.with_columns(
        uppercased.str.replace_all("_", "").alias("parsed_id")
    )
    
    shape: (3, 2)
    ┌───────────────────────────────────────────────┬───────────────────────────────────────────┐
    │ id                                            ┆ parsed_id                                 │
    │ ---                                           ┆ ---                                       │
    │ str                                           ┆ str                                       │
    ╞═══════════════════════════════════════════════╪═══════════════════════════════════════════╡
    │ accessible_bidding_strategy.id                ┆ accessibleBiddingStrategy.id              │
    │ accessible_bidding_strategy.name              ┆ accessibleBiddingStrategy.name            │
    │ accessible_bidding_strategy.owner_customer_id ┆ accessibleBiddingStrategy.ownerCustomerId │
    └───────────────────────────────────────────────┴───────────────────────────────────────────┘
    

    Otherwise you'd probably need some form of .str.split() and .list.eval()

    # Inside `list.eval`, `pl.element()` refers to the pieces produced by
    # splitting one string on "_".
    pieces = pl.element()
    later_pieces = pieces.slice(1)  # every piece after the first

    # Capitalise each later piece: upper-cased first character + remainder.
    capitalised = (
        later_pieces.str.slice(0, 1).str.to_uppercase()
        + later_pieces.str.slice(1)
    )

    df.with_columns(
        pl.col("id")
        .str.split("_")
        # The first piece is kept unchanged; the capitalised pieces are
        # joined into one string and appended to it.
        .list.eval(pieces.first() + capitalised.str.join())
        # The eval result is still a list column; take its first entry to
        # get back a plain string.
        .list.first()
        .alias("parsed_id")
    )