I cannot get my head around this. Say that I have a dictionary that maps regexes to replacement strings, and if none of these regexes match (i.e. every `when` condition is False), I want to fall through to the next `when` statement.
So, instead of doing it this way, which does not check whether "new_data" already contains data...
# Sample frame: a free-text "data" column plus an ISO language code per row.
all_items = pl.DataFrame(
    {
        "data": ["Swedish fish", "English tea", "", "", ""],
        "ISO_codes": ["fin", "nor", "eng", "eng", "swe"],
    }
)

# Regex -> label rules, matched against the "data" column.
replacement_rules = {
    r"^Swe.*": "Svenska",
    r"^Eng.*": "English",
}

# ISO code -> label lookup, applied to the "ISO_codes" column.
iso_tranlation = {
    "swe": "Svenska",
    "eng": "English",
    "nor": "Norsk",
    "fin": "Finska på finska",
}

# One pass per regex rule. Every pass builds "new_data" without reading the
# column's previous contents, so an earlier rule's result is not carried over.
for pattern, replacement in replacement_rules.items():
    regex_hit = pl.col("data").str.contains(pattern)
    labelled = pl.when(regex_hit).then(pl.lit(replacement)).alias("new_data")
    all_items = all_items.lazy().with_columns(labelled).collect()

# Final pass: translate the ISO code wherever one is present. This also builds
# "new_data" without looking at what the regex passes produced.
has_iso_code = pl.col("ISO_codes").str.len_chars() > 0
iso_label = pl.col("ISO_codes").replace(
    iso_tranlation, default="Unknown ISO Code"
)
all_items = (
    all_items.lazy()
    .with_columns(pl.when(has_iso_code).then(iso_label).alias("new_data"))
    .collect()
)
...I would like to do something along the lines of this:
# Wished-for shape (pseudo-code): build one when/then expression per regex
# rule up front, then chain them ahead of the ISO-code branch in a single
# `with_columns` call.
expressions = [
pl.when(pl.col("data").str.contains(pattern))
.then(pl.lit(replacement))
for pattern, replacement in replacement_rules.items()]
all_items = (
all_items.lazy()
.with_columns(
# NOTE: `expressions` is a plain Python list, so `explode_and_pipe` is not a
# real method; it is a placeholder for the chaining operation being asked about.
expressions.explode_and_pipe()
.when(pl.col("ISO_codes").str.len_chars() > 0)
.then(
pl.col("ISO_codes")
.replace(iso_tranlation , default="Unknown ISO Code")
)
.alias("new_data")
)
.collect()
)
Is there a way to achieve what that hypothetical `expressions.explode_and_pipe()` call stands for?
EDIT: This is the resulting dataframe I'm after:
shape: (5, 3)
┌──────────────┬───────────┬──────────┐
│ data         ┆ ISO_codes ┆ new_data │
│ ---          ┆ ---       ┆ ---      │
│ str          ┆ str       ┆ str      │
╞══════════════╪═══════════╪══════════╡
│ Swedish fish ┆ fin       ┆ Svenska  │
│ English tea  ┆ nor       ┆ English  │
│              ┆ eng       ┆ English  │
│              ┆ eng       ┆ English  │
│              ┆ swe       ┆ Svenska  │
└──────────────┴───────────┴──────────┘
Maybe you are looking for something like this? (`pl.coalesce` may be the function you want.)
# Fold the regex rules into a single nested when/then/otherwise expression.
# The innermost fallback is a null literal, so a row that matches no rule
# ends up null and `pl.coalesce` below can fall back to the ISO translation.
replacements = pl.lit(None)
for pattern, replacement in replacement_rules.items():
    # Each pass wraps the expression built so far; if several patterns match
    # the same row, the rule added last is the outermost `when` and decides.
    replacements = (
        pl.when(pl.col("data").str.contains(pattern))
        .then(pl.lit(replacement))
        .otherwise(replacements)
    )

# The lookup dict is defined under the (misspelled) name `iso_tranlation` in
# the question's setup code; referencing `iso_translation` here would raise
# a NameError.
iso_translations = pl.col("ISO_codes").replace(
    iso_tranlation, default="Unknown ISO Code"
)

# Regex-based label first, ISO-code translation as the fallback.
all_items.with_columns(new_data=pl.coalesce(replacements, iso_translations))
shape: (5, 3)
┌──────────────┬───────────┬──────────┐
│ data         ┆ ISO_codes ┆ new_data │
│ ---          ┆ ---       ┆ ---      │
│ str          ┆ str       ┆ str      │
╞══════════════╪═══════════╪══════════╡
│ Swedish fish ┆ fin       ┆ Svenska  │
│ English tea  ┆ nor       ┆ English  │
│              ┆ eng       ┆ English  │
│              ┆ eng       ┆ English  │
│              ┆ swe       ┆ Svenska  │
└──────────────┴───────────┴──────────┘