Consider this dummy dataset:
import numpy as np
import pandas as pd
import polars as pl
np.random.seed(25)
num_rows = 6
data = {
'item_id': np.random.choice(['A', 'B'], num_rows),
'store_id': np.random.choice([1, 2], num_rows),
'sold_quantity': np.random.randint(0, 5, num_rows),
'total_sku_count': np.random.choice([0, 1], num_rows),
'netsales': np.random.choice([50,100], num_rows)
}
df = pd.DataFrame(data)
Main dataframe:
# pl.from_pandas(df.reset_index())
shape: (6, 6)
┌───────┬─────────┬──────────┬───────────────┬─────────────────┬──────────┐
│ index ┆ item_id ┆ store_id ┆ sold_quantity ┆ total_sku_count ┆ netsales │
│ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- │
│ i64 ┆ str ┆ i64 ┆ i64 ┆ i64 ┆ i64 │
╞═══════╪═════════╪══════════╪═══════════════╪═════════════════╪══════════╡
│ 0 ┆ A ┆ 2 ┆ 4 ┆ 0 ┆ 100 │
│ 1 ┆ A ┆ 2 ┆ 4 ┆ 1 ┆ 50 │
│ 2 ┆ A ┆ 1 ┆ 1 ┆ 1 ┆ 100 │
│ 3 ┆ B ┆ 1 ┆ 4 ┆ 0 ┆ 50 │
│ 4 ┆ B ┆ 2 ┆ 1 ┆ 1 ┆ 100 │
│ 5 ┆ A ┆ 1 ┆ 3 ┆ 1 ┆ 100 │
└───────┴─────────┴──────────┴───────────────┴─────────────────┴──────────┘
Now I'm creating a filtered df with only the rows where "item_id" == "A", using .loc. Then I'll change the 'netsales' value to 120 for all of those rows:
sub_df = df.loc[df["item_id"] == "A"].copy()  # .copy() makes an independent frame and avoids SettingWithCopyWarning
sub_df['netsales'] = 120
This is the filtered df with the changed values:
# pl.from_pandas(sub_df.reset_index())
shape: (4, 6)
┌───────┬─────────┬──────────┬───────────────┬─────────────────┬──────────┐
│ index ┆ item_id ┆ store_id ┆ sold_quantity ┆ total_sku_count ┆ netsales │
│ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- │
│ i64 ┆ str ┆ i64 ┆ i64 ┆ i64 ┆ i64 │
╞═══════╪═════════╪══════════╪═══════════════╪═════════════════╪══════════╡
│ 0 ┆ A ┆ 2 ┆ 4 ┆ 0 ┆ 120 │
│ 1 ┆ A ┆ 2 ┆ 4 ┆ 1 ┆ 120 │
│ 2 ┆ A ┆ 1 ┆ 1 ┆ 1 ┆ 120 │
│ 5 ┆ A ┆ 1 ┆ 3 ┆ 1 ┆ 120 │
└───────┴─────────┴──────────┴───────────────┴─────────────────┴──────────┘
Now I can replace the matching rows of the main df with the filtered df, at the same locations, using the line below.
df.loc[df["item_id"]=="A"] = sub_df
This is the final df:
# pl.from_pandas(df)
shape: (6, 5)
┌─────────┬──────────┬───────────────┬─────────────────┬──────────┐
│ item_id ┆ store_id ┆ sold_quantity ┆ total_sku_count ┆ netsales │
│ --- ┆ --- ┆ --- ┆ --- ┆ --- │
│ str ┆ i64 ┆ i64 ┆ i64 ┆ i64 │
╞═════════╪══════════╪═══════════════╪═════════════════╪══════════╡
│ A ┆ 2 ┆ 4 ┆ 0 ┆ 120 │
│ A ┆ 2 ┆ 4 ┆ 1 ┆ 120 │
│ A ┆ 1 ┆ 1 ┆ 1 ┆ 120 │
│ B ┆ 1 ┆ 4 ┆ 0 ┆ 50 │
│ B ┆ 2 ┆ 1 ┆ 1 ┆ 100 │
│ A ┆ 1 ┆ 3 ┆ 1 ┆ 120 │
└─────────┴──────────┴───────────────┴─────────────────┴──────────┘
I actually want to perform the same operation in Polars. I've attempted to implement this using Polars' 'filter' method, but I was unsuccessful due to my inexperience with Polars. Please help me if there is a way to achieve this. Your support is much appreciated.
There are a couple of ways to accomplish this. If I understand correctly, there are really two questions here: how to conditionally update a single column, and how to replace a set of rows with a modified sub-frame. Starting with the first:
import numpy as np
import polars as pl
np.random.seed(25)
num_rows = 6
data = {
'item_id': np.random.choice(['A', 'B'], num_rows),
'store_id': np.random.choice([1, 2], num_rows),
'sold_quantity': np.random.randint(0, 5, num_rows),
'total_sku_count': np.random.choice([0, 1], num_rows),
'netsales': np.random.choice([50,100], num_rows)
}
pl_df = pl.DataFrame(data).lazy() # We'll use a LazyFrame
pl_df = pl_df.with_columns(
netsales=pl.when(pl.col('item_id') == 'A').then(120).otherwise(pl.col('netsales'))
)
print(pl_df.collect())
# shape: (6, 5)
# ┌─────────┬──────────┬───────────────┬─────────────────┬──────────┐
# │ item_id ┆ store_id ┆ sold_quantity ┆ total_sku_count ┆ netsales │
# │ --- ┆ --- ┆ --- ┆ --- ┆ --- │
# │ str ┆ i64 ┆ i64 ┆ i64 ┆ i64 │
# ╞═════════╪══════════╪═══════════════╪═════════════════╪══════════╡
# │ A ┆ 2 ┆ 4 ┆ 0 ┆ 120 │
# │ A ┆ 2 ┆ 4 ┆ 1 ┆ 120 │
# │ A ┆ 1 ┆ 1 ┆ 1 ┆ 120 │
# │ B ┆ 1 ┆ 4 ┆ 0 ┆ 50 │
# │ B ┆ 2 ┆ 1 ┆ 1 ┆ 100 │
# │ A ┆ 1 ┆ 3 ┆ 1 ┆ 120 │
# └─────────┴──────────┴───────────────┴─────────────────┴──────────┘
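A quick follow-up sketch (not from the original snippet; it just assumes the same data dict defined above): the same when/then/otherwise expression works on an eager DataFrame without .lazy()/.collect(), and the condition can be any boolean expression, e.g. a compound mask much like the one you would pass to .loc.
import polars as pl
# Minimal sketch, assuming the `data` dict defined above is still in scope.
eager_df = pl.DataFrame(data)                                  # eager DataFrame; no .lazy()/.collect() needed
mask = (pl.col('item_id') == 'A') & (pl.col('store_id') == 1)  # compound condition, similar to a pandas .loc mask
eager_df = eager_df.with_columns(
    netsales=pl.when(mask).then(120).otherwise(pl.col('netsales'))
)
print(eager_df)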
That completes the single-column update. However, you might also be interested in changing many columns at once in accordance with a sub-frame.
You can partition your DataFrame based on a computed mask, where the in-group is df.filter(mask) and the remaining group is df.filter(~mask). This way you can make changes to the in-group and then concatenate the two parts back together. We'll also need to track the row numbers to restore the original ordering of the rows.
import numpy as np
import polars as pl
np.random.seed(25)
num_rows = 6
data = {
'item_id': np.random.choice(['A', 'B'], num_rows),
'store_id': np.random.choice([1, 2], num_rows),
'sold_quantity': np.random.randint(0, 5, num_rows),
'total_sku_count': np.random.choice([0, 1], num_rows),
'netsales': np.random.choice([50,100], num_rows)
}
pl_df = pl.DataFrame(data).lazy().with_row_index()  # the row index lets us restore the original row order later
mask = pl.col('item_id') == 'A'
sub_df = (
    pl_df.filter(mask)
    .with_columns(netsales=pl.lit(120))
    # pl.lit(120) is an Int32 literal, so re-cast to the original schema
    # to keep the dtypes aligned for the concat below.
    .cast(pl_df.collect_schema())
)
remaining_df = pl_df.filter(~mask)
# Reconstitute original DataFrame with new chunk
pl_df = pl.concat([remaining_df, sub_df]).sort(pl.col('index'))
print(
pl_df.collect(),
)
# shape: (6, 6)
# ┌───────┬─────────┬──────────┬───────────────┬─────────────────┬──────────┐
# │ index ┆ item_id ┆ store_id ┆ sold_quantity ┆ total_sku_count ┆ netsales │
# │ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- │
# │ u32 ┆ str ┆ i64 ┆ i64 ┆ i64 ┆ i64 │
# ╞═══════╪═════════╪══════════╪═══════════════╪═════════════════╪══════════╡
# │ 0 ┆ A ┆ 2 ┆ 4 ┆ 0 ┆ 120 │
# │ 1 ┆ A ┆ 2 ┆ 4 ┆ 1 ┆ 120 │
# │ 2 ┆ A ┆ 1 ┆ 1 ┆ 1 ┆ 120 │
# │ 3 ┆ B ┆ 1 ┆ 4 ┆ 0 ┆ 50 │
# │ 4 ┆ B ┆ 2 ┆ 1 ┆ 1 ┆ 100 │
# │ 5 ┆ A ┆ 1 ┆ 3 ┆ 1 ┆ 120 │
# └───────┴─────────┴──────────┴───────────────┴─────────────────┴──────────┘
Polars also has a DataFrame.update method, which is used to mimic an in-place update. It takes a similar approach to the one above, except that it uses a .join(…) followed by a .coalesce(…) under the hood instead of a concat.
import numpy as np
import polars as pl
np.random.seed(25)
num_rows = 6
data = {
'item_id': np.random.choice(['A', 'B'], num_rows),
'store_id': np.random.choice([1, 2], num_rows),
'sold_quantity': np.random.randint(0, 5, num_rows),
'total_sku_count': np.random.choice([0, 1], num_rows),
'netsales': np.random.choice([50,100], num_rows)
}
pl_df = pl.DataFrame(data).lazy().with_row_index()
sub_df = (
pl_df.filter(pl.col('item_id') == 'A')
.with_columns(netsales=pl.lit(120))
)
print(
pl_df.update(sub_df, on='index').collect()
)
# shape: (6, 6)
# ┌───────┬─────────┬──────────┬───────────────┬─────────────────┬──────────┐
# │ index ┆ item_id ┆ store_id ┆ sold_quantity ┆ total_sku_count ┆ netsales │
# │ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- │
# │ u32 ┆ str ┆ i64 ┆ i64 ┆ i64 ┆ i64 │
# ╞═══════╪═════════╪══════════╪═══════════════╪═════════════════╪══════════╡
# │ 0 ┆ A ┆ 2 ┆ 4 ┆ 0 ┆ 120 │
# │ 1 ┆ A ┆ 2 ┆ 4 ┆ 1 ┆ 120 │
# │ 2 ┆ A ┆ 1 ┆ 1 ┆ 1 ┆ 120 │
# │ 3 ┆ B ┆ 1 ┆ 4 ┆ 0 ┆ 50 │
# │ 4 ┆ B ┆ 2 ┆ 1 ┆ 1 ┆ 100 │
# │ 5 ┆ A ┆ 1 ┆ 3 ┆ 1 ┆ 120 │
# └───────┴─────────┴──────────┴───────────────┴─────────────────┴──────────┘