I have a product DataFrame of 1,838,379 rows with the columns `description`, `image_url`, `ean`, and `product_name`. The dataset contains duplicate product names, and I am trying to fill the NaN values in `description`, `image_url`, and `ean` with the values from other rows that share the same product name, so I implemented this function:
def fill_descriptions_images_ean_from_duplicates(row, train):
    """Fill missing description/image_url/ean values from same-named products.

    For every row of ``train`` whose ``product_name`` equals
    ``row["product_name"]``, each NaN in ``description``, ``image_url`` and
    ``ean`` is replaced by the first non-null value of that column found
    among those rows. A column with no non-null value in the group is left
    untouched.

    Args:
        row: Mapping-like object (e.g. a DataFrame row) with a
            ``"product_name"`` entry.
        train: DataFrame with ``product_name``, ``description``,
            ``image_url`` and ``ean`` columns. It is modified in place.

    Returns:
        None. The result is written directly into ``train``.

    Note:
        Every call compares the whole ``product_name`` column against one
        name, so calling this once per row scales quadratically with the
        number of rows.
    """
    # Build the mask once; it selects the row's whole duplicate group.
    same_name = train['product_name'] == row["product_name"]
    if not same_name.any():
        return

    for column in ('description', 'image_url', 'ean'):
        known_values = train.loc[same_name, column].dropna()
        if known_values.empty:
            # Nothing in this group to copy from.
            continue
        train.loc[same_name, column] = (
            train.loc[same_name, column].fillna(known_values.iloc[0])
        )
When I use `apply`, it takes forever to execute, so I tried pandarallel, but pandarallel doesn't support the lambda function and tells me that `fill_descriptions_images_ean_from_duplicates` is not defined:
from pandarallel import pandarallel
import psutil
# NOTE(review): no pandarallel.initialize(...) call is visible in this snippet;
# parallel_apply is presumably only available once it has run -- confirm.
# psutil is imported but not used in the lines shown here.
# The return value of parallel_apply is discarded, so this call relies entirely
# on the function's in-place writes to `train`.
# NOTE(review): those writes may not propagate back from worker processes -- verify.
train.parallel_apply(lambda row: fill_descriptions_images_ean_from_duplicates(row, train), axis=1)
So I tried using Dask, but nothing happened either; the progress bar is stuck:
def process_partition(df_partition, train):
    """Run the duplicate-fill helper once for every row of one partition.

    Args:
        df_partition: The chunk of rows to iterate over.
        train: The full DataFrame passed through to
            ``fill_descriptions_images_ean_from_duplicates`` for each row.

    Returns:
        ``df_partition`` itself, exactly as it was passed in.
    """
    def _fill_row(row):
        return fill_descriptions_images_ean_from_duplicates(row, train)

    # The frame produced by apply is not used; the call is made only for
    # whatever the helper does to `train`.
    df_partition.apply(_fill_row, axis=1)
    return df_partition
import dask.dataframe as dd
from dask.diagnostics import ProgressBar

# Split the pandas frame into 7 partitions that Dask handles as separate tasks.
dask_train = dd.from_pandas(train, npartitions=7)

# `meta` describes the output of the mapped function. process_partition returns
# the partition itself, so an empty frame with train's columns and dtypes is the
# matching description; a Series of dtypes would be read as Series metadata.
dask_df_applied = dask_train.map_partitions(
    lambda partition: process_partition(partition, train),
    meta=train.head(0),
)

# map_partitions only builds a lazy task graph; nothing runs until compute()
# is called, and it must be called inside the ProgressBar context to be tracked.
with ProgressBar():
    dask_df_applied.compute()
Sample data:
import pandas as pd
import numpy as np

# Set the random seed for reproducibility
np.random.seed(42)

# Generate random data.
# The choice pools are object arrays on purpose: a plain list such as
# [np.nan, 'Description'] is coerced by NumPy to a unicode array, which turns
# the missing value into the literal string 'nan' instead of a real NaN.
data = {
    'product_name': ['Product A', 'Product B', 'Product B', 'Product C', 'Product D'] * 20,
    'description': np.random.choice(np.array([np.nan, 'Description'], dtype=object), size=100),
    'image_url': np.random.choice(np.array([np.nan, 'image_url'], dtype=object), size=100),
    'ean': np.random.choice(np.array([np.nan, 'EAN123456'], dtype=object), size=100),
}

# Create the DataFrame
train = pd.DataFrame(data)
This is the best approach I could find; it reduces the run time to 15 minutes:
# Fill each missing description / image_url / ean with the first non-null
# value found among the rows that share the same product_name.
fill_columns = ['description', 'image_url', 'ean']

# GroupBy 'first' skips nulls, and transform broadcasts each group's value back
# onto the original row index, so all three columns are handled in one
# vectorised pass instead of one Python-level lambda call per group per column.
first_known = train.groupby('product_name')[fill_columns].transform('first')

# fillna only touches cells that are currently missing. Groups with no known
# value stay NaN, and rows whose product_name is itself NaN keep their values.
train[fill_columns] = train[fill_columns].fillna(first_known)