I have 2 dataframes: `prediction_df` and `purchase_info_df`. `prediction_df` contains a customer id and a prediction date. `purchase_info_df` contains a customer id, a purchase amount, and a purchase date. The dataframes are provided below for a single customer.
# Sample data for a single customer (id 1): three prediction dates and
# three purchases with their amounts.
customer_id = [1, 1, 1]
prediction_date = ["2022-12-30", "2022-11-30", "2022-10-30"]
purchase_date = ["2022-11-12", "2022-12-01", "2022-09-03"]
purchase_amount = [500, 300, 100]

# Build both frames with the date columns parsed to datetime64 up front,
# so date comparisons below work directly.
prediction_df = pd.DataFrame(
    {"id": customer_id, "prediction_date": pd.to_datetime(prediction_date)}
)
purchase_info_df = pd.DataFrame(
    {
        "id": customer_id,
        "purchase_date": pd.to_datetime(purchase_date),
        "purchase_amount": purchase_amount,
    }
)
My aim is to create features such as total purchase, mean purchase amount, purchase amount in the last month, etc., as of the prediction_date. I can do this with the following code, which uses iterrows. This is far too slow when I have over 100,000 customers. I am looking for a way to vectorize the operations in the code below so that it runs faster.
# Per-prediction-row purchase features, computed as of each prediction_date.
# Fixes over the original:
#   * `dep_std` was computed every iteration but never appended -> removed.
#   * The full purchase table was re-filtered by `id` nine times per row
#     (O(rows * purchases)); the per-customer split is now done ONCE via
#     groupby, and each window filter reuses the already-reduced history.
#   * `relativedelta(days=n)` (dateutil) replaced with the equivalent
#     pd.Timedelta(days=n), dropping the extra dependency.
# Output columns and values are unchanged.
res = []
# Split purchases by customer once, outside the loop.
purchases_by_id = dict(tuple(purchase_info_df.groupby("id")))
window_days = [15, 30, 45, 60]
for idx, rw in tqdm_notebook(prediction_df.iterrows(), total=prediction_df.shape[0]):
    cust = purchases_by_id.get(rw.id)
    if cust is None:
        # Customer with no purchases: empty slice so the aggregations
        # below yield sum=0, mean=NaN, count=0 (same as the original).
        hist = purchase_info_df.iloc[0:0]
    else:
        # Only purchases known at prediction time.
        hist = cust[cust.purchase_date <= rw.prediction_date]
    dep_sum = hist.purchase_amount.sum()
    dep_mean = hist.purchase_amount.mean()
    dep_count = hist.purchase_amount.count()
    window_sums = []
    window_counts = []
    for days in window_days:
        cutoff = rw.prediction_date - pd.Timedelta(days=days)
        # `hist` is already restricted to this customer and to
        # purchase_date <= prediction_date, so one comparison suffices.
        recent = hist[hist.purchase_date >= cutoff]
        window_sums.append(recent.purchase_amount.sum())
        window_counts.append(recent.purchase_amount.count())
    res.append(
        [rw.id, rw.prediction_date, dep_sum, dep_mean, dep_count]
        + window_sums
        + window_counts
    )
output = pd.DataFrame(res, columns = ["id",
"prediction_date",
"amount_sum",
"amount_mean",
"purchase_count",
"last_15_days_dep_amount",
"last_30_days_dep_amount",
"last_45_days_dep_amount",
"last_60_days_dep_amount",
"last_15_days_dep_count",
"last_30_days_dep_count",
"last_45_days_dep_count",
"last_60_days_dep_count"])
Try this approach instead:
# Join every prediction row to all of that customer's purchases, then keep
# only purchases that happened on or before the prediction date.
# Depending on how big the two frames are, your computer may run out of
# memory (the intermediate merge is |predictions per id| x |purchases per id|).
merged = prediction_df.merge(purchase_info_df, on="id")
df = merged[merged["purchase_date"] <= merged["prediction_date"]]

cols = ["id", "prediction_date"]

# For each customer on each prediction date, calculate overall stats.
stat0 = df.groupby(cols)["purchase_amount"].agg(["sum", "mean", "count"])

# Now the same stats restricted to trailing time windows.
stats = {
    f"last_{days}_days": (
        df[df["purchase_date"] >= df["prediction_date"] - pd.Timedelta(days=days)]
        .groupby(cols)["purchase_amount"]
        .agg(["sum", "count"])
    )
    for days in (15, 30, 45, 60)
}

# Combine the individual stats for the final result; groups absent from a
# window become 0 instead of NaN.
result = pd.concat(
    [stat0, *stats.values()],
    keys=["all", *stats.keys()],
    axis=1,
).fillna(0)