Search code examples
python · pandas · dataframe · for-loop · vectorization

Vectorized Solution to Iterrows


I have 2 dataframes : prediction_df and purchase_info_df. prediction_df contains customer id and prediction date. purchase_info_df contains customer id, purchase amount and purchase date. The dataframes are provided below for a single customer.

# Example data for a single customer (id=1): three prediction dates and
# three purchases.
customer_id = [1, 1, 1]
prediction_date = ["2022-12-30", "2022-11-30", "2022-10-30"]
purchase_date = ["2022-11-12", "2022-12-01", "2022-09-03"]
purchase_amount = [500, 300, 100]

# Parse the date strings while building the frames so both date columns
# are datetime64 from the start.
prediction_df = pd.DataFrame(
    {"id": customer_id, "prediction_date": pd.to_datetime(prediction_date)}
)
purchase_info_df = pd.DataFrame(
    {
        "id": customer_id,
        "purchase_date": pd.to_datetime(purchase_date),
        "purchase_amount": purchase_amount,
    }
)

My aim is to create features such as: total purchase, mean purchase amount, purchase amount in the last month etc. on the prediction_date. I can do this with the following code, which uses iterrows. This is way too slow when I have over 100,000 customers. I am looking for a way to vectorize the operations in the code below so that it runs faster.


# Build per-(customer, prediction_date) purchase features.
# Fixes vs. the original:
#   * the customer/date filter was recomputed nine times per row; it is now
#     computed once and reused,
#   * dep_std was computed but never written to the output (dead work) and
#     is dropped,
#   * the four copy-pasted trailing-window filters are one parameterized loop,
#   * pd.Timedelta replaces dateutil's relativedelta (equivalent for whole
#     days, no extra dependency),
#   * itertuples replaces the slower iterrows.
# The resulting `output` frame has exactly the same columns and values as
# before.
res = []
for rw in tqdm_notebook(prediction_df.itertuples(index=False), total=prediction_df.shape[0]):
    # All purchases by this customer on or before the prediction date.
    hist = purchase_info_df[
        (purchase_info_df.id == rw.id)
        & (purchase_info_df.purchase_date <= rw.prediction_date)
    ]
    amounts = hist.purchase_amount

    row = [rw.id, rw.prediction_date, amounts.sum(), amounts.mean(), amounts.count()]

    # Trailing-window stats; column order must match the original output:
    # all window sums first, then all window counts.
    window_sums, window_counts = [], []
    for days in (15, 30, 45, 60):
        in_window = amounts[
            hist.purchase_date >= rw.prediction_date - pd.Timedelta(days=days)
        ]
        window_sums.append(in_window.sum())
        window_counts.append(in_window.count())

    res.append(row + window_sums + window_counts)

output = pd.DataFrame(
    res,
    columns=[
        "id",
        "prediction_date",
        "amount_sum",
        "amount_mean",
        "purchase_count",
        "last_15_days_dep_amount",
        "last_30_days_dep_amount",
        "last_45_days_dep_amount",
        "last_60_days_dep_amount",
        "last_15_days_dep_count",
        "last_30_days_dep_count",
        "last_45_days_dep_count",
        "last_60_days_dep_count",
    ],
)

Solution

  • Try this:

    # Pair every prediction row with all of that customer's purchases, then
    # keep only purchases made on or before the prediction date.
    # NOTE: depending on how big the two frames are, this merge may run the
    # machine out of memory.
    merged = prediction_df.merge(purchase_info_df, on="id")
    merged = merged[merged["purchase_date"] <= merged["prediction_date"]]

    group_cols = ["id", "prediction_date"]

    # Overall stats for each customer on each prediction date.
    overall = merged.groupby(group_cols)["purchase_amount"].agg(["sum", "mean", "count"])

    # The same stats restricted to trailing 15/30/45/60-day windows.
    windowed = {
        f"last_{days}_days": (
            merged[merged["purchase_date"] >= merged["prediction_date"] - pd.Timedelta(days=days)]
            .groupby(group_cols)["purchase_amount"]
            .agg(["sum", "count"])
        )
        for days in (15, 30, 45, 60)
    }

    # Stitch the stat families together side by side, one column group each.
    result = (
        pd.concat([overall, *windowed.values()], keys=["all", *windowed.keys()], axis=1)
        .fillna(0)
    )