R underperforming on dplyr summarize with multiple joins and filters

I have the following example dataset containing 3 dataframes:

base_pop_ex <-
  structure(
    list(
      anon_id = c(
        "0003ff12-03b1-42b9-86cf-4b7c05e3e3a7",
        "0003ff12-03b1-42b9-86cf-4b7c05e3e3a7"
      ),
      session_number = c(1,
                         2),
      entrance_date = c("2021-06-28 11:43:21.633 Z", "2021-06-29 01:10:08.109 Z"),
      single_article_session = c(0, 0)
    ),
    .Names = c(
      "anon_id",
      "session_number",
      "entrance_date",
      "single_article_session"
    ),
    row.names = c(NA,-2L),
    class = c("tbl_df", "tbl", "data.frame")
  )

ad_views_ex <-
  structure(
    list(
      anon_id = c(
        "0003ff12-03b1-42b9-86cf-4b7c05e3e3a7",
        "0003ff12-03b1-42b9-86cf-4b7c05e3e3a7",
        "0003ff12-03b1-42b9-86cf-4b7c05e3e3a7",
        "0003ff12-03b1-42b9-86cf-4b7c05e3e3a7",
        "0003ff12-03b1-42b9-86cf-4b7c05e3e3a7",
        "0003ff12-03b1-42b9-86cf-4b7c05e3e3a7",
        "0003ff12-03b1-42b9-86cf-4b7c05e3e3a7",
        "0003ff12-03b1-42b9-86cf-4b7c05e3e3a7",
        "0003ff12-03b1-42b9-86cf-4b7c05e3e3a7"
      ),
      ad_view_date = c(
        "2021-06-28 11:43:22.654 Z",
        "2021-06-28 11:44:15.360 Z",
        "2021-06-28 11:44:32.538 Z",
        "2021-06-28 12:07:19.557 Z",
        "2021-06-28 12:07:20.146 Z",
        "2021-06-29 01:10:08.706 Z",
        "2021-06-29 01:10:17.127 Z",
        "2021-06-29 01:40:30.726 Z",
        "2021-06-29 01:40:30.914 Z"
      ),
      ad_call_count = c(3, 1, 1, 1, 3,
                        3, 1, 1, 3)
    ),
    .Names = c("anon_id", "ad_view_date", "ad_call_count"),
    row.names = c(NA,-9L),
    class = c("tbl_df", "tbl", "data.frame")
  )

scroll_depth_ex <-
  structure(
    list(
      anon_id = c(
        "0003ff12-03b1-42b9-86cf-4b7c05e3e3a7",
        "0003ff12-03b1-42b9-86cf-4b7c05e3e3a7",
        "0003ff12-03b1-42b9-86cf-4b7c05e3e3a7",
        "0003ff12-03b1-42b9-86cf-4b7c05e3e3a7",
        "0003ff12-03b1-42b9-86cf-4b7c05e3e3a7",
        "0003ff12-03b1-42b9-86cf-4b7c05e3e3a7",
        "0003ff12-03b1-42b9-86cf-4b7c05e3e3a7",
        "0003ff12-03b1-42b9-86cf-4b7c05e3e3a7",
        "0003ff12-03b1-42b9-86cf-4b7c05e3e3a7",
        "0003ff12-03b1-42b9-86cf-4b7c05e3e3a7",
        "0003ff12-03b1-42b9-86cf-4b7c05e3e3a7",
        "0003ff12-03b1-42b9-86cf-4b7c05e3e3a7",
        "0003ff12-03b1-42b9-86cf-4b7c05e3e3a7",
        "0003ff12-03b1-42b9-86cf-4b7c05e3e3a7",
        "0003ff12-03b1-42b9-86cf-4b7c05e3e3a7",
        "0003ff12-03b1-42b9-86cf-4b7c05e3e3a7",
        "0003ff12-03b1-42b9-86cf-4b7c05e3e3a7",
        "0003ff12-03b1-42b9-86cf-4b7c05e3e3a7",
        "0003ff12-03b1-42b9-86cf-4b7c05e3e3a7",
        "0003ff12-03b1-42b9-86cf-4b7c05e3e3a7",
        "0003ff12-03b1-42b9-86cf-4b7c05e3e3a7",
        "0003ff12-03b1-42b9-86cf-4b7c05e3e3a7",
        "0003ff12-03b1-42b9-86cf-4b7c05e3e3a7",
        "0003ff12-03b1-42b9-86cf-4b7c05e3e3a7",
        "0003ff12-03b1-42b9-86cf-4b7c05e3e3a7",
        "0003ff12-03b1-42b9-86cf-4b7c05e3e3a7"
      ),
      scroll_date = c(
        "2021-06-28 11:43:38.263 Z",
        "2021-06-28 11:43:41.593 Z",
        "2021-06-28 11:43:48.882 Z",
        "2021-06-28 11:43:49.339 Z",
        "2021-06-28 11:43:52.270 Z",
        "2021-06-28 11:43:57.995 Z",
        "2021-06-28 11:44:15.324 Z",
        "2021-06-28 11:44:16.955 Z",
        "2021-06-28 11:44:30.284 Z",
        "2021-06-28 11:44:44.197 Z",
        "2021-06-28 12:07:19.564 Z",
        "2021-06-28 12:07:19.581 Z",
        "2021-06-28 12:07:19.593 Z",
        "2021-06-28 12:07:19.600 Z",
        "2021-06-28 12:07:19.617 Z",
        "2021-06-28 12:07:19.639 Z",
        "2021-06-28 12:07:19.648 Z",
        "2021-06-28 12:07:19.664 Z",
        "2021-06-29 01:10:13.401 Z",
        "2021-06-29 01:10:25.065 Z",
        "2021-06-29 01:11:02.595 Z",
        "2021-06-29 01:11:45.444 Z",
        "2021-06-29 01:40:30.741 Z",
        "2021-06-29 01:40:30.747 Z",
        "2021-06-29 01:40:30.903 Z",
        "2021-06-29 01:40:30.909 Z"
      ),
      scroll_depth = c(
        10,
        20,
        30,
        40,
        50,
        60,
        70,
        80,
        90,
        100,
        10,
        20,
        30,
        40,
        50,
        60,
        70,
        80,
        10,
        20,
        30,
        40,
        10,
        20,
        30,
        40
      )
    ),
    .Names = c("anon_id", "scroll_date",
               "scroll_depth"),
    row.names = c(NA,-26L),
    class = c("tbl_df",
              "tbl", "data.frame")
  )

I want to join all three data frames, so at the end, I get anon_id, entrance_date, session_number, ad_views and scroll_depth:

ad_views is the sum of all ad_call_counts of the data frame ad_views_ex where the ad_view_date is greater than the entrance_date from the table base_pop_ex and, at the same time, the difference in minutes between both dates is smaller than 60
scroll_depth uses the same logic, for the join, as the previous metric. However, I am calculating the max for each group of events here

The following code does its job:

library(tidyr)
library(lubridate)

combined_ex <- base_pop_ex %>%
      left_join(ad_views_ex, by = c("anon_id")) %>%
      filter(
        entrance_date <= ad_view_date &
          difftime(ad_view_date, entrance_date, units = "mins") <= 60
      ) %>%
      group_by(anon_id, entrance_date, session_number) %>%
      summarize(
        ad_views = sum(ad_call_count, na.rm = TRUE)
        )
      )
    
    combined_ex2 <- combined_ex %>%
      left_join(scroll_depth_ex, by = c("anon_id")) %>%
      filter(
        entrance_date <= scroll_date &
          difftime(scroll_date, entrance_date, units = "mins") <= 60
      ) %>%
      group_by(anon_id, entrance_date, session_number, ad_views) %>%
      summarize(
        scroll_depth = max(scroll_depth, na.rm = TRUE)
      )
    )

with the final outcome of combined_ex2 this:

|               anon_id              |       entrance_date        | session_number | ad_views | scroll_depth |
|------------------------------------|----------------------------|----------------|----------|--------------|
|0003ff12-03b1-42b9-86cf-4b7c05e3e3a7| 2021-06-28 11:43:21.633 Z  |       1        |   162    |      100     |      
|0003ff12-03b1-42b9-86cf-4b7c05e3e3a7| 2021-06-29 01:10:08.109 Z  |       2        |    64    |      40      |

However, when I scale that to my real data, Rstudio takes about 1 minute to create the first combined data frame, and 8 minutes to create the second combined data frame. My data contains 500K rows for base_pop_ex, 1.4M rows for ad_views_ex and 3.7M rows for scroll_depth_ex, what I do not consider too much.

Can anyone tell me why my code is underperforming on my data?
Also, is there a way to do the same job without having to split the join, group, and summarizes in two steps?

Solution

I think it is the filter() step. Experiment with separating the two parts, putting the difftime() second.

ex:

  ...
  filter(entrance_date <= ad_view_date) %>%
  filter(difftime(ad_view_date, entrance_date, units = "mins") <= 60) %>%
  ...