I have the following example dataset containing 3 dataframes:
base_pop_ex <-
structure(
list(
anon_id = c(
"0003ff12-03b1-42b9-86cf-4b7c05e3e3a7",
"0003ff12-03b1-42b9-86cf-4b7c05e3e3a7"
),
session_number = c(1,
2),
entrance_date = c("2021-06-28 11:43:21.633 Z", "2021-06-29 01:10:08.109 Z"),
single_article_session = c(0, 0)
),
.Names = c(
"anon_id",
"session_number",
"entrance_date",
"single_article_session"
),
row.names = c(NA,-2L),
class = c("tbl_df", "tbl", "data.frame")
)
ad_views_ex <-
structure(
list(
anon_id = c(
"0003ff12-03b1-42b9-86cf-4b7c05e3e3a7",
"0003ff12-03b1-42b9-86cf-4b7c05e3e3a7",
"0003ff12-03b1-42b9-86cf-4b7c05e3e3a7",
"0003ff12-03b1-42b9-86cf-4b7c05e3e3a7",
"0003ff12-03b1-42b9-86cf-4b7c05e3e3a7",
"0003ff12-03b1-42b9-86cf-4b7c05e3e3a7",
"0003ff12-03b1-42b9-86cf-4b7c05e3e3a7",
"0003ff12-03b1-42b9-86cf-4b7c05e3e3a7",
"0003ff12-03b1-42b9-86cf-4b7c05e3e3a7"
),
ad_view_date = c(
"2021-06-28 11:43:22.654 Z",
"2021-06-28 11:44:15.360 Z",
"2021-06-28 11:44:32.538 Z",
"2021-06-28 12:07:19.557 Z",
"2021-06-28 12:07:20.146 Z",
"2021-06-29 01:10:08.706 Z",
"2021-06-29 01:10:17.127 Z",
"2021-06-29 01:40:30.726 Z",
"2021-06-29 01:40:30.914 Z"
),
ad_call_count = c(3, 1, 1, 1, 3,
3, 1, 1, 3)
),
.Names = c("anon_id", "ad_view_date", "ad_call_count"),
row.names = c(NA,-9L),
class = c("tbl_df", "tbl", "data.frame")
)
scroll_depth_ex <-
structure(
list(
anon_id = c(
"0003ff12-03b1-42b9-86cf-4b7c05e3e3a7",
"0003ff12-03b1-42b9-86cf-4b7c05e3e3a7",
"0003ff12-03b1-42b9-86cf-4b7c05e3e3a7",
"0003ff12-03b1-42b9-86cf-4b7c05e3e3a7",
"0003ff12-03b1-42b9-86cf-4b7c05e3e3a7",
"0003ff12-03b1-42b9-86cf-4b7c05e3e3a7",
"0003ff12-03b1-42b9-86cf-4b7c05e3e3a7",
"0003ff12-03b1-42b9-86cf-4b7c05e3e3a7",
"0003ff12-03b1-42b9-86cf-4b7c05e3e3a7",
"0003ff12-03b1-42b9-86cf-4b7c05e3e3a7",
"0003ff12-03b1-42b9-86cf-4b7c05e3e3a7",
"0003ff12-03b1-42b9-86cf-4b7c05e3e3a7",
"0003ff12-03b1-42b9-86cf-4b7c05e3e3a7",
"0003ff12-03b1-42b9-86cf-4b7c05e3e3a7",
"0003ff12-03b1-42b9-86cf-4b7c05e3e3a7",
"0003ff12-03b1-42b9-86cf-4b7c05e3e3a7",
"0003ff12-03b1-42b9-86cf-4b7c05e3e3a7",
"0003ff12-03b1-42b9-86cf-4b7c05e3e3a7",
"0003ff12-03b1-42b9-86cf-4b7c05e3e3a7",
"0003ff12-03b1-42b9-86cf-4b7c05e3e3a7",
"0003ff12-03b1-42b9-86cf-4b7c05e3e3a7",
"0003ff12-03b1-42b9-86cf-4b7c05e3e3a7",
"0003ff12-03b1-42b9-86cf-4b7c05e3e3a7",
"0003ff12-03b1-42b9-86cf-4b7c05e3e3a7",
"0003ff12-03b1-42b9-86cf-4b7c05e3e3a7",
"0003ff12-03b1-42b9-86cf-4b7c05e3e3a7"
),
scroll_date = c(
"2021-06-28 11:43:38.263 Z",
"2021-06-28 11:43:41.593 Z",
"2021-06-28 11:43:48.882 Z",
"2021-06-28 11:43:49.339 Z",
"2021-06-28 11:43:52.270 Z",
"2021-06-28 11:43:57.995 Z",
"2021-06-28 11:44:15.324 Z",
"2021-06-28 11:44:16.955 Z",
"2021-06-28 11:44:30.284 Z",
"2021-06-28 11:44:44.197 Z",
"2021-06-28 12:07:19.564 Z",
"2021-06-28 12:07:19.581 Z",
"2021-06-28 12:07:19.593 Z",
"2021-06-28 12:07:19.600 Z",
"2021-06-28 12:07:19.617 Z",
"2021-06-28 12:07:19.639 Z",
"2021-06-28 12:07:19.648 Z",
"2021-06-28 12:07:19.664 Z",
"2021-06-29 01:10:13.401 Z",
"2021-06-29 01:10:25.065 Z",
"2021-06-29 01:11:02.595 Z",
"2021-06-29 01:11:45.444 Z",
"2021-06-29 01:40:30.741 Z",
"2021-06-29 01:40:30.747 Z",
"2021-06-29 01:40:30.903 Z",
"2021-06-29 01:40:30.909 Z"
),
scroll_depth = c(
10,
20,
30,
40,
50,
60,
70,
80,
90,
100,
10,
20,
30,
40,
50,
60,
70,
80,
10,
20,
30,
40,
10,
20,
30,
40
)
),
.Names = c("anon_id", "scroll_date",
"scroll_depth"),
row.names = c(NA,-26L),
class = c("tbl_df",
"tbl", "data.frame")
)
I want to join all three data frames, so at the end, I get anon_id
, entrance_date
, session_number
, ad_views
and scroll_depth
:
ad_views
is the sum of all ad_call_counts of the data frame ad_views_ex
where the ad_view_date
is greater than the entrance_date
from the table base_pop_ex
and, at the same time, the difference in minutes between both dates is smaller than 60scroll_depth
uses the same logic, for the join, as the previous metric. However, I am calculating the max for each group of events hereThe following code does its job:
library(tidyr)
library(lubridate)
combined_ex <- base_pop_ex %>%
left_join(ad_views_ex, by = c("anon_id")) %>%
filter(
entrance_date <= ad_view_date &
difftime(ad_view_date, entrance_date, units = "mins") <= 60
) %>%
group_by(anon_id, entrance_date, session_number) %>%
summarize(
ad_views = sum(ad_call_count, na.rm = TRUE)
)
)
combined_ex2 <- combined_ex %>%
left_join(scroll_depth_ex, by = c("anon_id")) %>%
filter(
entrance_date <= scroll_date &
difftime(scroll_date, entrance_date, units = "mins") <= 60
) %>%
group_by(anon_id, entrance_date, session_number, ad_views) %>%
summarize(
scroll_depth = max(scroll_depth, na.rm = TRUE)
)
)
with the final outcome of combined_ex2
this:
| anon_id | entrance_date | session_number | ad_views | scroll_depth |
|------------------------------------|----------------------------|----------------|----------|--------------|
|0003ff12-03b1-42b9-86cf-4b7c05e3e3a7| 2021-06-28 11:43:21.633 Z | 1 | 162 | 100 |
|0003ff12-03b1-42b9-86cf-4b7c05e3e3a7| 2021-06-29 01:10:08.109 Z | 2 | 64 | 40 |
However, when I scale that to my real data, Rstudio takes about 1 minute to create the first combined data frame, and 8 minutes to create the second combined data frame. My data contains 500K rows for base_pop_ex
, 1.4M rows for ad_views_ex
and 3.7M rows for scroll_depth_ex
, what I do not consider too much.
I think it is the filter()
step. Experiment with separating the two parts, putting the difftime()
second.
ex:
...
filter(entrance_date <= ad_view_date) %>%
filter(difftime(ad_view_date, entrance_date, units = "mins") <= 60) %>%
...