I am trying to use datetime stamps from one dataframe to filter and summarise data from another dataframe. This is also to be done by a grouping variable (a UserId). Here are some fictional examples of the data I am working with:-
df1
df1<-structure(list(UserId = c("6i9Gla", "6i9Gla", "6i9Gla", "6i9Gla",
"6i9Gla", "6i9Gla", "59hGIY", "59hGIY", "LzDaPX", "LzDaPX", "LzDaPX",
"LzDaPX", "LzDaPX", "LzDaPX", "LzDaPX", "LzDaPX", "LzDaPX", "LzDaPX",
"o0fsPt", "o0fsPt", "o0fsPt", "o0fsPt", "o0fsPt", "o0fsPt", "o0fsPt",
"o0fsPt", "o0fsPt", "o0fsPt", "o0fsPt", "o0fsPt", "o0fsPt", "o0fsPt",
"o0fsPt", "o0fsPt", "o0fsPt", "o0fsPt", "o3wUUC", "o3wUUC", "o3wUUC",
"gXbJAq", "gXbJAq"), Duration = c(632L, 167L, 868L, 27L, 309L,
671L, 7L, 8L, 7L, 19L, 81L, 600L, 391L, 615L, 332L, 197L, 168L,
27L, 836L, 257L, 24L, 555L, 99L, 286L, 387L, 11L, 79L, 181L,
293L, 126L, 6L, 10L, 1247L, 259L, 11L, 547L, 28L, 19L, 17L, 7L,
10L), Genre = c("Sport", "Sport", "Sport", "Sport", "Sport",
"Sport", "Sport", "Sport", "Sport", "Sport", "Sport", "Sport",
"Sport", "Sport", "Sport", "Sport", "Sport", "Sport", "Sport",
"Sport", "Sport", "Sport", "Sport", "Sport", "Sport", "Sport",
"Sport", "Sport", "Sport", "Sport", "Sport", "Sport", "Sport",
"Sport", "Sport", "Sport", "Sport", "Sport", "Sport", "Sport",
"Sport"), DateTime_Start = structure(c(1614292441.754, 1614291282.352,
1614291509.308, 1614288742.042, 1614294373.856, 1614293122.735,
1614294911.325, 1614289403.922, 1614289358.205, 1614290574.724,
1614293909.406, 1614295977.859, 1614294049.531, 1614294711.345,
1614295613.728, 1614294488.27, 1614295400.927, 1614293277.01,
1614290364.352, 1614293571.4, 1614293873.58, 1614292529.847,
1614291353.127, 1614296784.14, 1614295424.46, 1614294012.164,
1614293909.164, 1614292191.902, 1614291777.142, 1614295930.443,
1614292521.197, 1614291208.605, 1614294092.039, 1614293283.587,
1614294040.841, 1614296214.851, 1614292701.846, 1614296929.017,
1614294151.79, 1614292835.834, 1614288948.473), class = c("POSIXct",
"POSIXt"), tzone = "UTC")), row.names = c(NA, -41L), class = "data.frame")
head(df1)
UserId Duration Genre DateTime_Start
1 6i9Gla 632 Sport 2021-02-25 22:34:01
2 6i9Gla 167 Sport 2021-02-25 22:14:42
3 6i9Gla 868 Sport 2021-02-25 22:18:29
4 6i9Gla 27 Sport 2021-02-25 21:32:22
5 6i9Gla 309 Sport 2021-02-25 23:06:13
6 6i9Gla 671 Sport 2021-02-25 22:45:22
df2
df2<-structure(list(UserId = c("6i9Gla", "59hGIY", "LzDaPX", "o0fsPt",
"o3wUUC", "gXbJAq"), OrigTime = structure(c(1614288742.042, 1614289403.922,
1614289358.205, 1614290364.352, 1614292701.846, 1614288948.473
), class = c("POSIXct", "POSIXt"), tzone = "UTC"), LastTime = structure(c(1614291509.308,
1614289403.922, 1614290574.724, 1614293909.164, 1614294151.79,
1614288948.473), class = c("POSIXct", "POSIXt"), tzone = "UTC"),
events_recount = c(3L, 1L, 2L, 11L, 2L, 1L)), row.names = c(NA,
-6L), class = "data.frame")
head(df2)
UserId OrigTime LastTime events_recount
1 6i9Gla 2021-02-25 21:32:22 2021-02-25 22:18:29 3
2 59hGIY 2021-02-25 21:43:23 2021-02-25 21:43:23 1
3 LzDaPX 2021-02-25 21:42:38 2021-02-25 22:02:54 2
4 o0fsPt 2021-02-25 21:59:24 2021-02-25 22:58:29 11
5 o3wUUC 2021-02-25 22:38:21 2021-02-25 23:02:31 2
6 gXbJAq 2021-02-25 21:35:48 2021-02-25 21:35:48 1
Essentially, I am trying to group by UserId and then filter and summarise the rows in df1 that occur between OrigTime
and LastTime
in df2. Here is what I want to summarise between the filtered rows:-
Duration
column in df1)Duration
column in df1)Genre
in df1Genre
change (I know in this answer, it will be zero, but I need to apply the solution to a larger, more diverse real-world datasetAfter this, I would like the output of the filtering and summary to be merged back onto df2.
Any pointers on how to do this? Much appreciated!
library(data.table)
# Convert df1 and df2 to data.table format
# keep rownames in column 'rn' and set keys
setDT(df1, keep.rownames = "rn1")
setDT(df2, keep.rownames = "rn2")
# create an end-dummyvariable in df1
df1[, temp_time := DateTime_Start]
# set keys
setkey(df1, UserId, DateTime_Start, temp_time)
setkey(df2, UserId, OrigTime, LastTime)
# perform overlap join
answer <- foverlaps(df2, df1)
# now you can summarise based on this data.table