I have a dataset of messages being sent between two people (user A and user B) on different chats over time. I need to figure out how to track the datetime of the last, most recent user A message datetime in relation to the next messages from user B, regardless of how many sequential back-to-back messages are sent by user B. For example, let's say I have the following data:
chatID userID message_date
1 A 2023-10-11 14:32:39
1 B 2023-10-11 14:34:14
1 A 2023-10-11 14:37:22
1 A 2023-10-11 14:38:48
1 B 2023-10-11 14:42:07
1 A 2023-10-11 14:43:58
1 B 2023-10-11 14:45:36
1 B 2023-10-11 14:46:11
1 A 2023-10-11 14:50:08
1 B 2023-10-11 14:52:17
2 A 2023-10-17 09:10:28
2 A 2023-10-17 09:11:54
2 A 2023-10-17 09:12:36
2 B 2023-10-17 09:18:47
2 B 2023-10-17 09:19:22
2 A 2023-10-17 09:22:03
2 B 2023-10-17 09:24:50
2 B 2023-10-17 09:28:16
2 A 2023-10-17 09:32:07
2 A 2023-10-17 09:33:59
2 A 2023-10-17 09:34:09
2 B 2023-10-17 09:40:21
2 A 2023-10-17 09:44:48
2 B 2023-10-17 09:45:57
2 B 2023-10-17 09:46:13
2 A 2023-10-17 09:52:43
2 B 2023-10-17 09:54:01
What I am hoping to calculate is this:
chatID userID message_date need_last_texter_A_message_date
1 A 2023-10-11 14:32:39 <NA>
1 B 2023-10-11 14:34:14 2023-10-11 14:32:39
1 A 2023-10-11 14:37:22 <NA>
1 A 2023-10-11 14:38:48 <NA>
1 B 2023-10-11 14:42:07 2023-10-11 14:38:48
1 A 2023-10-11 14:43:58 <NA>
1 B 2023-10-11 14:45:36 2023-10-11 14:43:58
1 B 2023-10-11 14:46:11 2023-10-11 14:43:58
1 A 2023-10-11 14:50:08 <NA>
1 B 2023-10-11 14:52:17 2023-10-11 14:50:08
2 A 2023-10-17 09:10:28 <NA>
2 A 2023-10-17 09:11:54 <NA>
2 A 2023-10-17 09:12:36 <NA>
2 B 2023-10-17 09:18:47 2023-10-17 09:12:36
2 B 2023-10-17 09:19:22 2023-10-17 09:12:36
2 A 2023-10-17 09:22:03 <NA>
2 B 2023-10-17 09:24:50 2023-10-17 09:22:03
2 B 2023-10-17 09:28:16 2023-10-17 09:22:03
2 A 2023-10-17 09:32:07 <NA>
2 A 2023-10-17 09:33:59 <NA>
2 A 2023-10-17 09:34:09 <NA>
2 B 2023-10-17 09:40:21 2023-10-17 09:34:09
2 A 2023-10-17 09:44:48 <NA>
2 B 2023-10-17 09:45:57 2023-10-17 09:44:48
2 B 2023-10-17 09:46:13 2023-10-17 09:44:48
2 A 2023-10-17 09:52:43 <NA>
2 B 2023-10-17 09:54:01 2023-10-17 09:52:43
Here is a reproducible example of the data and outcome I'm looking for:
data = structure(list(chatID = c(1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2,
2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2), userID = c("A",
"B", "A", "A", "B", "A", "B", "B", "A", "B", "A", "A", "A", "B",
"B", "A", "B", "B", "A", "A", "A", "B", "A", "B", "B", "A", "B"
), message_date = structure(c(1697034759, 1697034854, 1697035042,
1697035128, 1697035327, 1697035438, 1697035536, 1697035571, 1697035808,
1697035937, 1697533828, 1697533914, 1697533956, 1697534327, 1697534362,
1697534523, 1697534690, 1697534896, 1697535127, 1697535239, 1697535249,
1697535621, 1697535888, 1697535957, 1697535973, 1697536363, 1697536441
), class = c("POSIXct", "POSIXt"), tzone = "UTC"), need_last_texter_A_message_date = structure(c(NA,
1697034759, NA, NA, 1697035128, NA, 1697035438, 1697035438, NA,
1697035808, NA, NA, NA, 1697533956, 1697533956, NA, 1697534523,
1697534523, NA, NA, NA, 1697535249, NA, 1697535888, 1697535888,
NA, 1697536363), class = c("POSIXct", "POSIXt"), tzone = "UTC")), class = c("tbl_df",
"tbl", "data.frame"), row.names = c(NA, -27L))
How can I do this? Any help is appreciated!
The left_join(), join_by() and closest() functions in dplyr can be used together to find the nearest match. I made a new data frame of only the A rows to use as the second data frame in the join and I changed its userID column to B so I could simply match on that column. Read the help on join_by() to understand why I used a >= comparison for the message_date. I left your need_last_texter_A_message_date in place for comparison.
library(tidyverse)
data = structure(list(chatID = c(1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2,
2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2), userID = c("A",
"B", "A", "A", "B", "A", "B", "B", "A", "B", "A", "A", "A", "B",
"B", "A", "B", "B", "A", "A", "A", "B", "A", "B", "B", "A", "B"
), message_date = structure(c(1697034759, 1697034854, 1697035042,
1697035128, 1697035327, 1697035438, 1697035536, 1697035571, 1697035808,
1697035937, 1697533828, 1697533914, 1697533956, 1697534327, 1697534362,
1697534523, 1697534690, 1697534896, 1697535127, 1697535239, 1697535249,
1697535621, 1697535888, 1697535957, 1697535973, 1697536363, 1697536441
), class = c("POSIXct", "POSIXt"), tzone = "UTC"), need_last_texter_A_message_date = structure(c(NA,
1697034759, NA, NA, 1697035128, NA, 1697035438, 1697035438, NA,
1697035808, NA, NA, NA, 1697533956, 1697533956, NA, 1697534523,
1697534523, NA, NA, NA, 1697535249, NA, 1697535888, 1697535888,
NA, 1697536363), class = c("POSIXct", "POSIXt"), tzone = "UTC")), class = c("tbl_df",
"tbl", "data.frame"), row.names = c(NA, -27L))
A_rows <- data |> filter(userID == "A") |> mutate(userID = "B") |>
select(userID, message_date)
OUT <- left_join(data, A_rows,
by = join_by(userID, closest(message_date >= message_date)))
OUT
#> # A tibble: 27 × 5
#> chatID userID message_date.x need_last_texter_A_me…¹ message_date.y
#> <dbl> <chr> <dttm> <dttm> <dttm>
#> 1 1 A 2023-10-11 14:32:39 NA NA
#> 2 1 B 2023-10-11 14:34:14 2023-10-11 14:32:39 2023-10-11 14:32:39
#> 3 1 A 2023-10-11 14:37:22 NA NA
#> 4 1 A 2023-10-11 14:38:48 NA NA
#> 5 1 B 2023-10-11 14:42:07 2023-10-11 14:38:48 2023-10-11 14:38:48
#> 6 1 A 2023-10-11 14:43:58 NA NA
#> 7 1 B 2023-10-11 14:45:36 2023-10-11 14:43:58 2023-10-11 14:43:58
#> 8 1 B 2023-10-11 14:46:11 2023-10-11 14:43:58 2023-10-11 14:43:58
#> 9 1 A 2023-10-11 14:50:08 NA NA
#> 10 1 B 2023-10-11 14:52:17 2023-10-11 14:50:08 2023-10-11 14:50:08
#> # ℹ 17 more rows
#> # ℹ abbreviated name: ¹need_last_texter_A_message_date
Created on 2025-01-27 with reprex v2.1.1