Search code examples
rdatetimedplyrtimestamprolling-computation

R calculate timestamp of last message based on user ID of who sent message


I have a dataset of messages being sent between two people (user A and user B) on different chats over time. I need to figure out how to track the datetime of the last, most recent user A message datetime in relation to the next messages from user B, regardless of how many sequential back-to-back messages are sent by user B. For example, let's say I have the following data:

chatID  userID  message_date
1   A   2023-10-11 14:32:39
1   B   2023-10-11 14:34:14
1   A   2023-10-11 14:37:22
1   A   2023-10-11 14:38:48
1   B   2023-10-11 14:42:07
1   A   2023-10-11 14:43:58
1   B   2023-10-11 14:45:36
1   B   2023-10-11 14:46:11
1   A   2023-10-11 14:50:08
1   B   2023-10-11 14:52:17
2   A   2023-10-17 09:10:28
2   A   2023-10-17 09:11:54
2   A   2023-10-17 09:12:36
2   B   2023-10-17 09:18:47
2   B   2023-10-17 09:19:22
2   A   2023-10-17 09:22:03
2   B   2023-10-17 09:24:50
2   B   2023-10-17 09:28:16
2   A   2023-10-17 09:32:07
2   A   2023-10-17 09:33:59
2   A   2023-10-17 09:34:09
2   B   2023-10-17 09:40:21
2   A   2023-10-17 09:44:48
2   B   2023-10-17 09:45:57
2   B   2023-10-17 09:46:13
2   A   2023-10-17 09:52:43
2   B   2023-10-17 09:54:01

What I am hoping to calculate is this:

chatID  userID     message_date need_last_texter_A_message_date
1   A   2023-10-11 14:32:39     <NA>
1   B   2023-10-11 14:34:14     2023-10-11 14:32:39
1   A   2023-10-11 14:37:22     <NA>
1   A   2023-10-11 14:38:48     <NA>
1   B   2023-10-11 14:42:07     2023-10-11 14:38:48
1   A   2023-10-11 14:43:58     <NA>
1   B   2023-10-11 14:45:36     2023-10-11 14:43:58
1   B   2023-10-11 14:46:11     2023-10-11 14:43:58
1   A   2023-10-11 14:50:08     <NA>
1   B   2023-10-11 14:52:17     2023-10-11 14:50:08
2   A   2023-10-17 09:10:28     <NA>
2   A   2023-10-17 09:11:54     <NA>
2   A   2023-10-17 09:12:36     <NA>
2   B   2023-10-17 09:18:47     2023-10-17 09:12:36
2   B   2023-10-17 09:19:22     2023-10-17 09:12:36
2   A   2023-10-17 09:22:03     <NA>
2   B   2023-10-17 09:24:50     2023-10-17 09:22:03
2   B   2023-10-17 09:28:16     2023-10-17 09:22:03
2   A   2023-10-17 09:32:07     <NA>
2   A   2023-10-17 09:33:59     <NA>
2   A   2023-10-17 09:34:09     <NA>
2   B   2023-10-17 09:40:21     2023-10-17 09:34:09
2   A   2023-10-17 09:44:48     <NA>
2   B   2023-10-17 09:45:57     2023-10-17 09:44:48
2   B   2023-10-17 09:46:13     2023-10-17 09:44:48
2   A   2023-10-17 09:52:43     <NA>
2   B   2023-10-17 09:54:01     2023-10-17 09:52:43

Here is a reproducible example of the data and outcome I'm looking for:

data = structure(list(chatID = c(1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 
2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2), userID = c("A", 
"B", "A", "A", "B", "A", "B", "B", "A", "B", "A", "A", "A", "B", 
"B", "A", "B", "B", "A", "A", "A", "B", "A", "B", "B", "A", "B"
), message_date = structure(c(1697034759, 1697034854, 1697035042, 
1697035128, 1697035327, 1697035438, 1697035536, 1697035571, 1697035808, 
1697035937, 1697533828, 1697533914, 1697533956, 1697534327, 1697534362, 
1697534523, 1697534690, 1697534896, 1697535127, 1697535239, 1697535249, 
1697535621, 1697535888, 1697535957, 1697535973, 1697536363, 1697536441
), class = c("POSIXct", "POSIXt"), tzone = "UTC"), need_last_texter_A_message_date = structure(c(NA, 
1697034759, NA, NA, 1697035128, NA, 1697035438, 1697035438, NA, 
1697035808, NA, NA, NA, 1697533956, 1697533956, NA, 1697534523, 
1697534523, NA, NA, NA, 1697535249, NA, 1697535888, 1697535888, 
NA, 1697536363), class = c("POSIXct", "POSIXt"), tzone = "UTC")), class = c("tbl_df", 
"tbl", "data.frame"), row.names = c(NA, -27L))

How can I do this? Any help is appreciated!


Solution

  • The left_join(), join_by() and closest() functions in dplyr can be used together to find the nearest match. I made a new data frame of only the A rows to use as the second data frame in the join and I changed its userID column to B so I could simply match on that column. Read the help on join_by() to understand why I used a >= comparison for the message_date. I left your need_last_texter_A_message_date in place for comparison.

    library(tidyverse)
    data = structure(list(chatID = c(1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 
                                     2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2), userID = c("A", 
                                                                                              "B", "A", "A", "B", "A", "B", "B", "A", "B", "A", "A", "A", "B", 
                                                                                              "B", "A", "B", "B", "A", "A", "A", "B", "A", "B", "B", "A", "B"
                                     ), message_date = structure(c(1697034759, 1697034854, 1697035042, 
                                                                   1697035128, 1697035327, 1697035438, 1697035536, 1697035571, 1697035808, 
                                                                   1697035937, 1697533828, 1697533914, 1697533956, 1697534327, 1697534362, 
                                                                   1697534523, 1697534690, 1697534896, 1697535127, 1697535239, 1697535249, 
                                                                   1697535621, 1697535888, 1697535957, 1697535973, 1697536363, 1697536441
                                     ), class = c("POSIXct", "POSIXt"), tzone = "UTC"), need_last_texter_A_message_date = structure(c(NA, 
                                                                                                                                      1697034759, NA, NA, 1697035128, NA, 1697035438, 1697035438, NA, 
                                                                                                                                      1697035808, NA, NA, NA, 1697533956, 1697533956, NA, 1697534523, 
                                                                                                                                      1697534523, NA, NA, NA, 1697535249, NA, 1697535888, 1697535888, 
                                                                                                                                      NA, 1697536363), class = c("POSIXct", "POSIXt"), tzone = "UTC")), class = c("tbl_df", 
                                                                                                                                                                                                                  "tbl", "data.frame"), row.names = c(NA, -27L))
    
    A_rows <- data |> filter(userID == "A") |> mutate(userID  = "B") |> 
      select(userID, message_date)
    OUT <- left_join(data, A_rows, 
                     by = join_by(userID, closest(message_date >= message_date)))
    OUT
    #> # A tibble: 27 × 5
    #>    chatID userID message_date.x      need_last_texter_A_me…¹ message_date.y     
    #>     <dbl> <chr>  <dttm>              <dttm>                  <dttm>             
    #>  1      1 A      2023-10-11 14:32:39 NA                      NA                 
    #>  2      1 B      2023-10-11 14:34:14 2023-10-11 14:32:39     2023-10-11 14:32:39
    #>  3      1 A      2023-10-11 14:37:22 NA                      NA                 
    #>  4      1 A      2023-10-11 14:38:48 NA                      NA                 
    #>  5      1 B      2023-10-11 14:42:07 2023-10-11 14:38:48     2023-10-11 14:38:48
    #>  6      1 A      2023-10-11 14:43:58 NA                      NA                 
    #>  7      1 B      2023-10-11 14:45:36 2023-10-11 14:43:58     2023-10-11 14:43:58
    #>  8      1 B      2023-10-11 14:46:11 2023-10-11 14:43:58     2023-10-11 14:43:58
    #>  9      1 A      2023-10-11 14:50:08 NA                      NA                 
    #> 10      1 B      2023-10-11 14:52:17 2023-10-11 14:50:08     2023-10-11 14:50:08
    #> # ℹ 17 more rows
    #> # ℹ abbreviated name: ¹​need_last_texter_A_message_date
    

    Created on 2025-01-27 with reprex v2.1.1