Search code examples
rdatetimegroup-by

How to group by similar datetime?


I'm trying to group together transactions that are between a period of one hour... so I'm not getting anything. follow the example below

structure(list(`data hora` = c("01/10/2023 01:00", "01/10/2023 01:32", 
"01/10/2023 01:45", "04/10/2023 03:10", "04/10/2023 03:12", "06/10/2023 01:00", 
"01/10/2023 01:00", "01/10/2023 01:32", "02/10/2023 01:45"), 
    user = c("12345ag", "12345ag", "12345ag", "12345ag", "12345ag", 
    "12345ag", "4569lç", "4569lç", "4569lç"), 
    id_transacion = c(1, 2, 3, 4, 5, 6, 7, 8, 9)))

I would like the outcome to look like this

enter image description here


Solution

  • mydf <- structure(list(`data hora` = c("01/10/2023 01:00", "01/10/2023 01:32", 
    "01/10/2023 01:45", "04/10/2023 03:10", "04/10/2023 03:12", "06/10/2023 01:00", 
    "01/10/2023 01:00", "01/10/2023 01:32", "02/10/2023 01:45"), 
        user = c("12345ag", "12345ag", "12345ag", "12345ag", "12345ag", 
        "12345ag", "4569lç", "4569lç", "4569lç"), 
        id_transacion = c(1, 2, 3, 4, 5, 6, 7, 8, 9)))
    
    mydf <- as.data.frame(mydf)
    
    library(lubridate)
    library(dplyr)
    library(tidyr)
    
    mydf |> 
      mutate(data.hora = dmy_hm(data.hora)) |> 
      group_by(user) |> 
      mutate(check = if_else(row_number() == 1, 1, NA_integer_),
             check_time = if_else(abs(data.hora - lag(data.hora)) < 60, lag(data.hora), NA_Date_),
             check = if_else(is.na(check_time), row_number(), lag(check))) |> 
      fill(check, .direction = "down") |> 
      group_by(user, check) |> 
      mutate(max_data_hora = last(data.hora),
             max_id_transacion = last(id_transacion)) |> 
      filter(row_number() == 1) |> 
      ungroup() |> 
      select(min_data_hora = data.hora, max_data_hora, user, min_id_transacion = id_transacion, max_id_transacion)
    
    # A tibble: 5 × 5
      min_data_hora       max_data_hora       user    min_id_transacion max_id_transacion
      <dttm>              <dttm>              <chr>               <dbl>             <dbl>
    1 2023-10-01 01:00:00 2023-10-01 01:45:00 12345ag                 1                 3
    2 2023-10-04 03:10:00 2023-10-04 03:12:00 12345ag                 4                 5
    3 2023-10-06 01:00:00 2023-10-06 01:00:00 12345ag                 6                 6
    4 2023-10-01 01:00:00 2023-10-01 01:32:00 4569lç                  7                 8
    5 2023-10-02 01:45:00 2023-10-02 01:45:00 4569lç                  9                 9
    

    There's likely a much more efficient method than what I strung up together, but it works with the data at hand.