Search code examples
rdatedate-difference

Group by date difference in R


I want to group within a group by date difference.

For example, if there are 7 cases in facility A, but the first 5 cases happened before 14 days of the last 2 cases, I want them to be in two different groups (see below example)

location address start_date start_date_diff Group
Facility A 123 main st 2/7/2022 0 1
Facility A 123 main st 2/11/2022 4 1
Facility A 123 main st 2/11/2022 0 1
Facility A 123 main st 2/11/2022 0 1
Facility A 123 main st 2/12/2022 1 1
Facility A 123 main st 3/12/2022 28 2
Facility A 123 main st 3/17/2022 5 2
Facility B 55 ford rd 3/16/2022 0 3
Facility B 55 ford rd 3/16/2022 0 3
Facility C 1 step ave 3/16/2022 0 4
Facility C 1 step ave 3/20/2022 4 4
Facility C 1 step ave 3/22/2022 2 4

here is my code so far:

I am stuck on how to group them further by the date difference between individual observations.


Solution

  • Assuming we don't already diff calculated, and that we need to convert start_date into something arithmetically useful.

    data.table

    library(data.table)
    as.data.table(dat)[, start_date := as.Date(start_date, format = "%m/%d/%Y")
      ][, diff14 := cumsum(c(0, diff(start_date)) > 14), by = location
      ][, Group2 := rleid(location, diff14)][]
    #       location     address start_date start_date_diff Group diff14 Group2
    #         <char>      <char>     <Date>           <int> <int>  <int>  <int>
    #  1: Facility A 123 main st 2022-02-07               0     1      0      1
    #  2: Facility A 123 main st 2022-02-11               4     1      0      1
    #  3: Facility A 123 main st 2022-02-11               0     1      0      1
    #  4: Facility A 123 main st 2022-02-11               0     1      0      1
    #  5: Facility A 123 main st 2022-02-12               1     1      0      1
    #  6: Facility A 123 main st 2022-03-12              28     2      1      2
    #  7: Facility A 123 main st 2022-03-17               5     2      1      2
    #  8: Facility B  55 ford rd 2022-03-16               0     3      0      3
    #  9: Facility B  55 ford rd 2022-03-16               0     3      0      3
    # 10: Facility C  1 step ave 2022-03-16               0     4      0      4
    # 11: Facility C  1 step ave 2022-03-20               4     4      0      4
    # 12: Facility C  1 step ave 2022-03-22               2     4      0      4
    

    dplyr

    library(dplyr)
    dat %>%
      mutate(start_date = as.Date(start_date, format = "%m/%d/%Y")) %>%
      group_by(location) %>%
      mutate(diff14 = cumsum(c(0, diff(start_date)) > 14)) %>%
      group_by(location, diff14) %>%
      mutate(Group2 = cur_group_id()) %>%
      ungroup()
    # # A tibble: 12 x 7
    #    location   address     start_date start_date_diff Group diff14 Group2
    #    <chr>      <chr>       <date>               <int> <int>  <int>  <int>
    #  1 Facility A 123 main st 2022-02-07               0     1      0      1
    #  2 Facility A 123 main st 2022-02-11               4     1      0      1
    #  3 Facility A 123 main st 2022-02-11               0     1      0      1
    #  4 Facility A 123 main st 2022-02-11               0     1      0      1
    #  5 Facility A 123 main st 2022-02-12               1     1      0      1
    #  6 Facility A 123 main st 2022-03-12              28     2      1      2
    #  7 Facility A 123 main st 2022-03-17               5     2      1      2
    #  8 Facility B 55 ford rd  2022-03-16               0     3      0      3
    #  9 Facility B 55 ford rd  2022-03-16               0     3      0      3
    # 10 Facility C 1 step ave  2022-03-16               0     4      0      4
    # 11 Facility C 1 step ave  2022-03-20               4     4      0      4
    # 12 Facility C 1 step ave  2022-03-22               2     4      0      4
    

    Data

    read.md <- structure(list(location = c("Facility A", "Facility A", "Facility A", "Facility A", "Facility A", "Facility A", "Facility A", "Facility B", "Facility B", "Facility C", "Facility C", "Facility C"), address = c("123 main st", "123 main st", "123 main st", "123 main st", "123 main st", "123 main st", "123 main st", "55 ford rd", "55 ford rd", "1 step ave", "1 step ave", "1 step ave"), start_date = c("2/7/2022", "2/11/2022", "2/11/2022", "2/11/2022", "2/12/2022", "3/12/2022", "3/17/2022", "3/16/2022",  "3/16/2022", "3/16/2022", "3/20/2022", "3/22/2022"), start_date_diff = c(0L, 4L, 0L, 0L, 1L, 28L, 5L, 0L, 0L, 0L, 4L, 2L), Group = c(1L, 1L, 1L, 1L, 1L, 2L, 2L, 3L, 3L, 4L, 4L, 4L)), class = "data.frame", row.names = c(NA, -12L))