Search code examples
rdplyr

Group temporally contiguous values


I'm trying to group Values that are temporally contiguous. However, all I can achieve is flag those Values that are contiguous (with a "yes"). The problem here is that two different groups may end up with consecutive "yes" and are hence indistinguishable:

df %>%
  mutate(contiguous = ifelse(Endtime_ms == lead(Starttime_ms)|Starttime_ms == lag(Endtime_ms), "yes", "no"),
         grp = consecutive_id(contiguous)
  ) 
# A tibble: 20 × 5
   Value                            Starttime_ms Endtime_ms contiguous   grp
   <chr>                                   <dbl>      <dbl> <chr>      <int>
 1 "on this"                                 210        780 NA             1
 2 "okay"                                   3403       3728 no             2
 3 "cool thanks everyone um"                4221       5880 no             2
 4 "so yes in"                              5910       6900 yes            3 # one group
 5 "terms of our"                           6900       8370 yes            3 # one group
 6 "partnership"                            8370       8970 yes            3 # one group
 7 "projects"                               8970       9480 yes            3 # one group
 8 "what have we"                           9510      10080 yes            3 # another group
 9 "got on the"                            10080      11293 yes            3 # another group
10 "horizon? "                             11293      11960 yes            3 # another group
11 "let's have a look so the"              11980      13740 no             4
12 "LGBTQ plus"                            13813      16110 no             4
13 "city labs"                             16260      17070 yes            5
14 "have now"                              17070      17910 yes            5
15 "been um"                               17940      19320 no             6
16 "agreed in"                             19350      20190 yes            7
17 "terms of the"                          20190      20760 yes            7
18 "date so"                               20760      21330 yes            7
19 "we're looking at the fifteenth"        21330      22530 yes            7
20 "sixteenth"                             22860      23490 NA             8

The desired output is this:

   Value                            Starttime_ms Endtime_ms contiguous   grp
   <chr>                                   <dbl>      <dbl> <chr>      <int>
 1 "on this"                                 210        780 NA             1
 2 "okay"                                   3403       3728 no             2
 3 "cool thanks everyone um"                4221       5880 no             2
 4 "so yes in"                              5910       6900 yes            3 
 5 "terms of our"                           6900       8370 yes            3 
 6 "partnership"                            8370       8970 yes            3 
 7 "projects"                               8970       9480 yes            3 
 8 "what have we"                           9510      10080 yes            4
 9 "got on the"                            10080      11293 yes            4
10 "horizon? "                             11293      11960 yes            4
11 "let's have a look so the"              11980      13740 no             4
12 "LGBTQ plus"                            13813      16110 no             5
13 "city labs"                             16260      17070 yes            6
14 "have now"                              17070      17910 yes            6
15 "been um"                               17940      19320 no             7
16 "agreed in"                             19350      20190 yes            8
17 "terms of the"                          20190      20760 yes            8
18 "date so"                               20760      21330 yes            8
19 "we're looking at the fifteenth"        21330      22530 yes            8
20 "sixteenth"                             22860      23490 NA             9

Data:

df <- structure(list(Value = c("on this", "okay", "cool thanks everyone um", 
                               "so yes in", "terms of our", "partnership", "projects", "what have we", 
                               "got on the", "horizon? ", "let's have a look so the", "LGBTQ plus", 
                               "city labs", "have now", "been um", "agreed in", "terms of the", 
                               "date so", "we're looking at the fifteenth", "sixteenth"), Starttime_ms = c(210, 
                                                                                                           3403, 4221, 5910, 6900, 8370, 8970, 9510, 10080, 11293, 11980, 
                                                                                                           13813, 16260, 17070, 17940, 19350, 20190, 20760, 21330, 22860
                               ), Endtime_ms = c(780, 3728, 5880, 6900, 8370, 8970, 9480, 10080, 
                                                 11293, 11960, 13740, 16110, 17070, 17910, 19320, 20190, 20760, 
                                                 21330, 22530, 23490)), row.names = c(NA, -20L), class = c("tbl_df", 
                                                                                                           "tbl", "data.frame"))

Solution

  • On top of what you have already, you can use igraph package for clustering and assigning group labels lbl for intermediate tagging, then update grp values according to the contiguous status

    library(igraph)
    df %>%
       mutate(
          contiguous = ifelse(Endtime_ms == lead(Starttime_ms) | Starttime_ms == lag(Endtime_ms), "yes", "no"),
          grp = consecutive_id(contiguous)
       ) %>%
       mutate(
          lbl = {
             # create a graph where the edges are built based on `Starttime_ms` and `Endtime_ms`. The vertices are clustered if the edges are linked. 
             graph_from_data_frame(df[-1]) %>%
                components() %>%
                membership()
          }[as.character(Starttime_ms)]
       ) %>%
       # refresh `lbl`, so `lbl` collapse into one cluster if the associated `grp` value is `no`; Otherwise, we retain the `lbl`
       mutate(lbl = coalesce(ifelse(contiguous == "yes", lbl, min(lbl)), lbl), .by = grp) %>%
       # re-calculate the grouping info based on updated `lbl`
       mutate(grp = consecutive_id(lbl)) %>%
       # remove auxiliary variable `lbl`
       select(-lbl)
    

    which gives

    # A tibble: 20 × 5
       Value                            Starttime_ms Endtime_ms contiguous   grp
       <chr>                                   <dbl>      <dbl> <chr>      <int>
     1 "on this"                                 210        780 NA             1
     2 "okay"                                   3403       3728 no             2
     3 "cool thanks everyone um"                4221       5880 no             2
     4 "so yes in"                              5910       6900 yes            3
     5 "terms of our"                           6900       8370 yes            3
     6 "partnership"                            8370       8970 yes            3
     7 "projects"                               8970       9480 yes            3
     8 "what have we"                           9510      10080 yes            4
     9 "got on the"                            10080      11293 yes            4
    10 "horizon? "                             11293      11960 yes            4
    11 "let's have a look so the"              11980      13740 no             5
    12 "LGBTQ plus"                            13813      16110 no             5
    13 "city labs"                             16260      17070 yes            6
    14 "have now"                              17070      17910 yes            6
    15 "been um"                               17940      19320 no             7
    16 "agreed in"                             19350      20190 yes            8
    17 "terms of the"                          20190      20760 yes            8
    18 "date so"                               20760      21330 yes            8
    19 "we're looking at the fifteenth"        21330      22530 yes            8
    20 "sixteenth"                             22860      23490 NA             9