Search code examples
rtime-seriesshuffleoverlap

How can I shuffle time blocks in time series without overlap in R?


Let's assume there is a group of 3 persons, for which I have a time series of when they start and finish an activity. An example dataframe would be:

library(tidyverse)
GrXX <- tibble(Individual = rep(c("A", "B", "C"), times = c(2, 3, 5)),
               Frame_beginning = c(1, 16, 7, 21, 29, 3, 9, 12, 19, 27),
               Frame_end = c(3, 22, 15, 24, 30, 7, 10, 12, 24, 30),
               Duration = Frame_end - Frame_beginning + 1,
               Frame_end_valid_group = 30)

This group can be represented by the following figure:enter image description here

I would love to shuffle this group around, so that the time blocks within individuals are randomly placed, without overlap within individuals (there can be overlap between individuals). Additonally, all original time blocks should also be fully on the shuffled timeline (i.e., they should not start or finish outside of the original timeline). One potential shuffling could be the following: enter image description here

My question is how to do that in R, in an efficient way. I started some code, but it seems very complex for the task, and does not prevent all overlaps. Here it is:

GrXX_shuffled <- tibble()

for (i in c("A", "B", "C")) {
  temp <- GrXX %>%
    filter(Individual == i) %>%
    arrange(-Duration, Frame_beginning)
  
  valid_frames <- c()
  Frame_shuffled_start <- c()
  Frame_shuffled_end <- c()
  Frame_earliest <- 1
  
  for (j in 1:nrow(temp)) {
    temp2 <- temp[j, ]
    
    if(j == 1) {
    valid_frames <- 1:(temp2$Frame_end_valid_group - temp2$Duration + 1)
    
    temp2 <- temp2 %>%
      mutate(Frame_beginning_shuffled = sample(valid_frames, size = 1), .after = Frame_beginning) %>%
      mutate(Frame_end_shuffled = Frame_beginning_shuffled + Duration - 1, .after = Frame_end)
    
    GrXX_shuffled <- bind_rows(GrXX_shuffled, temp2)
    }

    if(j != 1) {
      valid_frames <- c(Frame_earliest:(Frame_shuffled_start[j-1] - temp2$Duration - 1),
                        (Frame_shuffled_end[j-1] + 2):(temp2$Frame_end_valid_group - temp2$Duration + 1))
      
      valid_frames <- valid_frames[valid_frames >= 1 & valid_frames <= 30]
      
      temp2 <- temp2 %>%
        mutate(Frame_beginning_shuffled = sample(valid_frames, size = 1), .after = Frame_beginning) %>%
        mutate(Frame_end_shuffled = Frame_beginning_shuffled + Duration - 1, .after = Frame_end)
      
      GrXX_shuffled <- bind_rows(GrXX_shuffled, temp2)
    }
    
    Frame_shuffled_start <- sort(c(Frame_shuffled_start, temp2$Frame_beginning_shuffled))
    Frame_shuffled_end <- sort(c(Frame_shuffled_end, temp2$Frame_end_shuffled))
    Frame_earliest <- min(valid_frames_2)
  }
}

The idea behind the code is to:

  • for each individual select the block with the longest duration and shuffle it around on the empty timeline (and it should not start too late so that the full block is still fully on the timeline)
  • then go for the next longest block (if it exists) and place it in the rest of the timeline, without overlap with any previous already-shuffled blocks
  • repeat until no more blocks have to be shuffled

Any idea how to solve this efficiently?

Many thanks in advance :-)


Solution

  • Here's an efficient function to do what was described. It assumes discrete timesteps as in the OP's example.

    library(data.table)
    library(RcppAlgos) # for `compositionsSample`
    library(Rfast) # for `colShuffle` and `colCumSums`
    
    shuffle <- function(dt, n = 1L) {
      f <- function(v, m) {
        k <- length(v)
        k1 <- k + 1L
        as.data.table(
          cbind(
            rep = rep(1:n, each = k),
            matrix(
              colCumSums(
                `dim<-`(
                  rbind(
                    `dim<-`(t(compositionsSample(0:(m - sum(v)), k + 1, TRUE,
                                                 n = n)[,-k - 1]), NULL),
                    `dim<-`(colShuffle(matrix(v, k, n)), NULL)
                  ), c(2*k, n)
                )
              ) + 1:0,
              n*k, 2, TRUE, list(NULL, c("Frame_beginning", "Frame_end"))
            )
          )
        )[,`:=`(Duration = Frame_end - Frame_beginning + 1,
                Frame_end_valid_group = m)]
      }
      setorder(
        setcolorder(dt[,f(Duration, Frame_end_valid_group[1]), Individual], "rep"),
        rep, Individual
      )
    }
    

    Demonstrating:

    shuffle(setDT(GrXX))[]
    #>       rep Individual Frame_beginning Frame_end Duration Frame_end_valid_group
    #>     <num>     <char>           <num>     <num>    <num>                 <num>
    #>  1:     1          A               3         9        7                    30
    #>  2:     1          A              27        29        3                    30
    #>  3:     1          B               2         3        2                    30
    #>  4:     1          B               5         8        4                    30
    #>  5:     1          B              21        29        9                    30
    #>  6:     1          C               1         5        5                    30
    #>  7:     1          C               8         8        1                    30
    #>  8:     1          C              12        13        2                    30
    #>  9:     1          C              17        22        6                    30
    #> 10:     1          C              26        29        4                    30
    

    It can also do multiple shuffles using the n argument:

    shuffle(GrXX, 3)[]
    #>       rep Individual Frame_beginning Frame_end Duration Frame_end_valid_group
    #>     <num>     <char>           <num>     <num>    <num>                 <num>
    #>  1:     1          A              17        23        7                    30
    #>  2:     1          A              25        27        3                    30
    #>  3:     1          B               5         8        4                    30
    #>  4:     1          B              16        17        2                    30
    #>  5:     1          B              19        27        9                    30
    #>  6:     1          C               4         4        1                    30
    #>  7:     1          C               6         9        4                    30
    #>  8:     1          C              11        12        2                    30
    #>  9:     1          C              15        20        6                    30
    #> 10:     1          C              25        29        5                    30
    #> 11:     2          A               8        10        3                    30
    #> 12:     2          A              19        25        7                    30
    #> 13:     2          B               5         6        2                    30
    #> 14:     2          B               8        16        9                    30
    #> 15:     2          B              20        23        4                    30
    #> 16:     2          C               3         7        5                    30
    #> 17:     2          C              11        12        2                    30
    #> 18:     2          C              14        17        4                    30
    #> 19:     2          C              19        19        1                    30
    #> 20:     2          C              21        26        6                    30
    #> 21:     3          A              12        14        3                    30
    #> 22:     3          A              23        29        7                    30
    #> 23:     3          B               4         5        2                    30
    #> 24:     3          B              10        13        4                    30
    #> 25:     3          B              15        23        9                    30
    #> 26:     3          C               2         7        6                    30
    #> 27:     3          C               9        10        2                    30
    #> 28:     3          C              12        16        5                    30
    #> 29:     3          C              19        19        1                    30
    #> 30:     3          C              23        26        4                    30
    

    And it's fast:

    microbenchmark::microbenchmark(shuffle(GrXX, 100))
    #> Unit: milliseconds
    #>                expr    min     lq     mean  median      uq    max neval
    #>  shuffle(GrXX, 100) 1.9616 2.1486 2.343154 2.22235 2.36755 5.8275   100