Search code examples
rtidyverse

Renumber sequence in spell dataset


df <- structure(list(ID = c(2L, 2L, 13L, 13L, 13L, 21L, 21L, 21L, 24L, 
                            24L, 24L, 24L), mignr = c(1L, 0L, 1L, 2L, 0L, 0L, 2L, 1L, 2L, 
                                                      3L, 0L, 1L), start = c(1387L, 903L, 1357L, 1391L, 1087L, 936L, 
                                                                             1367L, 1354L, 1363L, 1392L, 908L, 1361L), end = c(1401L, 1386L, 
                                                                                                                               1390L, 1401L, 1356L, 1353L, 1399L, 1366L, 1391L, 1400L, 1360L, 
                                                                                                                               1362L), staytime = c(15L, 484L, 34L, 11L, 270L, 418L, 33L, 13L, 
                                                                                                                                                    29L, 9L, 453L, 2L)), row.names = c(NA, -12L), class = "data.frame")

My objective is to reset the sequences in this spell dataset so that they all restart at zero and for mignr greater than one increase by staytime. The data is grouped by IDs and mignr represents the sequence in the spell. I have set start == 0 end == 1 for the first spell (mignr == 0) and would like for each subsequent spell that

  • start is the previous end + 1
  • and end is start + (staytime - 1)
df <- df %>% 
  mutate(start = ifelse(mignr == 0, 0, start)) %>% 
  mutate(end = ifelse(mignr == 0, 1, end)) 


max_spell <- max(df$mignr)

for(i in seq_along(1:max_spell)){
  
  j <- i-1
  df <- df %>% 
    mutate(start = ifelse(mignr == i, (df[which(df$ID == ID & df$mignr == j),4]), start)) %>% 
    mutate(end = ifelse(mignr == i, start + (staytime - 1), end)) 
  
  
}
df

My attempt seems to work for mignr = 1 the start and end values are both as I would expect. However for subsequent values of mignr (>1) the start value is no longer what I would expect it to be. The way I am currently calculating the start is not very clean, but I can't quite tell why for subsequent iterations it is breaking.


Solution

  • I would use cumsum() on staytime, but for this to work, I'd change the first staytime of mignr == 0 to 1, since that is basically what setting the first start and end to 0 and 1 implies. Would that be acceptable?

    library(tidyverse)
    
    df %>% 
      group_by(ID) %>% 
      arrange(mignr, ,.by_group = TRUE) %>% 
      mutate(start = (row_number()- 1L),
             staytime = ifelse(row_number() == 1, 1, staytime),
             end = cumsum(staytime),
             start = lag(end + 1, default = first(start)))
    
    #> # A tibble: 12 x 5
    #> # Groups:   ID [4]
    #>       ID mignr start   end staytime
    #>    <int> <int> <dbl> <dbl>    <dbl>
    #>  1     2     0     0     1        1
    #>  2     2     1     2    16       15
    #>  3    13     0     0     1        1
    #>  4    13     1     2    35       34
    #>  5    13     2    36    46       11
    #>  6    21     0     0     1        1
    #>  7    21     1     2    14       13
    #>  8    21     2    15    47       33
    #>  9    24     0     0     1        1
    #> 10    24     1     2     3        2
    #> 11    24     2     4    32       29
    #> 12    24     3    33    41        9
    

    Data from OP

    df <- structure(list(ID = c(2L, 2L, 13L, 13L, 13L, 21L, 21L, 21L, 24L, 
                                24L, 24L, 24L), mignr = c(1L, 0L, 1L, 2L, 0L, 0L, 2L, 1L, 2L, 
                                                          3L, 0L, 1L), start = c(1387L, 903L, 1357L, 1391L, 1087L, 936L, 
                                                                                 1367L, 1354L, 1363L, 1392L, 908L, 1361L), end = c(1401L, 1386L, 
                                                                                                                                   1390L, 1401L, 1356L, 1353L, 1399L, 1366L, 1391L, 1400L, 1360L, 
                                                                                                                                   1362L), staytime = c(15L, 484L, 34L, 11L, 270L, 418L, 33L, 13L, 
                                                                                                                                                        29L, 9L, 453L, 2L)), row.names = c(NA, -12L), class = "data.frame")
    

    Created on 2023-02-20 by the reprex package (v2.0.1)