Search code examples
rtime-seriessequencelag

Use previous data to sequentially perform calculations and populate future values by group in R


I have data that looks like:

intial<-
  tibble(
start_date=
  rep(seq.Date(as.Date("2021-06-01"),as.Date("2021-10-01"),by="months"),4),
end_date=
  rep(seq.Date(as.Date("2021-07-01"),as.Date("2021-11-01"),by="months"),4),
id=
  rep(c(rep(1,5),rep(2,5)),2),
group=
  c(rep("a",10),rep("b",10)),
increase=
  c(
    c(4:8),
    c(5:9),
    c(6:10),
    c(7:11)
  ),
decrease=
  c(
    c(1:5),
    c(2:6),
    c(3:7),
    c(4:8)
  ),
start_count=
  c(c(10,13,NA,NA,NA),c(15,18,NA,NA,NA),c(20,23,NA,NA,NA),c(25,28,NA,NA,NA)),
end_count=
  c(c(13,16,NA,NA,NA),c(18,21,NA,NA,NA),c(23,26,NA,NA,NA),c(28,31,NA,NA,NA))
)
print(initial)
 start_date end_date      id group increase decrease start_count end_count
   <date>     <date>     <dbl> <chr>    <int>    <int>       <dbl>     <dbl>
 1 2021-06-01 2021-07-01     1 a            4        1          10        13
 2 2021-07-01 2021-08-01     1 a            5        2          13        16
 3 2021-08-01 2021-09-01     1 a            6        3          NA        NA
 4 2021-09-01 2021-10-01     1 a            7        4          NA        NA
 5 2021-10-01 2021-11-01     1 a            8        5          NA        NA
 6 2021-06-01 2021-07-01     2 a            5        2          15        18
 7 2021-07-01 2021-08-01     2 a            6        3          18        21
 8 2021-08-01 2021-09-01     2 a            7        4          NA        NA
 9 2021-09-01 2021-10-01     2 a            8        5          NA        NA
10 2021-10-01 2021-11-01     2 a            9        6          NA        NA
11 2021-06-01 2021-07-01     1 b            6        3          20        23
12 2021-07-01 2021-08-01     1 b            7        4          23        26
13 2021-08-01 2021-09-01     1 b            8        5          NA        NA
14 2021-09-01 2021-10-01     1 b            9        6          NA        NA
15 2021-10-01 2021-11-01     1 b           10        7          NA        NA
16 2021-06-01 2021-07-01     2 b            7        4          25        28
17 2021-07-01 2021-08-01     2 b            8        5          28        31
18 2021-08-01 2021-09-01     2 b            9        6          NA        NA
19 2021-09-01 2021-10-01     2 b           10        7          NA        NA
20 2021-10-01 2021-11-01     2 b           11        8          NA        NA

My goal is to populate each missing start_count and end_count by sequentially populating the missing start_count with the previous month's end_count, and calculate the subsequent end_count with the newly populated start_count by start_count + increase - decrease within each unique id group.

The final result should look like:

final<-
tibble(
start_date=
  rep(seq.Date(as.Date("2021-06-01"),as.Date("2021-10-01"),by="months"),4),
end_date=
  rep(seq.Date(as.Date("2021-07-01"),as.Date("2021-11-01"),by="months"),4),
id=
  rep(c(rep(1,5),rep(2,5)),2),
group=
  c(rep("a",10),rep("b",10)),
increase=
  c(
    c(4:8),
    c(5:9),
    c(6:10),
    c(7:11)
  ),
decrease=
  c(
    c(1:5),
    c(2:6),
    c(3:7),
    c(4:8)
  ),
start_count=
  c(c(10,13,16,19,22),c(15,18,21,24,27),c(20,23,26,29,32),c(25,28,31,34,37)),
end_count=
  c(c(13,16,19,22,25),c(18,21,24,27,30),c(23,26,29,32,35),c(28,31,34,37,40))
)
print(final)
  start_date end_date      id group increase decrease start_count end_count
   <date>     <date>     <dbl> <chr>    <int>    <int>       <dbl>     <dbl>
 1 2021-06-01 2021-07-01     1 a            4        1          10        13
 2 2021-07-01 2021-08-01     1 a            5        2          13        16
 3 2021-08-01 2021-09-01     1 a            6        3          16        19
 4 2021-09-01 2021-10-01     1 a            7        4          19        22
 5 2021-10-01 2021-11-01     1 a            8        5          22        25
 6 2021-06-01 2021-07-01     2 a            5        2          15        18
 7 2021-07-01 2021-08-01     2 a            6        3          18        21
 8 2021-08-01 2021-09-01     2 a            7        4          21        24
 9 2021-09-01 2021-10-01     2 a            8        5          24        27
10 2021-10-01 2021-11-01     2 a            9        6          27        30
11 2021-06-01 2021-07-01     1 b            6        3          20        23
12 2021-07-01 2021-08-01     1 b            7        4          23        26
13 2021-08-01 2021-09-01     1 b            8        5          26        29
14 2021-09-01 2021-10-01     1 b            9        6          29        32
15 2021-10-01 2021-11-01     1 b           10        7          32        35
16 2021-06-01 2021-07-01     2 b            7        4          25        28
17 2021-07-01 2021-08-01     2 b            8        5          28        31
18 2021-08-01 2021-09-01     2 b            9        6          31        34
19 2021-09-01 2021-10-01     2 b           10        7          34        37
20 2021-10-01 2021-11-01     2 b           11        8          37        40

Thanks!


Solution

  • Here is a solution. There might be better ones.

    library(tidyverse)
    
    initial<-
        tibble(
            start_date=
                rep(seq.Date(as.Date("2021-06-01"),as.Date("2021-10-01"),by="months"),4),
            end_date=
                rep(seq.Date(as.Date("2021-07-01"),as.Date("2021-11-01"),by="months"),4),
            id=
                rep(c(rep(1,5),rep(2,5)),2),
            group=
                c(rep("a",10),rep("b",10)),
            increase=
                c(
                    c(4:8),
                    c(5:9),
                    c(6:10),
                    c(7:11)
                ),
            decrease=
                c(
                    c(1:5),
                    c(2:6),
                    c(3:7),
                    c(4:8)
                ),
            start_count=
                c(c(10,13,NA,NA,NA),c(15,18,NA,NA,NA),c(20,23,NA,NA,NA),c(25,28,NA,NA,NA)),
            end_count=
                c(c(13,16,NA,NA,NA),c(18,21,NA,NA,NA),c(23,26,NA,NA,NA),c(28,31,NA,NA,NA))
        )
    
    initial %>%
        rowid_to_column(var = "obs_id") %>%
        mutate(net = increase - decrease) %>%
        
        select(obs_id, id, group, everything()) %>%
        arrange(id, group, start_date) %>%
        
        group_by(id, group) %>%
        mutate(end_count = cumsum(net) + first(start_count)) %>%
        mutate(start_count = end_count - net) %>%
        ungroup() %>%
        
        arrange(obs_id) %>%
        select(-net)
    #> # A tibble: 20 × 9
    #>    obs_id    id group start_date end_date   increase decrease start_count
    #>     <int> <dbl> <chr> <date>     <date>        <int>    <int>       <dbl>
    #>  1      1     1 a     2021-06-01 2021-07-01        4        1          10
    #>  2      2     1 a     2021-07-01 2021-08-01        5        2          13
    #>  3      3     1 a     2021-08-01 2021-09-01        6        3          16
    #>  4      4     1 a     2021-09-01 2021-10-01        7        4          19
    #>  5      5     1 a     2021-10-01 2021-11-01        8        5          22
    #>  6      6     2 a     2021-06-01 2021-07-01        5        2          15
    #>  7      7     2 a     2021-07-01 2021-08-01        6        3          18
    #>  8      8     2 a     2021-08-01 2021-09-01        7        4          21
    #>  9      9     2 a     2021-09-01 2021-10-01        8        5          24
    #> 10     10     2 a     2021-10-01 2021-11-01        9        6          27
    #> 11     11     1 b     2021-06-01 2021-07-01        6        3          20
    #> 12     12     1 b     2021-07-01 2021-08-01        7        4          23
    #> 13     13     1 b     2021-08-01 2021-09-01        8        5          26
    #> 14     14     1 b     2021-09-01 2021-10-01        9        6          29
    #> 15     15     1 b     2021-10-01 2021-11-01       10        7          32
    #> 16     16     2 b     2021-06-01 2021-07-01        7        4          25
    #> 17     17     2 b     2021-07-01 2021-08-01        8        5          28
    #> 18     18     2 b     2021-08-01 2021-09-01        9        6          31
    #> 19     19     2 b     2021-09-01 2021-10-01       10        7          34
    #> 20     20     2 b     2021-10-01 2021-11-01       11        8          37
    #> # … with 1 more variable: end_count <dbl>
    

    Created on 2021-08-11 by the reprex package (v2.0.0)