Search code examples
rsum

Summation over different periods and ranges of values


I would like to ask a question about how to sum over different periods.

I have the following table. To describe the table briefly, whenever event_date is not NA, the previous 3 days and post 3 days values are recorded, and every -3:3 (7 rows) are grouped by group_id.

input<-structure(list(ID = c(10001L, 10001L, 10001L, 10001L, 10001L, 
                      10001L, 10001L, 10025L, 10025L, 10025L, 10025L, 10025L, 10025L, 
                      10025L, 10025L, 10025L, 10025L, 10025L, 10025L, 10025L, 10025L, 
                      10092L, 10092L, 10092L, 10092L, 10092L, 10092L, 10092L), date = structure(c(16150L, 
                                                                                                  16153L, 16154L, 16155L, 16156L, 16157L, 16160L, 14848L, 14851L, 
                                                                                                  14852L, 14853L, 14854L, 14855L, 14859L, 17035L, 17036L, 17037L, 
                                                                                                  17037L, 17038L, 17039L, 17042L, 14726L, 14727L, 14728L, 14729L, 
                                                                                                  14732L, 14733L, 14734L), class = c("IDate", "Date")), eventdate = structure(c(NA, 
                                                                                                                                                                                NA, NA, 16155L, NA, NA, NA, NA, NA, NA, 14853L, NA, NA, NA, NA, 
                                                                                                                                                                                NA, NA, 17037L, NA, NA, NA, NA, NA, NA, 14729L, NA, NA, NA), class = c("IDate", 
                                                                                                                                                                                                                                                       "Date")), value = c(0.58810503, 0.44773911, 0.49282514, 0.17219446, 
                                                                                                                                                                                                                                                                           0.44064636, 0.1119334, 0.00571012, 0.38461538, 0.2267658, 1, 
                                                                                                                                                                                                                                                                           1, 1, 0.2, 1, 0.44805876, 0.01423221, 0.17121588, 0.17121588, 
                                                                                                                                                                                                                                                                           0.60373716, 0.11775098, 0.26185567, 0.36933798, 0.39393939, 1, 
                                                                                                                                                                                                                                                                           0.66666667, 0.53846154, 0.31428571, 0.18), order = c(-3L, -2L, 
                                                                                                                                                                                                                                                                                                                                -1L, 0L, 1L, 2L, 3L, -3L, -2L, -1L, 0L, 1L, 2L, 3L, -3L, -2L, 
                                                                                                                                                                                                                                                                                                                                -1L, 0L, 1L, 2L, 3L, -3L, -2L, -1L, 0L, 1L, 2L, 3L), `group-id` = c(1L, 
                                                                                                                                                                                                                                                                                                                                                                                                    1L, 1L, 1L, 1L, 1L, 1L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 3L, 3L, 3L, 
                                                                                                                                                                                                                                                                                                                                                                                                    3L, 3L, 3L, 3L, 4L, 4L, 4L, 4L, 4L, 4L, 4L)), row.names = c(NA, 
                                                                                                                                                                                                                                                                                                                                                                                                                                                                -28L), class = c("data.table", "data.frame"))


+-------+------------+------------+------------+-------+----------+
|  ID   |    date    | eventdate  |   value    | order | group-id |
+-------+------------+------------+------------+-------+----------+
| 10001 | 2014-03-21 |            | 0.58810503 |    -3 |        1 |
| 10001 | 2014-03-24 |            | 0.44773911 |    -2 |        1 |
| 10001 | 2014-03-25 |            | 0.49282514 |    -1 |        1 |
| 10001 | 2014-03-26 | 2014-03-26 | 0.17219446 |     0 |        1 |
| 10001 | 2014-03-27 |            | 0.44064636 |     1 |        1 |
| 10001 | 2014-03-28 |            |  0.1119334 |     2 |        1 |
| 10001 | 2014-03-31 |            | 0.00571012 |     3 |        1 |
| 10025 | 2010-08-27 |            | 0.38461538 |    -3 |        2 |
| 10025 | 2010-08-30 |            |  0.2267658 |    -2 |        2 |
| 10025 | 2010-08-31 |            |          1 |    -1 |        2 |
| 10025 | 2010-09-01 | 2010-09-01 |          1 |     0 |        2 |
| 10025 | 2010-09-02 |            |          1 |     1 |        2 |
| 10025 | 2010-09-03 |            |        0.2 |     2 |        2 |
| 10025 | 2010-09-07 |            |          1 |     3 |        2 |
| 10025 | 2016-08-22 |            | 0.44805876 |    -3 |        3 |
| 10025 | 2016-08-23 |            | 0.01423221 |    -2 |        3 |
| 10025 | 2016-08-24 |            | 0.17121588 |    -1 |        3 |
| 10025 | 2016-08-24 | 2016-08-24 | 0.17121588 |     0 |        3 |
| 10025 | 2016-08-25 |            | 0.60373716 |     1 |        3 |
| 10025 | 2016-08-26 |            | 0.11775098 |     2 |        3 |
| 10025 | 2016-08-29 |            | 0.26185567 |     3 |        3 |
| 10092 | 2010-04-27 |            | 0.36933798 |    -3 |        4 |
| 10092 | 2010-04-28 |            | 0.39393939 |    -2 |        4 |
| 10092 | 2010-04-29 |            |          1 |    -1 |        4 |
| 10092 | 2010-04-30 | 2010-04-30 | 0.66666667 |     0 |        4 |
| 10092 | 2010-05-03 |            | 0.53846154 |     1 |        4 |
| 10092 | 2010-05-04 |            | 0.31428571 |     2 |        4 |
| 10092 | 2010-05-05 |            |       0.18 |     3 |        4 |
+-------+------------+------------+------------+-------+----------+

Based on the above table, I would like to make a new table that sums the "value" column by each group_id, but sums the previous 3 days from the event date and the post 3 days from the event date separately. In the new table, only the rows -1:1 around the eventdate remain. For example, the first row of the new column "sumvalue" would be the first sum of the first three rows of "value" column from the input table. Also, the third row (second row is NA) of the column "sumvalue" would be the sum of the three rows of "value" column right after the first eventdate from the input table.

Below would be the desired output.

output<-structure(list(ID = c(10001L, 10001L, 10001L, 10025L, 10025L, 
                      10025L, 10025L, 10025L, 10025L, 10092L, 10092L, 10092L), date = structure(c(16154L, 
                                                                                                  16155L, 16156L, 14852L, 14853L, 14854L, 17037L, 17037L, 17038L, 
                                                                                                  14728L, 14729L, 14732L), class = c("IDate", "Date")), eventdate = structure(c(NA, 
                                                                                                                                                                                16155L, NA, NA, 14853L, NA, NA, 17037L, NA, NA, 14729L, NA), class = c("IDate", 
                                                                                                                                                                                                                                                       "Date")), value = c(0.49282514, 0.17219446, 0.44064636, 1, 1, 
                                                                                                                                                                                                                                                                           1, 0.17121588, 0.17121588, 0.60373716, 1, 0.66666667, 0.53846154
                                                                                                                                                                                                                                                       ), order = c(-1L, 0L, 1L, -1L, 0L, 1L, -1L, 0L, 1L, -1L, 0L, 
                                                                                                                                                                                                                                                                    1L), `group-id` = c(1L, 1L, 1L, 2L, 2L, 2L, 3L, 3L, 3L, 4L, 4L, 
                                                                                                                                                                                                                                                                                        4L), sumvalue = c(1.52866928, NA, 0.55828988, 1.61138118, NA, 
                                                                                                                                                                                                                                                                                                          2.2, 0.63350685, NA, 0.98334381, 1.76327737, NA, 1.03274725)), row.names = c(NA, 
                                                                                                                                                                                                                                                                                                                                                                                       -12L), class = c("data.table", "data.frame"))



+-------+------------+------------+------------+-------+----------+------------+
|  ID   |    date    | eventdate  |   value    | order | group-id |  sumvalue  |
+-------+------------+------------+------------+-------+----------+------------+
| 10001 | 2014-03-25 |            | 0.49282514 |    -1 |        1 | 1.52866928 |
| 10001 | 2014-03-26 | 2014-03-26 | 0.17219446 |     0 |        1 |            |
| 10001 | 2014-03-27 |            | 0.44064636 |     1 |        1 | 0.55828988 |
| 10025 | 2010-08-31 |            |          1 |    -1 |        2 | 1.61138118 |
| 10025 | 2010-09-01 | 2010-09-01 |          1 |     0 |        2 |            |
| 10025 | 2010-09-02 |            |          1 |     1 |        2 |        2.2 |
| 10025 | 2016-08-24 |            | 0.17121588 |    -1 |        3 | 0.63350685 |
| 10025 | 2016-08-24 | 2016-08-24 | 0.17121588 |     0 |        3 |            |
| 10025 | 2016-08-25 |            | 0.60373716 |     1 |        3 | 0.98334381 |
| 10092 | 2010-04-29 |            |          1 |    -1 |        4 | 1.76327737 |
| 10092 | 2010-04-30 | 2010-04-30 | 0.66666667 |     0 |        4 |            |
| 10092 | 2010-05-03 |            | 0.53846154 |     1 |        4 | 1.03274725 |
+-------+------------+------------+------------+-------+----------+------------+

I tried to solve this problem by first finding out event_date that has no NA by using

a<-which(is.na(input$event_date) == FALSE)-3
b<-which(is.na(input$event_date) == FALSE)-1
c<-which(is.na(input$event_date) == FALSE)+1
d<-which(is.na(input$event_date) == FALSE)+3

output<-input%>%
  group_by(group_id)%>%
  mutate(sumvalue=ifelse(order<0, sum(value[a:b], ifelse(order>0, sum(value[c:d],NA)))))

But I got stuck in the summation part.


Solution

  • One way of doing this is by also grouping on an order_id:

    output <- input %>% 
      mutate(order_id = ifelse(order == 0, NA, order < 0)) %>% 
      group_by(`group-id`, order_id) %>% 
      mutate(sumvalue = ifelse(order == 0, NA, sum(value))) %>% 
      filter(order %in% -1:1)
    

    which yields your desired output:

    > head(output)
    # A tibble: 6 x 8
    # Groups:   group-id, order_id [6]
         ID date       eventdate  value order `group-id` order_id sumvalue
      <int> <date>     <date>     <dbl> <int>      <int> <lgl>       <dbl>
    1 10001 2014-03-25 NA         0.493    -1          1 TRUE        1.53 
    2 10001 2014-03-26 2014-03-26 0.172     0          1 NA         NA    
    3 10001 2014-03-27 NA         0.441     1          1 FALSE       0.558
    4 10025 2010-08-31 NA         1        -1          2 TRUE        1.61 
    5 10025 2010-09-01 2010-09-01 1         0          2 NA         NA    
    6 10025 2010-09-02 NA         1         1          2 FALSE       2.2