Search code examples
rdategroup-bycumsum

Calculating cumsum of mortality before a specific date in R


I need help writing either a for loop or dply code for two things:

  1. Calculating the cumsum of mortality (%), grouped on Unit, in a defined period (7 days) before a treatment.
  2. Make a vector that counts the day post last treatment until next treatment.

The data sett looks like this:

Unit    Date    Prcent_daily.mortality  Date.treatment
A   20.07.2020  0.2 NA
A   21.07.2020  0   NA
A   22.07.2020  0.4 NA
A   23.07.2020  0.3 NA
A   24.07.2020  0.6 NA
A   25.07.2020  0.05    NA
A   26.07.2020  0   NA
A   27.07.2020  0   NA
A   28.07.2020  0.01    28.07.2020
A   29.07.2020  0.1 NA
A   30.07.2020  0.2 NA
A   31.07.2020  0   NA
A   01.08.2020  0.2 NA
A   02.08.2020  0.3 NA
A   03.08.2020  0.3 NA
A   04.08.2020  0.05    NA
A   05.08.2020  0   NA
A   06.08.2020  0   NA
A   07.08.2020  0.01    05.08.2020
A   08.08.2020  0.1 NA
A   09.08.2020  0.2 NA

And I want to achieve this:

Unit    Date    Prcent_daily.mortality  Date.treatment  akkum.7dbt  days.post.treatment
A   20.07.2020  0.2 NA  NA  NA
A   21.07.2020  0   NA  1.35    NA
A   22.07.2020  0.4 NA  1.35    NA
A   23.07.2020  0.3 NA  1.35    NA
A   24.07.2020  0.6 NA  1.35    NA
A   25.07.2020  0.05    NA  1.35    NA
A   26.07.2020  0   NA  1.35    NA
A   27.07.2020  0   NA  1.35    NA
A   28.07.2020  0.01    28.07.2020  1.35    0
A   29.07.2020  0.1 NA  NA  1
A   30.07.2020  0.2 NA  NA  2
A   31.07.2020  0   NA  0.85    3
A   01.08.2020  0.2 NA  0.85    4
A   02.08.2020  0.3 NA  0.85    5
A   03.08.2020  0.3 NA  0.85    6
A   04.08.2020  0.05    NA  0.85    7
A   05.08.2020  0   NA  0.85    8
A   06.08.2020  0   NA  0.85    9
A   07.08.2020  0.01    05.08.2020  0.85    0
A   08.08.2020  0.1 NA  NA  1
A   09.08.2020  0.2 NA  NA  2

Thanks for all help form a self learned R amateur.


Solution

  • Try this solution which combines base R and dplyr:

    library(dplyr)
    library(tidyr)
    #Create empty col for index
    i1 <- which(!is.na(df$Date.treatment))
    i2 <- i1-7
    i1 <- i1-1
    i3 <- 1:length(i1)
    #Create index for second var
    j1 <- which(!is.na(df$Date.treatment))
    j2 <- 1:length(j1)
    # i3 <- i1+1
    df$Var <- NA
    df$Var[i1]<-i3
    df$Var[i2]<-i3
    df$Var[1] <- 0
    df$Var <- ifelse(!is.na(df$Date.treatment),0,df$Var)
    #Fill
    df %>% fill(Var) -> df1
    #Create aggregations
    df1 %>% filter(Var!=0) %>% group_by(Var) %>% mutate(Cum=cumsum(Prcent_daily.mortality)) %>%
      filter(Cum==max(Cum)) %>% filter(!duplicated(Cum)) %>% ungroup() %>% select(c(Unit,Cum)) -> Ag1
    #Create another var
    df$Var2 <- NA
    df$Var2[j1] <- j2
    df$Var2[1] <- 0
    #Fill
    df %>% fill(Var2) -> df2
    #Create cums and days
    df2 %>% group_by(Unit,Var2) %>% mutate(Day=(1:n())-1) %>% ungroup() %>% select(-c(Var2))  -> df3
    #Empty var for cums
    df3$Cum <- NA
    df3$Cum[i1+1] <- Ag1$Cum
    #Fill 2
    df3 %>% fill(Cum,.direction = 'up') -> df4
    #Some adjusts
    df4$Day[1:i1[1]]<-NA
    df4$Cum[1] <- NA
    df4$Cum <- ifelse((df4$Day==1 | df4$Day==2) & !is.na(df4$Day),NA,df4$Cum)
    

    This will produce:

       Unit       Date Prcent_daily.mortality Date.treatment Var Day  Cum
    1     A 20.07.2020                   0.20           <NA>   0  NA   NA
    2     A 21.07.2020                   0.00           <NA>   1  NA 1.35
    3     A 22.07.2020                   0.40           <NA>  NA  NA 1.35
    4     A 23.07.2020                   0.30           <NA>  NA  NA 1.35
    5     A 24.07.2020                   0.60           <NA>  NA  NA 1.35
    6     A 25.07.2020                   0.05           <NA>  NA  NA 1.35
    7     A 26.07.2020                   0.00           <NA>  NA  NA 1.35
    8     A 27.07.2020                   0.00           <NA>   1  NA 1.35
    9     A 28.07.2020                   0.01     28.07.2020   0   0 1.35
    10    A 29.07.2020                   0.10           <NA>  NA   1   NA
    11    A 30.07.2020                   0.20           <NA>  NA   2   NA
    12    A 31.07.2020                   0.00           <NA>   2   3 0.85
    13    A 01.08.2020                   0.20           <NA>  NA   4 0.85
    14    A 02.08.2020                   0.30           <NA>  NA   5 0.85
    15    A 03.08.2020                   0.30           <NA>  NA   6 0.85
    16    A 04.08.2020                   0.05           <NA>  NA   7 0.85
    17    A 05.08.2020                   0.00           <NA>  NA   8 0.85
    18    A 06.08.2020                   0.00           <NA>   2   9 0.85
    19    A 07.08.2020                   0.01     05.08.2020   0   0 0.85
    20    A 08.08.2020                   0.10           <NA>  NA   1   NA
    21    A 09.08.2020                   0.20           <NA>  NA   2   NA
    

    Update: Working on df4 you can get the cumsum of Prcent_daily.mortality with next code:

    #You can work with df4 to complete the rest of aggregations
    #First create an dpuplicate var
    df4$DateD <- df4$Date.treatment
    #Now fill and mutate
    df4 %>% fill(DateD) -> df4
    #Create index for replacement
    k <- df4$Date.treatment==df4$DateD & !is.na(df4$Date.treatment)
    #Assign a value for aggregations not considered
    df4$DateD[k]<-'NULL'
    #Cumsum
    df4 %>% group_by(DateD) %>% mutate(CumAfter=cumsum(Prcent_daily.mortality)) -> df4
    #Now remove redundant values in the cum and drop the reference var
    df4 %>% ungroup() %>% mutate(CumAfter=ifelse(is.na(DateD) | DateD=='NULL',NA,CumAfter)) %>%
      select(-DateD) -> df4
    

    The output will be next:

       Unit       Date Prcent_daily.mortality Date.treatment Var Day  Cum CumAfter
    1     A 20.07.2020                   0.20           <NA>   0  NA   NA       NA
    2     A 21.07.2020                   0.00           <NA>   1  NA 1.35       NA
    3     A 22.07.2020                   0.40           <NA>  NA  NA 1.35       NA
    4     A 23.07.2020                   0.30           <NA>  NA  NA 1.35       NA
    5     A 24.07.2020                   0.60           <NA>  NA  NA 1.35       NA
    6     A 25.07.2020                   0.05           <NA>  NA  NA 1.35       NA
    7     A 26.07.2020                   0.00           <NA>  NA  NA 1.35       NA
    8     A 27.07.2020                   0.00           <NA>   1  NA 1.35       NA
    9     A 28.07.2020                   0.01     28.07.2020   0   0 1.35       NA
    10    A 29.07.2020                   0.10           <NA>  NA   1   NA     0.10
    11    A 30.07.2020                   0.20           <NA>  NA   2   NA     0.30
    12    A 31.07.2020                   0.00           <NA>   2   3 0.85     0.30
    13    A 01.08.2020                   0.20           <NA>  NA   4 0.85     0.50
    14    A 02.08.2020                   0.30           <NA>  NA   5 0.85     0.80
    15    A 03.08.2020                   0.30           <NA>  NA   6 0.85     1.10
    16    A 04.08.2020                   0.05           <NA>  NA   7 0.85     1.15
    17    A 05.08.2020                   0.00           <NA>  NA   8 0.85     1.15
    18    A 06.08.2020                   0.00           <NA>   2   9 0.85     1.15
    19    A 07.08.2020                   0.01     05.08.2020   0   0 0.85       NA
    20    A 08.08.2020                   0.10           <NA>  NA   1   NA     0.10
    21    A 09.08.2020                   0.20           <NA>  NA   2   NA     0.30