Search code examples
rdplyrlubridate

Remove the cumulative sum by combination of dates and breaks


I have a data set (ds) with the variables ``,date, values and break. values is a cumulative sum until break == "True" and after the data start the process of cumulative sum again. In my example:

# Packages
library(dplyr)
library(lubridate)


# My data set
ds <- read.csv("https://raw.githubusercontent.com/Leprechault/trash/main/accumulated_values_table")
ds$date <- ymd(ds$date)
str(ds)

# 'data.frame': 4458 obs. of  4 variables:
# $ id    : int  0 1 2 2 1 0 3 2 4 1 ...
# $ date  : Date, format: "2020-11-26" "2020-11-26" "2020-11-26" "2020-11-27" ...
# $ values: int  36 33 27 22 34 28 5 18 28 24 ...
# $ break.: chr  "False" "False" "False" "False" ...

I'd like to find any way to remove the cumulative sum and the desired output is the daily number of values by id. I need some help for any function to make the subtraction of day+1 - day until my column break == "True" for each id. For example to my id=3:

# Actual dataset
ds.3 <- ds %>% filter(id==3)
head(ds.3)
#   id       date values break.
# 1  3 2020-11-27      5   True
# 2  3 2020-12-03     16  False
# 3  3 2020-12-05     18  False
# 4  3 2020-12-08      7   True
# 5  3 2020-12-09     27  False
# 6  3 2020-12-19     18  False

# Desirable dataset
#   id       date values break.
# 1  3 2020-11-27      5   True
# 2  3 2020-12-03     16  False
# 3  3 2020-12-05      2  False
# 4  3 2020-12-08      5   True
# 5  3 2020-12-09     27  False
# 6  3 2020-12-19      0  False * PS: if the sum is negative, the the value is equal to 0

Please any ideas?


Solution

  • I'm inferring that we want to group by rows between "True" (by id too) and reassign values only where break. is not "True".

    dplyr

    library(dplyr)
    out <- ds %>%
      mutate(break. = break. == "True") %>%
      group_by(id) %>%
      mutate(grp = cumsum(break.)) %>%
      group_by(id, grp, break.) %>%
      mutate(values2 = if (break.[1]) values else pmax(0, c(values[1], diff(values)))) %>%
      ungroup()
    out
    # # A tibble: 4,458 x 6
    #       id date       values break.   grp values2
    #    <int> <chr>       <int> <lgl>  <int>   <dbl>
    #  1     0 2020-11-26     36 FALSE      0      36
    #  2     1 2020-11-26     33 FALSE      0      33
    #  3     2 2020-11-26     27 FALSE      0      27
    #  4     2 2020-11-27     22 FALSE      0       0
    #  5     1 2020-11-27     34 TRUE       1      34
    #  6     0 2020-11-27     28 FALSE      0       0
    #  7     3 2020-11-27      5 TRUE       1       5
    #  8     2 2020-11-28     18 FALSE      0       0
    #  9     4 2020-11-28     28 FALSE      0      28
    # 10     1 2020-11-28     24 FALSE      1      24
    # # ... with 4,448 more rows
    

    And we can look at just id == 3:

    filter(out, id == 3)
    # # A tibble: 17 x 6
    #       id date       values break.   grp values2
    #    <int> <chr>       <int> <lgl>  <int>   <dbl>
    #  1     3 2020-11-27      5 TRUE       1       5
    #  2     3 2020-12-03     16 FALSE      1      16
    #  3     3 2020-12-05     18 FALSE      1       2
    #  4     3 2020-12-08      7 TRUE       2       7
    #  5     3 2020-12-09     27 FALSE      2      27
    #  6     3 2020-12-19     18 FALSE      2       0
    #  7     3 2020-12-21      2 FALSE      2       0
    #  8     3 2020-12-23     18 FALSE      2      16
    #  9     3 2020-12-27     15 FALSE      2       0
    # 10     3 2020-12-29     11 FALSE      2       0
    # 11     3 2021-01-10     11 FALSE      2       0
    # 12     3 2021-01-12     14 FALSE      2       3
    # 13     3 2021-01-27     10 FALSE      2       0
    # 14     3 2021-01-29     15 FALSE      2       5
    # 15     3 2021-01-31     15 FALSE      2       0
    # 16     3 2021-02-08     16 FALSE      2       1
    # 17     3 2021-02-09      8 FALSE      2       0
    

    base R

    ds$break. <- ds$break. == "True"
    ds$grp <- ave(ds$break., ds$id, FUN = cumsum)
    ds$values2 <- ifelse(
      ds$break. == "True", ds$values,
      ave(ds$values, ds[c("id","grp","break.")],
          FUN = function(z) pmax(0, c(z[1], diff(z))))
      )
    subset(ds, id == 3)
    #     id       date values break. grp values2
    # 7    3 2020-11-27      5   TRUE   1       5
    # 35   3 2020-12-03     16  FALSE   1      16
    # 48   3 2020-12-05     18  FALSE   1       2
    # 64   3 2020-12-08      7   TRUE   2       7
    # 74   3 2020-12-09     27  FALSE   2      27
    # 182  3 2020-12-19     18  FALSE   2       0
    # 201  3 2020-12-21      2  FALSE   2       0
    # 220  3 2020-12-23     18  FALSE   2      16
    # 249  3 2020-12-27     15  FALSE   2       0
    # 277  3 2020-12-29     11  FALSE   2       0
    # 380  3 2021-01-10     11  FALSE   2       0
    # 390  3 2021-01-12     14  FALSE   2       3
    # 509  3 2021-01-27     10  FALSE   2       0
    # 524  3 2021-01-29     15  FALSE   2       5
    # 539  3 2021-01-31     15  FALSE   2       0
    # 608  3 2021-02-08     16  FALSE   2       1
    # 619  3 2021-02-09      8  FALSE   2       0