r dataframe dplyr data-manipulation data-munging

How to reset cumulative accumulation of degree hours in a column based on a condition

I've computed the index_new column based on specific conditions. However, I'm facing an issue with resetting the calculation to zero when the variable dry_hours exceeds 5.

Here is my code:

# Degree hours are accumulated relative to this base temperature.
base_temperature <- 44

df <- df %>%
  mutate(
    # position inside the current run of lwd == 0; rows with lwd != 0 get 0
    dry_hours = ifelse(lwd == 0, sequence(rle(lwd == 0)$lengths), 0),
    # rows flagged here contribute 0 to the running total
    zero_index = lwd == 0 | dry_hours > 5 | temp < 44 | temp > 86
  ) %>%
  group_by(event) %>%
  mutate(
    index_new = cumsum(ifelse(zero_index, 0, temp - base_temperature))
  ) %>%
  select(-zero_index) %>%
  relocate(index, .before = index_new)

Here is a reproducible example. The index column holds the output I want; index_new is what my code currently produces:


# Reproducible example data, as produced by dput().
# 102 rows in a tibble grouped by `event`: rows 1-51 belong to event 1 and
# rows 52-102 to event 2. Within each event `temp` runs from 40 to 90.
# Columns: event, lwd, temp, dry_hours, index, index_new.
# NOTE(review): `index` appears to be the desired result and `index_new` the
# output of the non-resetting code -- confirm against the question text.
df <- structure(list(event = c(1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 
 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 
 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 
 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 
 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 
 2, 2, 2, 2, 2, 2), lwd = c(1, 1, 1, 0, 0, 0, 0, 0, 0, 1, 1, 1, 
 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 0, 0, 0, 
 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 
 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 
 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 
 1, 1, 1, 1, 1, 1), temp = c(40, 41, 42, 43, 44, 45, 46, 47, 48, 
 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64, 
 65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 78, 79, 80, 
 81, 82, 83, 84, 85, 86, 87, 88, 89, 90, 40, 41, 42, 43, 44, 45, 
 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 
 62, 63, 64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 
 78, 79, 80, 81, 82, 83, 84, 85, 86, 87, 88, 89, 90), dry_hours = c(0, 
 0, 0, 1, 2, 3, 4, 5, 6, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 2, 
 3, 4, 0, 0, 0, 0, 0, 0, 1, 2, 3, 4, 5, 6, 7, 0, 0, 0, 0, 0, 0, 
 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 2, 3, 4, 5, 6, 0, 0, 0, 0, 
 0, 0, 0, 0, 0, 0, 0, 1, 2, 3, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 
 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0), index = c(0, 
 0, 0, 0, 0, 0, 0, 0, 0, 5, 11, 18, 26, 35, 45, 56, 68, 81, 95, 
 110, 110, 110, 110, 110, 130, 151, 173, 196, 220, 245, 245, 245, 
 245, 245, 245, 0, 0, 33, 67, 102, 138, 175, 213, 252, 292, 333, 
 375, 375, 375, 375, 375, 0, 0, 0, 0, 0, 0, 0, 0, 0, 5, 11, 18, 
 26, 35, 45, 56, 68, 81, 95, 110, 110, 110, 110, 110, 130, 151, 
 173, 196, 220, 245, 271, 298, 326, 355, 385, 416, 448, 481, 515, 
 550, 586, 623, 661, 700, 740, 781, 823, 823, 823, 823, 823), 
 index_new = c(0, 0, 0, 0, 0, 0, 0, 0, 0, 5, 11, 18, 26, 35, 
 45, 56, 68, 81, 95, 110, 110, 110, 110, 110, 130, 151, 173, 
 196, 220, 245, 245, 245, 245, 245, 245, 245, 245, 278, 312, 
 347, 383, 420, 458, 497, 537, 578, 620, 620, 620, 620, 620, 
 0, 0, 0, 0, 0, 0, 0, 0, 0, 5, 11, 18, 26, 35, 45, 56, 68, 
 81, 95, 110, 110, 110, 110, 110, 130, 151, 173, 196, 220, 
 245, 271, 298, 326, 355, 385, 416, 448, 481, 515, 550, 586, 
 623, 661, 700, 740, 781, 823, 823, 823, 823, 823)), 
 class = c("grouped_df", "tbl_df", "tbl", "data.frame"), row.names = c(NA, -102L),
 groups = structure(list(event = c(1, 2), .rows = structure(list(1:51, 52:102),
 ptype = integer(0), class = c("vctrs_list_of", 
 "vctrs_vctr", "list"))), class = c("tbl_df", "tbl", "data.frame"
 ), row.names = c(NA, -2L), .drop = TRUE))

Solution

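If you want the calculation grouped by event and reset whenever dry_hours exceeds 5, add a running count of the rows where dry_hours exceeds 5 to the grouping. Every such row bumps the count and so starts a new group, which makes the cumulative sum restart from zero there. Change group_by(event) to group_by(event, cumsum(dry_hours > 5)):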

# Accumulate degree hours per event, restarting after every row whose
# dry_hours exceeds 5, then compare the result with the `index` column.
df %>%
  mutate(
    # position inside the current run of lwd == 0; rows with lwd != 0 get 0
    dry_hours = ifelse(lwd == 0, sequence(rle(lwd == 0)$lengths), 0),
    # rows flagged here contribute 0 to the running total
    zero_index = lwd == 0 | dry_hours > 5 | temp < 44 | temp > 86
  ) %>%
  # each dry_hours > 5 row increments the running count and opens a new group
  group_by(event, cumsum(dry_hours > 5)) %>%
  mutate(
    index_new = cumsum(ifelse(zero_index, 0, temp - base_temperature))
  ) %>%
  select(-zero_index) %>%
  relocate(index, .before = index_new) %>%
  ungroup() %>%
  # sanity check: keep only the rows where index and index_new differ
  filter(index != index_new)
# A tibble: 0 × 7
# ℹ 7 variables: event <dbl>, lwd <dbl>, temp <dbl>, dry_hours <dbl>, index <dbl>, index_new <dbl>,
#   cumsum(dry_hours > 5) <int>

## all rows match!