Search code examples
rresetcumsum

Two cumsum conditions with reset in R


I have a dataframe that has two types of value. I'd like to slice it in groups. This groups are expected to provide two conditions. Each group should be;

  • Conditions 1: max cumulative value of w <= 75
  • Conditions 1: max cumulative value of n <= 15

If one of these criteria reach the max cumulative value, it should reset the cumulative sums and start over again for both.

id<- sample(1:33)
w <- c(2,1,32,5,1,1,12,1,2,32,32,32,1,3,2,12,1,1,1,1,1,1,5,3,5,1,1,1,2,7,2,32,1)
n <- c(1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1)
df  <- data.frame(id, w, n)

the expected result (made manully)

w cumsum_w n cumsum_n group
2     2     1   1   1
1     3     1   2   1
32    35    1   3   1
5     40    1   4   1
1     41    1   5   1
1     42    1   6   1
12    54    1   7   1
1     55    1   8   1
2     57    1   9   1
32    32    1   2   2
32    64    1   3   2
32    32    1   1   3
1     33    1   2   3
3     36    1   3   3
2     38    1   4   3
12    50    1   5   3
1     51    1   6   3
1     52    1   7   3
1     53    1   8   3
1     54    1   9   3
1     55    1   10  3
1     56    1   11  3
5     61    1   12  3
3     64    1   13  3
5     69    1   14  3
1     70    1   15  3
1     1     1   1   4
1     2     1   2   4
2     4     1   3   4
7     11    1   4   4
2     13    1   5   4
32    45    1   6   4
1     46    1   7   4

I tried to solve some methods:

Method 1

library(BBmisc)
chunk(df, chunk.size = 75, n.chunks = 15)
Error in chunk(df, chunk.size = 75, n.chunks = 15) : 
  You must provide exactly one of 'chunk.size', 'n.chunks' or 'props'

Method 2

cumsum_with_reset_group <- function(w, n, threshold_w, threshold_n) {
  cumsum_w <- 0
  cumsum_n <- 0
  group <- 1
  result <- numeric()

  for (i in 1:length(w)) {
    cumsum_w <- cumsum_w + w[i]
    cumsum_n <- cumsum_n + n[i]

    if (cumsum_w > threshold_w | cumsum_n > threshold_n) {
      group <- group + 1
      cumsum_w <- cumsum_w + w[i]
      cumsum_n <- cumsum_n + n[i]
    }

    result = c(result, group)

  }

  return (result)
}

# cumsum with reset
cumsum_w_with_reset <- function(w, threshold_w) {
  cumsum_w <- 0
  group <- 1
  result <- numeric()

  for (i in 1:length(w)) {
    cumsum_w <- cumsum_w + w[i]

    if (cumsum_w > threshold_w) {
      group <- group + 1
      cumsum_w <- w[i]
    }

    result = c(result, cumsum_w)

  }

  return (result)
}


# cumsum with reset
cumsum_n_with_reset <- function(n, threshold_n) {
  cumsum_n <- 0
  group <- 1
  result <- numeric()

  for (i in 1:length(n)) {
    cumsum_n <- cumsum_n + n[i]

    if (cumsum_n > threshold_n | cumsum_w > threshold_w) {
      group <- group + 1
      cumsum_n <- n[i]
    }

    result = c(result, cumsum_n)

  }

  return (result)
}
# use functions above as window functions inside mutate statement
y<-df %>% group_by() %>%
  mutate(
    cumsum_w = cumsum_w_with_reset(w, 75),
    cumsum_n  =cumsum_n_with_reset(n, 15),
    group = cumsum_with_reset_group(w, n, 75, 15)
  ) %>% 
  ungroup()

    Error in mutate_impl(.data, dots) : 
      Evaluation error: object 'cumsum_w' not found

Thanks!


Solution

  • Here is a hack, which is done by repeated subsetting and binding. As such, this will be very slow with large data sets. This takes the whole data set as an input.

    library(dplyr)
    
    cumsumdf <- function(df){
      cumsum_75 <- function(x) {cumsum(x) %/% 76}
      cumsum_15 <- function(x) {cumsum(x) %/% 16}  
      cumsum_w75 <- function(x) {cumsum(x) %% 76}
      cumsum_n15 <- function(x) {cumsum(x) %% 16}
    
      m <- nrow(df)
    
      df$grp <- 0
      df <- df %>%
        group_by(grp) %>%
        mutate(cumsum_w = numeric(m), cumsum_n = numeric(m))
    
      n = 0
      df2 <- df[0,]
      while(nrow(df) >0 ){
        df$cumsum_w = cumsum_75(df$w)
        df$cumsum_n = cumsum_15(df$n)
    
        n <- n + 1
        df1 <- df[df$cumsum_n == 0 & df$cumsum_w == 0,]
        df <- df[df$cumsum_n != 0 | df$cumsum_w != 0,]
        df1$grp <- n  
        df1 <- df1 %>% group_by(grp) %>%
          mutate(cumsum_w = cumsum_w75(w), cumsum_n = cumsum_n15(n))
        df2 <- rbind(df2,df1)
      }
      return(df2)
    }
    
    cumsumdf(df)