Search code examples
rfilterrle

How to apply RLE() function in R with multiple conditions?


I have the following data:

dat <- structure(list(year = c(1979L, 1979L, 1979L, 1979L, 1979L, 1979L, 
       1979L, 1979L, 1979L, 1979L, 1979L, 1980L, 1980L, 1980L, 1980L, 
       1980L, 1980L, 1980L, 1980L, 1980L, 1980L, 1980L, 1980L, 1981L, 
       1981L, 1981L, 1981L, 1981L, 1981L, 1981L, 1981L, 1981L, 1981L, 
       1981L, 1981L, 1981L, 1981L, 1981L, 1981L, 1981L, 1981L, 1981L, 
       1981L, 1981L, 1982L, 1982L, 1982L, 1982L, 1982L, 1982L, 1982L, 
       1982L, 1982L, 1982L, 1982L, 1982L, 1982L, 1982L, 1982L, 1982L, 
       1982L, 1982L, 1982L, 1982L, 1982L, 1982L, 1982L, 1982L, 1982L, 
       1983L, 1983L, 1983L, 1983L, 1983L, 1983L, 1983L, 1983L, 1983L, 
       1983L, 1984L, 1984L, 1984L, 1984L, 1984L, 1984L, 1984L, 1984L, 
       1984L, 1984L, 1984L, 1985L, 1985L, 1985L, 1985L, 1985L, 1985L, 
       1985L, 1985L, 1985L, 1985L, 1985L, 1985L, 1985L, 1985L, 1985L, 
       1985L, 1985L, 1985L, 1985L, 1985L, 1985L, 1985L, 1985L, 1985L, 
       1985L, 1985L, 1985L, 1985L, 1985L, 1985L, 1985L, 1985L, 1985L, 
       1985L, 1985L, 1985L, 1985L, 1985L, 1985L, 1985L, 1985L, 1985L, 
       1985L, 1985L, 1985L, 1985L, 1985L, 1985L, 1985L, 1985L, 1985L, 
       1985L, 1986L, 1986L, 1986L, 1987L, 1987L, 1987L, 1987L, 1987L, 
       1987L, 1987L, 1987L, 1987L, 1987L, 1987L, 1988L, 1988L, 1988L, 
       1988L, 1988L, 1988L, 1988L, 1988L, 1988L, 1988L, 1988L, 1988L, 
       1988L, 1989L, 1989L, 1989L, 1989L, 1989L, 1989L, 1989L, 1990L, 
       1990L, 1990L, 1990L, 1990L, 1990L, 1990L, 1990L, 1990L, 1990L, 
       1990L, 1990L, 1990L, 1990L, 1990L, 1990L, 1990L, 1991L, 1991L, 
       1991L, 1991L, 1991L, 1991L, 1991L, 1991L, 1991L, 1991L, 1991L, 
       1991L, 1991L, 1991L, 1992L, 1992L, 1992L, 1992L, 1992L, 1992L, 
       1992L, 1992L, 1992L, 1993L, 1993L, 1993L, 1993L, 1993L, 1994L, 
       1994L, 1994L, 1994L, 1994L, 1994L, 1994L, 1994L, 1994L, 1994L, 
       1994L, 1994L, 1994L, 1994L, 1994L, 1994L, 1994L, 1994L, 1994L, 
       1994L, 1994L, 1994L, 1994L, 1994L, 1994L, 1994L, 1994L, 1994L, 
       1994L, 1994L, 1994L, 1994L, 1994L, 1994L, 1994L, 1994L, 1995L, 
       1995L, 1995L, 1995L, 1996L, 1996L, 1996L, 1996L, 1996L), mon = c(5L, 
       5L, 5L, 5L, 5L, 5L, 5L, 5L, 5L, 5L, 5L, 7L, 7L, 7L, 7L, 7L, 7L, 
       7L, 7L, 7L, 7L, 7L, 7L, 6L, 6L, 6L, 6L, 6L, 6L, 6L, 6L, 6L, 6L, 
       6L, 6L, 6L, 6L, 6L, 6L, 6L, 6L, 6L, 6L, 6L, 6L, 6L, 6L, 6L, 7L, 
       7L, 7L, 7L, 7L, 7L, 7L, 7L, 7L, 7L, 7L, 7L, 7L, 7L, 7L, 7L, 7L, 
       7L, 7L, 7L, 7L, 6L, 6L, 6L, 6L, 6L, 6L, 6L, 6L, 6L, 6L, 5L, 5L, 
       5L, 5L, 5L, 5L, 5L, 5L, 5L, 5L, 5L, 6L, 6L, 6L, 6L, 6L, 6L, 6L, 
       6L, 6L, 6L, 6L, 6L, 6L, 6L, 6L, 6L, 6L, 6L, 6L, 6L, 6L, 7L, 7L, 
       7L, 7L, 7L, 7L, 7L, 7L, 7L, 7L, 7L, 7L, 7L, 7L, 7L, 7L, 7L, 7L, 
       7L, 7L, 7L, 7L, 7L, 7L, 7L, 7L, 7L, 7L, 7L, 7L, 7L, 5L, 5L, 5L, 
       6L, 6L, 6L, 6L, 6L, 6L, 6L, 6L, 6L, 6L, 6L, 5L, 5L, 5L, 6L, 6L, 
       6L, 6L, 6L, 6L, 6L, 6L, 6L, 6L, 5L, 5L, 5L, 5L, 5L, 5L, 5L, 5L, 
       5L, 5L, 5L, 5L, 5L, 5L, 5L, 6L, 6L, 6L, 6L, 6L, 6L, 6L, 6L, 6L, 
       6L, 6L, 6L, 6L, 6L, 6L, 6L, 6L, 6L, 6L, 6L, 6L, 6L, 6L, 6L, 6L, 
       6L, 6L, 6L, 6L, 6L, 6L, 6L, 6L, 6L, 6L, 7L, 7L, 6L, 6L, 6L, 6L, 
       6L, 6L, 6L, 6L, 6L, 6L, 6L, 6L, 6L, 6L, 6L, 6L, 6L, 6L, 6L, 6L, 
       6L, 6L, 6L, 6L, 6L, 7L, 7L, 7L, 7L, 7L, 7L, 7L, 7L, 7L, 7L, 7L, 
       5L, 5L, 5L, 5L, 6L, 6L, 6L, 7L, 7L), day = c(16L, 17L, 18L, 19L, 
       20L, 21L, 22L, 23L, 24L, 25L, 26L, 4L, 5L, 6L, 7L, 8L, 9L, 10L, 
       11L, 12L, 13L, 14L, 15L, 9L, 10L, 11L, 12L, 13L, 14L, 15L, 16L, 
       17L, 18L, 19L, 20L, 21L, 22L, 23L, 24L, 25L, 26L, 27L, 28L, 29L, 
       27L, 28L, 29L, 30L, 1L, 2L, 3L, 4L, 5L, 6L, 7L, 8L, 9L, 10L, 
       11L, 12L, 13L, 14L, 15L, 16L, 17L, 18L, 19L, 20L, 21L, 7L, 8L, 
       9L, 10L, 11L, 12L, 13L, 14L, 15L, 16L, 7L, 8L, 9L, 10L, 11L, 
       12L, 13L, 14L, 15L, 16L, 17L, 10L, 11L, 12L, 13L, 14L, 15L, 16L, 
       17L, 18L, 19L, 20L, 21L, 22L, 23L, 24L, 25L, 26L, 27L, 28L, 29L, 
       30L, 1L, 2L, 3L, 4L, 5L, 6L, 7L, 8L, 9L, 10L, 11L, 12L, 13L, 
       14L, 15L, 16L, 17L, 18L, 19L, 20L, 21L, 22L, 23L, 24L, 25L, 26L, 
       27L, 28L, 29L, 30L, 31L, 18L, 19L, 20L, 12L, 13L, 14L, 15L, 16L, 
       17L, 18L, 19L, 20L, 21L, 22L, 29L, 30L, 31L, 1L, 2L, 3L, 4L, 
       5L, 6L, 7L, 8L, 9L, 10L, 19L, 20L, 21L, 22L, 23L, 24L, 25L, 24L, 
       25L, 26L, 27L, 28L, 29L, 30L, 31L, 1L, 2L, 3L, 4L, 5L, 6L, 7L, 
       8L, 9L, 17L, 18L, 19L, 20L, 21L, 22L, 23L, 24L, 25L, 26L, 27L, 
       28L, 29L, 30L, 1L, 2L, 3L, 4L, 5L, 6L, 7L, 8L, 9L, 28L, 29L, 
       30L, 1L, 2L, 6L, 7L, 8L, 9L, 10L, 11L, 12L, 13L, 14L, 15L, 16L, 
       17L, 18L, 19L, 20L, 21L, 22L, 23L, 24L, 25L, 26L, 27L, 28L, 29L, 
       30L, 1L, 2L, 3L, 4L, 5L, 6L, 7L, 8L, 9L, 10L, 11L, 17L, 18L, 
       19L, 20L, 28L, 29L, 30L, 1L, 2L), phase = c(6L, 6L, 6L, 6L, 7L, 
       7L, 7L, 7L, 7L, 8L, 8L, 6L, 7L, 7L, 7L, 7L, 7L, 7L, 7L, 8L, 8L, 
       8L, 8L, 6L, 7L, 7L, 7L, 7L, 7L, 7L, 7L, 7L, 7L, 8L, 8L, 8L, 7L, 
       7L, 6L, 5L, 5L, 5L, 5L, 5L, 7L, 8L, 8L, 8L, 8L, 8L, 1L, 1L, 1L, 
       1L, 1L, 1L, 2L, 2L, 2L, 3L, 3L, 3L, 3L, 4L, 4L, 4L, 4L, 5L, 5L, 
       1L, 1L, 1L, 1L, 2L, 2L, 2L, 3L, 3L, 3L, 1L, 1L, 1L, 1L, 1L, 1L, 
       1L, 1L, 1L, 2L, 2L, 5L, 5L, 5L, 6L, 6L, 6L, 6L, 6L, 7L, 7L, 7L, 
       7L, 7L, 7L, 7L, 7L, 7L, 7L, 7L, 7L, 7L, 7L, 7L, 8L, 8L, 8L, 8L, 
       8L, 8L, 1L, 1L, 1L, 2L, 2L, 2L, 2L, 3L, 3L, 3L, 3L, 3L, 3L, 4L, 
       4L, 4L, 4L, 4L, 4L, 4L, 4L, 5L, 5L, 7L, 7L, 7L, 6L, 6L, 6L, 6L, 
       6L, 7L, 7L, 7L, 8L, 8L, 1L, 7L, 7L, 8L, 8L, 8L, 8L, 7L, 7L, 7L, 
       7L, 7L, 7L, 7L, 4L, 4L, 5L, 5L, 5L, 5L, 5L, 6L, 6L, 7L, 7L, 7L, 
       8L, 8L, 1L, 1L, 2L, 2L, 2L, 3L, 3L, 3L, 3L, 3L, 7L, 7L, 8L, 8L, 
       8L, 8L, 8L, 1L, 1L, 1L, 1L, 1L, 2L, 2L, 1L, 2L, 2L, 2L, 2L, 2L, 
       3L, 3L, 3L, 1L, 1L, 1L, 2L, 2L, 4L, 4L, 4L, 4L, 4L, 5L, 5L, 5L, 
       5L, 5L, 5L, 6L, 6L, 6L, 7L, 7L, 7L, 8L, 8L, 8L, 8L, 1L, 1L, 1L, 
       1L, 1L, 2L, 2L, 2L, 3L, 3L, 4L, 4L, 5L, 5L, 5L, 6L, 6L, 6L, 7L, 
       8L, 1L, 1L, 1L, 1L), Rainfall = c(23.2, 35.575, 37.4, 6.425, 
       10.275, 3.05, 50.075, 23.05, 2, 1.4, 3.325, 12.84, 0.68, 7.78, 
       0, 12.88, 91.48, 41.08, 4.48, 0, 0.26, 0, 2.32, 13.25, 64.5, 
       21.55, 82.175, 33.725, 48.95, 3.8, 16.875, 4.7, 7.7, 48.7, 25.275, 
       3.625, 0.075, 2.5, 0, 3.525, 0.725, 0.2, 0.625, 0.25, 2.85, 6.15, 
       10.675, 41.975, 24.975, 127.775, 86.225, 19.95, 1.725, 11.125, 
       0.075, 1.775, 5.825, 5.975, 18.125, 6.425, 3.725, 11.75, 13.975, 
       0.1, 1, 0.725, 4.775, 0.225, 2.625, 0.575, 13.375, 0, 0, 0, 0, 
       0, 0, 0, 0.825, 0, 0, 0.45, 2.2, 4.5, 0, 0, 0.05, 0, 0, 0.225, 
       0.975, 5.375, 9.1, 27.3, 47.7, 31.475, 4.8, 11.45, 3.15, 2.3, 
       14.975, 77.25, 112.225, 69.675, 27.625, 43.65, 34.85, 47.325, 
       65.725, 83.825, 29.525, 29.95, 12.575, 0, 3.2, 30.95, 26.25, 
       4.15, 0.025, 0.5, 0.375, 5.15, 3.525, 0, 0.55, 2.025, 10.525, 
       0.4, 0.05, 9.225, 0.2, 5.25, 0, 0.5, 1.3, 3.175, 7.825, 1.15, 
       3.475, 0.4, 0.1, 0.2, 4.275, 3.45, 0.075, 3.95, 23.525, 6.2, 
       5.7, 6.1, 4.975, 2.7, 0.95, 0, 0, 1.55, 37.525, 53.8, 26.275, 
       101.25, 81.825, 26.05, 6.4, 6.75, 0.65, 2.475, 2.7, 1.45, 0.775, 
       0.2, 5.8, 0.36, 0.02, 0.8, 2.64, 3.44, 26.8, 17.98, 3.88, 33.48, 
       8.08, 15.8, 11.52, 21.44, 31.18, 13.06, 12.92, 0.24, 4.48, 9.4, 
       4.24, 4.36, 2.34, 5.72, 16.56, 10.96, 24.12, 2.96, 28.48, 14.72, 
       6.32, 0, 0.3, 3.46, 0.62, 0.76, 0.46, 17.22, 10.92, 1.96, 2.92, 
       0, 0, 3.44, 3.86, 2.88, 0.72, 0, 0, 0.06, 1.62, 28.74, 0.64, 
       0, 0, 1.18, 0.42, 5.46, 3.56, 0.44, 0.48, 4.9, 1.48, 2.7, 19.94, 
       0.4, 7.28, 29.56, 8.72, 1.5, 2.32, 2.42, 4.62, 1.2, 13.88, 9.76, 
       26.32, 11, 23.8, 10.08, 17.04, 47.6, 15.22, 4.06, 60.3, 71.2, 
       16.54, 0.44, 0.68, 0, 0.88, 0.1, 0.04, 0.34, 0, 0.36)), row.names = c(NA, 
       266L), class = "data.frame")

There are four columns in this data: year, month, day, phase, Rainfall

I would like to:

(1) Count the number of cases when rainfall is below 5 mm/day for at least 3 consecutive days (length >= 3)

(2) and the start (first occurrence) of these consecutive days should have a phase equal to 1.

I have the following incorrect script:

 dat2<-subset(dat, phase == 1)
 countruns = function(x){
    rainfall_rle_df <- data.frame(unclass(rle(x$Rainfall < 5)))
    nrow(subset(rainfall_rle_df, lengths >= 3 & values == TRUE))
    }

 countruns(dat2)

This script gives me a value of 5, but this is incorrect. The correct answer for the whole data should only be 4.

So the problem is that this script ignores the starting phase of the consecutive days.

For example the year 1996 in the data above has the following values:

 year   month day phase Rainfall
 1996   6  28     8    0.100
 1996   6  29     1    0.040
 1996   6  30     1    0.340
 1996   7   1     1    0.000
 1996   7   2     1    0.360

The starting phase is phase 8 so this should not be counted.

The years that satisfied the above conditions should be 1983, 1984, 1991, and 1993.

Any suggestions on how I can do this in R? Or is there a better way to do this in R?

I have to put the script inside a function countruns().

I'll appreciate any help!


Solution

  • We create grouping column with rleid (from data.table) after grouping by 'year', then keep the groups where all values of 'Rainfall' are less than 5, filter the groups where the first value of 'phase' is 1, and do the frequency count (n()) in summarise

    library(dplyr)
    library(data.table)
    dat %>% 
      # // create a Date column as it is easier to check for consecutive days
      mutate(Date = as.Date(paste(year, mon, day, sep="-"))) %>% 
      # // create a group with year
      group_by(year) %>%
      # // add more groups with rleid on logical vector
      group_by(grp = rleid(Rainfall < 5), 
               # // checks for difference between adjacent Date
               # // if the difference is greater than 1, cumsum increments by 1
               grp2 = cumsum(c(0, abs(diff(Date))) > 1), .add = TRUE) %>% 
      # // filter groups where all Rainfall is less than 5 and number of rows >= 3
      filter(all(Rainfall < 5), n() >= 3) %>% 
      # // filter the groups where the first value of phase is 1
      filter(first(phase) == 1) %>%
      # // get the frequency count
      summarise(n = n(), .groups = 'drop') %>%
      # // remove the columns that are not needed
      select(-grp, -grp2)
    # A tibble: 4 x 2
    #   year     n
    #  <int> <int>
    #1  1983     8
    #2  1984    11
    #3  1991     6
    #4  1993     5
    

    If we want to wrap it in a function, the {{}} does the enquo + !! (when we pass unquoted variable names as arguments to function esp for the column names)

    f1 <- function(data, year, month, day, rainfall) {
            data %>%
               mutate(Date = as.Date(str_c({{year}}, {{month}}, 
                    {{day}}, sep="-"))) %>%
               group_by({{year}}) %>%
               group_by(grp = rleid({{rainfall}} < 5),
                        grp2 = cumsum(c(0, abs(diff(Date))) > 1),
                    .add = TRUE) %>%
               filter(all({{rainfall}} < 5), n() >=3 ) %>%
               filter(first(phase) == 1) %>%
               summarise(n = n(), .groups = 'drop') %>%
               select(-grp, -grp2)
         }
    

    -apply the function

    f1(dat, year, mon, day, Rainfall)
    # A tibble: 4 x 2
    #   year     n
    #  <int> <int>
    #1  1983     8
    #2  1984    11
    #3  1991     6
    #4  1993     5