I have the following data:
dat <- structure(list(year = c(1979L, 1979L, 1979L, 1979L, 1979L, 1979L,
1979L, 1979L, 1979L, 1979L, 1979L, 1980L, 1980L, 1980L, 1980L,
1980L, 1980L, 1980L, 1980L, 1980L, 1980L, 1980L, 1980L, 1981L,
1981L, 1981L, 1981L, 1981L, 1981L, 1981L, 1981L, 1981L, 1981L,
1981L, 1981L, 1981L, 1981L, 1981L, 1981L, 1981L, 1981L, 1981L,
1981L, 1981L, 1982L, 1982L, 1982L, 1982L, 1982L, 1982L, 1982L,
1982L, 1982L, 1982L, 1982L, 1982L, 1982L, 1982L, 1982L, 1982L,
1982L, 1982L, 1982L, 1982L, 1982L, 1982L, 1982L, 1982L, 1982L,
1983L, 1983L, 1983L, 1983L, 1983L, 1983L, 1983L, 1983L, 1983L,
1983L, 1984L, 1984L, 1984L, 1984L, 1984L, 1984L, 1984L, 1984L,
1984L, 1984L, 1984L, 1985L, 1985L, 1985L, 1985L, 1985L, 1985L,
1985L, 1985L, 1985L, 1985L, 1985L, 1985L, 1985L, 1985L, 1985L,
1985L, 1985L, 1985L, 1985L, 1985L, 1985L, 1985L, 1985L, 1985L,
1985L, 1985L, 1985L, 1985L, 1985L, 1985L, 1985L, 1985L, 1985L,
1985L, 1985L, 1985L, 1985L, 1985L, 1985L, 1985L, 1985L, 1985L,
1985L, 1985L, 1985L, 1985L, 1985L, 1985L, 1985L, 1985L, 1985L,
1985L, 1986L, 1986L, 1986L, 1987L, 1987L, 1987L, 1987L, 1987L,
1987L, 1987L, 1987L, 1987L, 1987L, 1987L, 1988L, 1988L, 1988L,
1988L, 1988L, 1988L, 1988L, 1988L, 1988L, 1988L, 1988L, 1988L,
1988L, 1989L, 1989L, 1989L, 1989L, 1989L, 1989L, 1989L, 1990L,
1990L, 1990L, 1990L, 1990L, 1990L, 1990L, 1990L, 1990L, 1990L,
1990L, 1990L, 1990L, 1990L, 1990L, 1990L, 1990L, 1991L, 1991L,
1991L, 1991L, 1991L, 1991L, 1991L, 1991L, 1991L, 1991L, 1991L,
1991L, 1991L, 1991L, 1992L, 1992L, 1992L, 1992L, 1992L, 1992L,
1992L, 1992L, 1992L, 1993L, 1993L, 1993L, 1993L, 1993L, 1994L,
1994L, 1994L, 1994L, 1994L, 1994L, 1994L, 1994L, 1994L, 1994L,
1994L, 1994L, 1994L, 1994L, 1994L, 1994L, 1994L, 1994L, 1994L,
1994L, 1994L, 1994L, 1994L, 1994L, 1994L, 1994L, 1994L, 1994L,
1994L, 1994L, 1994L, 1994L, 1994L, 1994L, 1994L, 1994L, 1995L,
1995L, 1995L, 1995L, 1996L, 1996L, 1996L, 1996L, 1996L), mon = c(5L,
5L, 5L, 5L, 5L, 5L, 5L, 5L, 5L, 5L, 5L, 7L, 7L, 7L, 7L, 7L, 7L,
7L, 7L, 7L, 7L, 7L, 7L, 6L, 6L, 6L, 6L, 6L, 6L, 6L, 6L, 6L, 6L,
6L, 6L, 6L, 6L, 6L, 6L, 6L, 6L, 6L, 6L, 6L, 6L, 6L, 6L, 6L, 7L,
7L, 7L, 7L, 7L, 7L, 7L, 7L, 7L, 7L, 7L, 7L, 7L, 7L, 7L, 7L, 7L,
7L, 7L, 7L, 7L, 6L, 6L, 6L, 6L, 6L, 6L, 6L, 6L, 6L, 6L, 5L, 5L,
5L, 5L, 5L, 5L, 5L, 5L, 5L, 5L, 5L, 6L, 6L, 6L, 6L, 6L, 6L, 6L,
6L, 6L, 6L, 6L, 6L, 6L, 6L, 6L, 6L, 6L, 6L, 6L, 6L, 6L, 7L, 7L,
7L, 7L, 7L, 7L, 7L, 7L, 7L, 7L, 7L, 7L, 7L, 7L, 7L, 7L, 7L, 7L,
7L, 7L, 7L, 7L, 7L, 7L, 7L, 7L, 7L, 7L, 7L, 7L, 7L, 5L, 5L, 5L,
6L, 6L, 6L, 6L, 6L, 6L, 6L, 6L, 6L, 6L, 6L, 5L, 5L, 5L, 6L, 6L,
6L, 6L, 6L, 6L, 6L, 6L, 6L, 6L, 5L, 5L, 5L, 5L, 5L, 5L, 5L, 5L,
5L, 5L, 5L, 5L, 5L, 5L, 5L, 6L, 6L, 6L, 6L, 6L, 6L, 6L, 6L, 6L,
6L, 6L, 6L, 6L, 6L, 6L, 6L, 6L, 6L, 6L, 6L, 6L, 6L, 6L, 6L, 6L,
6L, 6L, 6L, 6L, 6L, 6L, 6L, 6L, 6L, 6L, 7L, 7L, 6L, 6L, 6L, 6L,
6L, 6L, 6L, 6L, 6L, 6L, 6L, 6L, 6L, 6L, 6L, 6L, 6L, 6L, 6L, 6L,
6L, 6L, 6L, 6L, 6L, 7L, 7L, 7L, 7L, 7L, 7L, 7L, 7L, 7L, 7L, 7L,
5L, 5L, 5L, 5L, 6L, 6L, 6L, 7L, 7L), day = c(16L, 17L, 18L, 19L,
20L, 21L, 22L, 23L, 24L, 25L, 26L, 4L, 5L, 6L, 7L, 8L, 9L, 10L,
11L, 12L, 13L, 14L, 15L, 9L, 10L, 11L, 12L, 13L, 14L, 15L, 16L,
17L, 18L, 19L, 20L, 21L, 22L, 23L, 24L, 25L, 26L, 27L, 28L, 29L,
27L, 28L, 29L, 30L, 1L, 2L, 3L, 4L, 5L, 6L, 7L, 8L, 9L, 10L,
11L, 12L, 13L, 14L, 15L, 16L, 17L, 18L, 19L, 20L, 21L, 7L, 8L,
9L, 10L, 11L, 12L, 13L, 14L, 15L, 16L, 7L, 8L, 9L, 10L, 11L,
12L, 13L, 14L, 15L, 16L, 17L, 10L, 11L, 12L, 13L, 14L, 15L, 16L,
17L, 18L, 19L, 20L, 21L, 22L, 23L, 24L, 25L, 26L, 27L, 28L, 29L,
30L, 1L, 2L, 3L, 4L, 5L, 6L, 7L, 8L, 9L, 10L, 11L, 12L, 13L,
14L, 15L, 16L, 17L, 18L, 19L, 20L, 21L, 22L, 23L, 24L, 25L, 26L,
27L, 28L, 29L, 30L, 31L, 18L, 19L, 20L, 12L, 13L, 14L, 15L, 16L,
17L, 18L, 19L, 20L, 21L, 22L, 29L, 30L, 31L, 1L, 2L, 3L, 4L,
5L, 6L, 7L, 8L, 9L, 10L, 19L, 20L, 21L, 22L, 23L, 24L, 25L, 24L,
25L, 26L, 27L, 28L, 29L, 30L, 31L, 1L, 2L, 3L, 4L, 5L, 6L, 7L,
8L, 9L, 17L, 18L, 19L, 20L, 21L, 22L, 23L, 24L, 25L, 26L, 27L,
28L, 29L, 30L, 1L, 2L, 3L, 4L, 5L, 6L, 7L, 8L, 9L, 28L, 29L,
30L, 1L, 2L, 6L, 7L, 8L, 9L, 10L, 11L, 12L, 13L, 14L, 15L, 16L,
17L, 18L, 19L, 20L, 21L, 22L, 23L, 24L, 25L, 26L, 27L, 28L, 29L,
30L, 1L, 2L, 3L, 4L, 5L, 6L, 7L, 8L, 9L, 10L, 11L, 17L, 18L,
19L, 20L, 28L, 29L, 30L, 1L, 2L), phase = c(6L, 6L, 6L, 6L, 7L,
7L, 7L, 7L, 7L, 8L, 8L, 6L, 7L, 7L, 7L, 7L, 7L, 7L, 7L, 8L, 8L,
8L, 8L, 6L, 7L, 7L, 7L, 7L, 7L, 7L, 7L, 7L, 7L, 8L, 8L, 8L, 7L,
7L, 6L, 5L, 5L, 5L, 5L, 5L, 7L, 8L, 8L, 8L, 8L, 8L, 1L, 1L, 1L,
1L, 1L, 1L, 2L, 2L, 2L, 3L, 3L, 3L, 3L, 4L, 4L, 4L, 4L, 5L, 5L,
1L, 1L, 1L, 1L, 2L, 2L, 2L, 3L, 3L, 3L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 2L, 2L, 5L, 5L, 5L, 6L, 6L, 6L, 6L, 6L, 7L, 7L, 7L,
7L, 7L, 7L, 7L, 7L, 7L, 7L, 7L, 7L, 7L, 7L, 7L, 8L, 8L, 8L, 8L,
8L, 8L, 1L, 1L, 1L, 2L, 2L, 2L, 2L, 3L, 3L, 3L, 3L, 3L, 3L, 4L,
4L, 4L, 4L, 4L, 4L, 4L, 4L, 5L, 5L, 7L, 7L, 7L, 6L, 6L, 6L, 6L,
6L, 7L, 7L, 7L, 8L, 8L, 1L, 7L, 7L, 8L, 8L, 8L, 8L, 7L, 7L, 7L,
7L, 7L, 7L, 7L, 4L, 4L, 5L, 5L, 5L, 5L, 5L, 6L, 6L, 7L, 7L, 7L,
8L, 8L, 1L, 1L, 2L, 2L, 2L, 3L, 3L, 3L, 3L, 3L, 7L, 7L, 8L, 8L,
8L, 8L, 8L, 1L, 1L, 1L, 1L, 1L, 2L, 2L, 1L, 2L, 2L, 2L, 2L, 2L,
3L, 3L, 3L, 1L, 1L, 1L, 2L, 2L, 4L, 4L, 4L, 4L, 4L, 5L, 5L, 5L,
5L, 5L, 5L, 6L, 6L, 6L, 7L, 7L, 7L, 8L, 8L, 8L, 8L, 1L, 1L, 1L,
1L, 1L, 2L, 2L, 2L, 3L, 3L, 4L, 4L, 5L, 5L, 5L, 6L, 6L, 6L, 7L,
8L, 1L, 1L, 1L, 1L), Rainfall = c(23.2, 35.575, 37.4, 6.425,
10.275, 3.05, 50.075, 23.05, 2, 1.4, 3.325, 12.84, 0.68, 7.78,
0, 12.88, 91.48, 41.08, 4.48, 0, 0.26, 0, 2.32, 13.25, 64.5,
21.55, 82.175, 33.725, 48.95, 3.8, 16.875, 4.7, 7.7, 48.7, 25.275,
3.625, 0.075, 2.5, 0, 3.525, 0.725, 0.2, 0.625, 0.25, 2.85, 6.15,
10.675, 41.975, 24.975, 127.775, 86.225, 19.95, 1.725, 11.125,
0.075, 1.775, 5.825, 5.975, 18.125, 6.425, 3.725, 11.75, 13.975,
0.1, 1, 0.725, 4.775, 0.225, 2.625, 0.575, 13.375, 0, 0, 0, 0,
0, 0, 0, 0.825, 0, 0, 0.45, 2.2, 4.5, 0, 0, 0.05, 0, 0, 0.225,
0.975, 5.375, 9.1, 27.3, 47.7, 31.475, 4.8, 11.45, 3.15, 2.3,
14.975, 77.25, 112.225, 69.675, 27.625, 43.65, 34.85, 47.325,
65.725, 83.825, 29.525, 29.95, 12.575, 0, 3.2, 30.95, 26.25,
4.15, 0.025, 0.5, 0.375, 5.15, 3.525, 0, 0.55, 2.025, 10.525,
0.4, 0.05, 9.225, 0.2, 5.25, 0, 0.5, 1.3, 3.175, 7.825, 1.15,
3.475, 0.4, 0.1, 0.2, 4.275, 3.45, 0.075, 3.95, 23.525, 6.2,
5.7, 6.1, 4.975, 2.7, 0.95, 0, 0, 1.55, 37.525, 53.8, 26.275,
101.25, 81.825, 26.05, 6.4, 6.75, 0.65, 2.475, 2.7, 1.45, 0.775,
0.2, 5.8, 0.36, 0.02, 0.8, 2.64, 3.44, 26.8, 17.98, 3.88, 33.48,
8.08, 15.8, 11.52, 21.44, 31.18, 13.06, 12.92, 0.24, 4.48, 9.4,
4.24, 4.36, 2.34, 5.72, 16.56, 10.96, 24.12, 2.96, 28.48, 14.72,
6.32, 0, 0.3, 3.46, 0.62, 0.76, 0.46, 17.22, 10.92, 1.96, 2.92,
0, 0, 3.44, 3.86, 2.88, 0.72, 0, 0, 0.06, 1.62, 28.74, 0.64,
0, 0, 1.18, 0.42, 5.46, 3.56, 0.44, 0.48, 4.9, 1.48, 2.7, 19.94,
0.4, 7.28, 29.56, 8.72, 1.5, 2.32, 2.42, 4.62, 1.2, 13.88, 9.76,
26.32, 11, 23.8, 10.08, 17.04, 47.6, 15.22, 4.06, 60.3, 71.2,
16.54, 0.44, 0.68, 0, 0.88, 0.1, 0.04, 0.34, 0, 0.36)), row.names = c(NA,
266L), class = "data.frame")
There are four columns in this data: year, month, day, phase, Rainfall
I would like to:
(1) Count the number of cases when rainfall is below 5 mm/day for at least 3 consecutive days (length >= 3)
(2) and the start (first occurrence) of these consecutive days should have a phase equal to 1.
I have the following incorrect script:
dat2<-subset(dat, phase == 1)
countruns = function(x){
rainfall_rle_df <- data.frame(unclass(rle(x$Rainfall < 5)))
nrow(subset(rainfall_rle_df, lengths >= 3 & values == TRUE))
}
countruns(dat2)
This script gives me a value of 5, but this is incorrect. The correct answer for the whole data should only be 4.
So the problem is that this script ignores the starting phase of the consecutive days.
For example the year 1996 in the data above has the following values:
year month day phase Rainfall
1996 6 28 8 0.100
1996 6 29 1 0.040
1996 6 30 1 0.340
1996 7 1 1 0.000
1996 7 2 1 0.360
The starting phase is phase 8 so this should not be counted.
The years that satisfied the above conditions should be 1983, 1984, 1991, and 1993.
Any suggestions on how I can do this in R? Or is there a better way to do this in R?
I have to put the script inside a function countruns().
I'll appreciate any help!
We create grouping column with rleid
(from data.table
) after grouping by 'year', then keep the groups where all
values of 'Rainfall' are less than 5, filter
the groups where the first
value of 'phase' is 1, and do the frequency count (n()
) in summarise
library(dplyr)
library(data.table)
dat %>%
# // create a Date column as it is easier to check for consecutive days
mutate(Date = as.Date(paste(year, mon, day, sep="-"))) %>%
# // create a group with year
group_by(year) %>%
# // add more groups with rleid on logical vector
group_by(grp = rleid(Rainfall < 5),
# // checks for difference between adjacent Date
# // if the difference is greater than 1, cumsum increments by 1
grp2 = cumsum(c(0, abs(diff(Date))) > 1), .add = TRUE) %>%
# // filter groups where all Rainfall is less than 5 and number of rows >= 3
filter(all(Rainfall < 5), n() >= 3) %>%
# // filter the groups where the first value of phase is 1
filter(first(phase) == 1) %>%
# // get the frequency count
summarise(n = n(), .groups = 'drop') %>%
# // remove the columns that are not needed
select(-grp, -grp2)
# A tibble: 4 x 2
# year n
# <int> <int>
#1 1983 8
#2 1984 11
#3 1991 6
#4 1993 5
If we want to wrap it in a function, the {{}}
does the enquo
+ !!
(when we pass unquoted variable names as arguments to function esp for the column names)
f1 <- function(data, year, month, day, rainfall) {
data %>%
mutate(Date = as.Date(str_c({{year}}, {{month}},
{{day}}, sep="-"))) %>%
group_by({{year}}) %>%
group_by(grp = rleid({{rainfall}} < 5),
grp2 = cumsum(c(0, abs(diff(Date))) > 1),
.add = TRUE) %>%
filter(all({{rainfall}} < 5), n() >=3 ) %>%
filter(first(phase) == 1) %>%
summarise(n = n(), .groups = 'drop') %>%
select(-grp, -grp2)
}
-apply the function
f1(dat, year, mon, day, Rainfall)
# A tibble: 4 x 2
# year n
# <int> <int>
#1 1983 8
#2 1984 11
#3 1991 6
#4 1993 5