df %>%
select(ID, Period, Date, Event) %>%
mutate_if(is.numeric, as.character) %>%
mutate(Date = lubridate::mdy(Date)) %>%
group_by(ID, Period, Event) %>%
#filter(n() > 1) %>%
arrange(ID, Period, Event, Date) %>%
mutate(
lead_1 = Date - lead(Date),
lag_1 = Date - lag(Date)) %>%
filter(is.na(lag_1) | lag_1 > 5)
df <- structure(list(ID = c(1177, 1177, 1177, 1177, 1177, 1177), Period = c(0, 0, 0, 0, 0, 0), Date = c("04/16/20", "04/17/20", "04/20/20", "04/20/20", "04/23/20", "04/25/20"), Event = c("M", "M", "M", "M", "M", "M")), row.names = c(NA, -6L),class = c("tbl_df", "tbl", "data.frame"))
I would like to filter events that are within a five day lag of the last event recording, but the reference for lag is to the previous row itself instead of the last row which should be kept
Here is an example of what the current output of the code is Cur Output and what it should be Correct Output
Taking the lag time, I applied a rolling cumsum
that gets reset to zero every time the threshold of > 5 days have passed since the last time the threshold was reached. I extended the data by three points to show better that it works. It uses a for
loop and is not in pipes with %>%
but I think it gives the desired output. The df$out
column can then be filtered, and df$cs
removed if not needed.
library(tidyverse)
ID <- c(1177, 1177, 1177, 1177, 1177,1177,1177, 1177)
Period <- c(0, 0, 0, 0, 0, 0, 0, 0)
Date <- c("04/16/2020", "04/17/2020", "04/20/2020", "04/20/2020", "04/23/2020","04/25/2020","04/27/2020","04/29/2020")
Event <- c("M", "M", "M", "M", "M", "M", "M", "M")
df <- data.frame(ID,Period,Date,Event)
df %>%
select(ID, Period, Date, Event) %>%
mutate_if(is.numeric, as.character) %>%
mutate(Date = lubridate::mdy(Date)) %>%
group_by(ID, Period, Event) %>%
#filter(n() > 1) %>%
arrange(ID, Period, Event, Date) %>%
mutate(
lead_1 = Date - lead(Date),
lag_1 = Date - lag(Date)) %>%
mutate(lag_1 = as.numeric(lag_1)) %>%
mutate(cs = cumsum(ifelse(is.na(lag_1), 0, lag_1))) -> df
for(i in 1:length(df$lag_1)){
remaining <- c((i+1):length(df$lag_1))
if(df$cs[i] > 5){
df$cs[i] <- 0
if(i < length(df$lag_1)){
df$cs[remaining] <- cumsum(ifelse(is.na(df$lag_1[remaining]), 0, df$lag_1[remaining]))
}
}
}
df$out <- as.numeric(df$cs==0)
> df
# # A tibble: 8 × 8
# # Groups: ID, Period, Event [1]
# ID Period Date Event lead_1 lag_1 cs out
# <chr> <chr> <date> <chr> <drtn> <dbl> <dbl> <dbl>
# 1 1177 0 2020-04-16 M -1 days NA 0 1
# 2 1177 0 2020-04-17 M -3 days 1 1 0
# 3 1177 0 2020-04-20 M 0 days 3 4 0
# 4 1177 0 2020-04-20 M -3 days 0 4 0
# 5 1177 0 2020-04-23 M -2 days 3 0 1
# 6 1177 0 2020-04-25 M -2 days 2 2 0
# 7 1177 0 2020-04-27 M -2 days 2 4 0
# 8 1177 0 2020-04-29 M NA days 2 0 1