Here's an example df:
df <- data.frame(
id= c(1,1,1,1,2,2,2,2),
admit_dt = c("2022-01-01", "" ,"2022-05-05","", "", "2022-09-03", "", ""),
discharge_dt = c("2022-01-04","", "2022-05-06","","", "2022-09-04", "", ""),
first_dt = c("2022-01-01", "2022-01-02", "2022-05-05", "2022-06-09", "2022-01-03", "2022-09-03", "2022-10-10", "2022-09-04")
)
I need to filter out rows where for each unique id, there's a date value in first_dt that is between (inclusive) the admit_dt and discharge_dt in any other row for the same id. So I want to keep row 1 in the above df, but get rid of row 2, since the value in first_dt is between the value in admit_dt and discharge_dt in row 1. I'd want to end up with this df:
id | admit_dt | discharge_dt | first_dt |
---|---|---|---|
1 | 2022-01-01 | 2022-01-04 | 2022-01-01 |
1 | 2022-05-05 | 2022-05-06 | 2022-05-05 |
1 | 2022-06-09 | ||
2 | 2022-01-03 | ||
2 | 2022-09-03 | 2022-09-04 | 2022-09-03 |
2 | 2022-10-10 |
Thank you for any suggestions!
library(dplyr)
df <- tibble(
id= c(1,1,1,1,2,2,2,2),
admit_dt = c("2022-01-01", "" ,"2022-05-05","", "", "2022-09-03", "", ""),
discharge_dt = c("2022-01-04","", "2022-05-06","","", "2022-09-04", "", ""),
first_dt = c("2022-01-01", "2022-01-02", "2022-05-05", "2022-06-09", "2022-01-03", "2022-09-03", "2022-10-10", "2022-09-04")
)
goal_df <- tibble::tribble(
~id, ~admit_dt, ~discharge_dt, ~first_dt,
1, "2022-01-01", "2022-01-04", "2022-01-01",
1, "2022-05-05", "2022-05-06", "2022-05-05",
1, "", "", "2022-06-09",
2, "", "", "2022-01-03",
2, "2022-09-03", "2022-09-04", "2022-09-03",
2, "", "", "2022-10-10"
)
# filter out rows where for each unique id,
# there's a date value in first_dt
# that in any other row for the same id is between (inclusive) the admit_dt and discharge_dt
# add rownumbers
dfr <- df |> mutate(rn=row_number())
# get a set of data which is all the original rows, joined to their same id, but different rownumber counterparts
df2 <- left_join(dfr,
dfr |> rename_all(\(x)paste0(x, "_2")),
by = join_by(id == id_2, between(
first_dt,
admit_dt_2,
discharge_dt_2
))
) |> mutate(dropme= rn_2 != rn)
df3 <- df2 |> filter(!dropme | is.na(dropme))
cleanedup <- select(df3,
id,admit_dt,
discharge_dt,
first_dt)
identical(cleanedup,goal_df)