Search code examples
rdplyrfilter

R filter out rows based on value in different row


Here's an example df:

df <- data.frame(
  id= c(1,1,1,1,2,2,2,2),
  admit_dt = c("2022-01-01", "" ,"2022-05-05","", "", "2022-09-03", "", ""),
  discharge_dt = c("2022-01-04","", "2022-05-06","","", "2022-09-04", "", ""),
  first_dt = c("2022-01-01", "2022-01-02", "2022-05-05", "2022-06-09", "2022-01-03", "2022-09-03", "2022-10-10", "2022-09-04")
)

I need to filter out rows where for each unique id, there's a date value in first_dt that is between (inclusive) the admit_dt and discharge_dt in any other row for the same id. So I want to keep row 1 in the above df, but get rid of row 2, since the value in first_dt is between the value in admit_dt and discharge_dt in row 1. I'd want to end up with this df:

id admit_dt discharge_dt first_dt
1 2022-01-01 2022-01-04 2022-01-01
1 2022-05-05 2022-05-06 2022-05-05
1 2022-06-09
2 2022-01-03
2 2022-09-03 2022-09-04 2022-09-03
2 2022-10-10

Thank you for any suggestions!


Solution

  • library(dplyr)
    
    df <- tibble(
      id= c(1,1,1,1,2,2,2,2),
      admit_dt = c("2022-01-01", "" ,"2022-05-05","", "", "2022-09-03", "", ""),
      discharge_dt = c("2022-01-04","", "2022-05-06","","", "2022-09-04", "", ""),
      first_dt = c("2022-01-01", "2022-01-02", "2022-05-05", "2022-06-09", "2022-01-03", "2022-09-03", "2022-10-10", "2022-09-04")
    )
    
    goal_df <- tibble::tribble(
      ~id,    ~admit_dt, ~discharge_dt,    ~first_dt,
      1, "2022-01-01",  "2022-01-04", "2022-01-01",
      1, "2022-05-05",  "2022-05-06", "2022-05-05",
      1,           "",             "", "2022-06-09",
      2,            "",             "", "2022-01-03",
      2, "2022-09-03",  "2022-09-04", "2022-09-03",
      2,            "",             "", "2022-10-10"
    )
    
    
    
    
    # filter out rows where for each unique id, 
    # there's a date value in first_dt 
    # that  in any other row for the same id is between (inclusive) the admit_dt and discharge_dt 
    
    # add rownumbers
    dfr <- df |> mutate(rn=row_number())
    
    # get a set of data which is all the original rows, joined to their same id, but different rownumber counterparts
    
    df2 <- left_join(dfr,
      dfr |> rename_all(\(x)paste0(x, "_2")),
      by = join_by(id == id_2, between(
        first_dt,
        admit_dt_2,
        discharge_dt_2
      ))
    ) |> mutate(dropme= rn_2 != rn)
    
    df3 <- df2 |> filter(!dropme | is.na(dropme))
    
    cleanedup <- select(df3,
                        id,admit_dt,
                        discharge_dt,
                        first_dt)
    
    identical(cleanedup,goal_df)