Search code examples
rdplyrfilter

Subject-level filtering within a date range across two dataframes using dplyr


I'm looking to filter the data in one dataframe (Target) based of whether or not a value in another dataframe (Reference) falls between two dates in the Target dataframe.

For example, this would be my raw data (Target):

ParticipantId Date1 Date2
10001 1/02/2010 1/02/2015
10001 3/02/2016 1/02/2018
10001 1/02/2019 1/02/2020
10001 1/02/2021 1/02/2023
10002 1/02/2016 1/02/2018
10002 1/02/2019 1/02/2020
10002 1/02/2021 1/02/2023
10003 1/02/2013 1/02/2020
10003 1/02/2021 1/02/2023

And this would be my reference data (Reference):

ParticipantId DateA
10001 3/12/2013
10002 5/15/2022
10003 9/20/2022

What I would like is a filtered output of of Target where DateA in Reference falls between Date1 and Date2 in Target like so:

ParticipantId Date1 Date2
10001 1/02/2010 1/02/2015
10002 1/02/2021 1/02/2023
10003 1/02/2021 1/02/2023

If anyone would be able to provide some input as to how this could be done, preferably without using a join as the real Target is too big to load into memory using dplyr with pipes, I would greatly appreciate it.

Code to make the dataframes may be found below but you may need to load the lubridate library.

Reference <- structure(
  list(
    ParticipantId = 10001:10003,
    DateA = c("3/12/2013", "5/15/2022", "9/20/2022")
  ),
  class = "data.frame",
  row.names = c(NA, -3L)
)

Target <- structure(
  list(
    ParticipantId = c(
      10001L,
      10001L,
      10001L,
      10001L,
      10002L,
      10002L,
      10002L,
      10003L,
      10003L
    ),
    Date1 = c(
      "1/2/2010",
      "1/2/2016",
      "1/2/2019",
      "1/2/2021",
      "1/2/2016",
      "1/2/2019",
      "1/2/2021",
      "1/2/2019",
      "1/2/2021"
    ),
    Date2 = c(
      "1/2/2015",
      "1/2/2018",
      "1/2/2020",
      "1/2/2023",
      "1/2/2018",
      "1/2/2020",
      "1/2/2023",
      "1/2/2020",
      "1/2/2023"
    )
  ),
  class = "data.frame",

Solution

  • First you need to convert the Date columns from strings to dates. Since they are in the m-d-YYYY format, you can use lubridate::mdy to do that.

    library(dplyr)
    library(lubridate)
    Reference <- structure(
      list(
        ParticipantId = 10001:10003,
        DateA = c("3/12/2013", "5/15/2022", "9/20/2022")
      ),
      class = "data.frame",
      row.names = c(NA, -3L)
    )
    
    Target <- data.frame(
        ParticipantId = c(10001L, 10001L,10001L,10001L,10002L,10002L,10002L,10003L,10003L),
        Date1 = c("1/2/2010","1/2/2016","1/2/2019","1/2/2021","1/2/2016","1/2/2019","1/2/2021","1/2/2019","1/2/2021"),
        Date2 = c("1/2/2015","1/2/2018","1/2/2020","1/2/2023","1/2/2018","1/2/2020","1/2/2023","1/2/2020","1/2/2023"))
    
    Target$Date1 <- mdy(Target$Date1)
    Target$Date2 <- mdy(Target$Date2)
    Reference$DateA <- mdy(Reference$DateA)
    

    Next, join the data so that you have DataA corresponding to each Partipicant_ID in your Target data.

    joined_data <- left_join(Target, Reference, by = join_by(ParticipantId)) 
    print(joined_data)
      ParticipantId      Date1      Date2      DateA
    1         10001 2010-01-02 2015-01-02 2013-03-12
    2         10001 2016-01-02 2018-01-02 2013-03-12
    3         10001 2019-01-02 2020-01-02 2013-03-12
    4         10001 2021-01-02 2023-01-02 2013-03-12
    5         10002 2016-01-02 2018-01-02 2022-05-15
    6         10002 2019-01-02 2020-01-02 2022-05-15
    7         10002 2021-01-02 2023-01-02 2022-05-15
    8         10003 2019-01-02 2020-01-02 2022-09-20
    9         10003 2021-01-02 2023-01-02 2022-09-20
    

    Lastly, use dplyr::filter and dplyr::between to keep only records where DateA is between Date1 and Date2

    joined_data_between_dates_1_and_2 <- filter(joined_data, between(DateA, Date1, Date2)) 
    print(joined_data_between_dates_1_and_2)
     ParticipantId      Date1      Date2      DateA
    1         10001 2010-01-02 2015-01-02 2013-03-12
    2         10002 2021-01-02 2023-01-02 2022-05-15
    3         10003 2021-01-02 2023-01-02 2022-09-20
    

    Oh and if you don't want DateA in your final output you could add one more step:

    joined_data_between_dates_1_and_2 <- select(joined_data_between_dates_1_and_2, -DateA)
    

    If you're a bit more familiar with dplyr and 'the pipe', you could do it in one simple sequence like this:

    library(dplyr)
    library(lubridate)
    Reference <- structure(
      list(
        ParticipantId = 10001:10003,
        DateA = c("3/12/2013", "5/15/2022", "9/20/2022")
      ),
      class = "data.frame",
      row.names = c(NA, -3L)
    )
    
    Target <- data.frame(
        ParticipantId = c(10001L, 10001L,10001L,10001L,10002L,10002L,10002L,10003L,10003L),
        Date1 = c("1/2/2010","1/2/2016","1/2/2019","1/2/2021","1/2/2016","1/2/2019","1/2/2021","1/2/2019","1/2/2021"),
        Date2 = c("1/2/2015","1/2/2018","1/2/2020","1/2/2023","1/2/2018","1/2/2020","1/2/2023","1/2/2020","1/2/2023"))
    
    Reference <- Reference |> mutate(DateA = mdy(DateA))
    
    joined_data_between_dates_1_and_2 <- Target |> 
                                         mutate(Date1 = mdy(Date1), Date2 = mdy(Date2)) |> 
                                         left_join(Reference, by = join_by(ParticipantId)) |> 
                                         filter(between(DateA, Date1, Date2)) |> 
                                         select(-DateA)
    

    If Target and Reference are very big, you can use the R arrow package and the paruqet format to speed things up and only load the full results into memory once it's finished.

    library(dplyr)
    library(lubridate)
    library(arrow)
    set.seed(123) 
    # Function to generate 'n' random dates between a start and end date
    generate_random_dates <- function(n, start_date, end_date) {
      start <- as.Date(start_date)
      end <- as.Date(end_date)
      random_dates <- start + runif(n, 0, as.numeric(difftime(end, start, units = "days")))
      return(random_dates)
    }
    
    
    
    #Generate Reference data.frame with 10 million ParticipantIds
    Reference <- data.frame(
      ParticipantId = 1:(10^7),
      DateA = generate_random_dates(10^7, "2000-01-01", "2024-12-31")
    )
    
    #Generate Target data.frame with 100 million rows
    Target <- data.frame(
      ParticipantId = sample(1:(10^7), 10^8, replace = TRUE),
      Date1 = generate_random_dates(10^8, "2000-01-01", "2024-12-31"),
      Date2 = generate_random_dates(10^8, "2000-01-01", "2024-12-31")
    )
    
    #Takes about 12-13 seconds on my computer. 
    (Removed step to concert string dates to Dates 
    # because the data.frames I made already are in that format).
    system.time(Target |> 
      left_join(Reference, by = join_by(ParticipantId)) |> 
      filter(between(DateA, Date1, Date2)) |> 
      select(-DateA))
    
    
    
    user  system elapsed 
    12.19    0.63   12.82
    
    
    #Save Reference and Target as arrow parquet files.
    write_dataset(Reference, path = "reference", format = "parquet")
    write_dataset(Target, path = "target", format = "parquet")
    
    #Load arrow parquet files
    target_arrow <- open_dataset("target")
    reference_arrow <- open_dataset("reference")
    
    
    system.time(target_arrow |> 
                  left_join(reference_arrow, by = join_by(ParticipantId)) |> 
                  filter(between(DateA, Date1, Date2)) |> 
                  select(-DateA) |> 
                  collect())
    
    #About twice as fast
    user  system elapsed 
    7.58    0.37    6.29