Subject-level filtering within a date range across two dataframes using dplyr

I'm looking to filter the data in one dataframe (Target) based of whether or not a value in another dataframe (Reference) falls between two dates in the Target dataframe.

For example, this would be my raw data (Target):

ParticipantId	Date1	Date2
10001	1/02/2010	1/02/2015
10001	3/02/2016	1/02/2018
10001	1/02/2019	1/02/2020
10001	1/02/2021	1/02/2023
10002	1/02/2016	1/02/2018
10002	1/02/2019	1/02/2020
10002	1/02/2021	1/02/2023
10003	1/02/2013	1/02/2020
10003	1/02/2021	1/02/2023

And this would be my reference data (Reference):

ParticipantId	DateA
10001	3/12/2013
10002	5/15/2022
10003	9/20/2022

What I would like is a filtered output of of Target where DateA in Reference falls between Date1 and Date2 in Target like so:

ParticipantId	Date1	Date2
10001	1/02/2010	1/02/2015
10002	1/02/2021	1/02/2023
10003	1/02/2021	1/02/2023

If anyone would be able to provide some input as to how this could be done, preferably without using a join as the real Target is too big to load into memory using dplyr with pipes, I would greatly appreciate it.

Code to make the dataframes may be found below but you may need to load the lubridate library.

Reference <- structure(
  list(
    ParticipantId = 10001:10003,
    DateA = c("3/12/2013", "5/15/2022", "9/20/2022")
  ),
  class = "data.frame",
  row.names = c(NA, -3L)
)

Target <- structure(
  list(
    ParticipantId = c(
      10001L,
      10001L,
      10001L,
      10001L,
      10002L,
      10002L,
      10002L,
      10003L,
      10003L
    ),
    Date1 = c(
      "1/2/2010",
      "1/2/2016",
      "1/2/2019",
      "1/2/2021",
      "1/2/2016",
      "1/2/2019",
      "1/2/2021",
      "1/2/2019",
      "1/2/2021"
    ),
    Date2 = c(
      "1/2/2015",
      "1/2/2018",
      "1/2/2020",
      "1/2/2023",
      "1/2/2018",
      "1/2/2020",
      "1/2/2023",
      "1/2/2020",
      "1/2/2023"
    )
  ),
  class = "data.frame",

Solution

First you need to convert the Date columns from strings to dates. Since they are in the m-d-YYYY format, you can use lubridate::mdy to do that.

library(dplyr)
library(lubridate)
Reference <- structure(
  list(
    ParticipantId = 10001:10003,
    DateA = c("3/12/2013", "5/15/2022", "9/20/2022")
  ),
  class = "data.frame",
  row.names = c(NA, -3L)
)

Target <- data.frame(
    ParticipantId = c(10001L, 10001L,10001L,10001L,10002L,10002L,10002L,10003L,10003L),
    Date1 = c("1/2/2010","1/2/2016","1/2/2019","1/2/2021","1/2/2016","1/2/2019","1/2/2021","1/2/2019","1/2/2021"),
    Date2 = c("1/2/2015","1/2/2018","1/2/2020","1/2/2023","1/2/2018","1/2/2020","1/2/2023","1/2/2020","1/2/2023"))

Target$Date1 <- mdy(Target$Date1)
Target$Date2 <- mdy(Target$Date2)
Reference$DateA <- mdy(Reference$DateA)

Next, join the data so that you have DataA corresponding to each Partipicant_ID in your Target data.

joined_data <- left_join(Target, Reference, by = join_by(ParticipantId)) 
print(joined_data)
  ParticipantId      Date1      Date2      DateA
1         10001 2010-01-02 2015-01-02 2013-03-12
2         10001 2016-01-02 2018-01-02 2013-03-12
3         10001 2019-01-02 2020-01-02 2013-03-12
4         10001 2021-01-02 2023-01-02 2013-03-12
5         10002 2016-01-02 2018-01-02 2022-05-15
6         10002 2019-01-02 2020-01-02 2022-05-15
7         10002 2021-01-02 2023-01-02 2022-05-15
8         10003 2019-01-02 2020-01-02 2022-09-20
9         10003 2021-01-02 2023-01-02 2022-09-20

Lastly, use dplyr::filter and dplyr::between to keep only records where DateA is between Date1 and Date2

joined_data_between_dates_1_and_2 <- filter(joined_data, between(DateA, Date1, Date2)) 
print(joined_data_between_dates_1_and_2)
 ParticipantId      Date1      Date2      DateA
1         10001 2010-01-02 2015-01-02 2013-03-12
2         10002 2021-01-02 2023-01-02 2022-05-15
3         10003 2021-01-02 2023-01-02 2022-09-20

Oh and if you don't want DateA in your final output you could add one more step:

joined_data_between_dates_1_and_2 <- select(joined_data_between_dates_1_and_2, -DateA)

If you're a bit more familiar with dplyr and 'the pipe', you could do it in one simple sequence like this:

library(dplyr)
library(lubridate)
Reference <- structure(
  list(
    ParticipantId = 10001:10003,
    DateA = c("3/12/2013", "5/15/2022", "9/20/2022")
  ),
  class = "data.frame",
  row.names = c(NA, -3L)
)

Target <- data.frame(
    ParticipantId = c(10001L, 10001L,10001L,10001L,10002L,10002L,10002L,10003L,10003L),
    Date1 = c("1/2/2010","1/2/2016","1/2/2019","1/2/2021","1/2/2016","1/2/2019","1/2/2021","1/2/2019","1/2/2021"),
    Date2 = c("1/2/2015","1/2/2018","1/2/2020","1/2/2023","1/2/2018","1/2/2020","1/2/2023","1/2/2020","1/2/2023"))

Reference <- Reference |> mutate(DateA = mdy(DateA))

joined_data_between_dates_1_and_2 <- Target |> 
                                     mutate(Date1 = mdy(Date1), Date2 = mdy(Date2)) |> 
                                     left_join(Reference, by = join_by(ParticipantId)) |> 
                                     filter(between(DateA, Date1, Date2)) |> 
                                     select(-DateA)

If Target and Reference are very big, you can use the R arrow package and the paruqet format to speed things up and only load the full results into memory once it's finished.

library(dplyr)
library(lubridate)
library(arrow)
set.seed(123) 
# Function to generate 'n' random dates between a start and end date
generate_random_dates <- function(n, start_date, end_date) {
  start <- as.Date(start_date)
  end <- as.Date(end_date)
  random_dates <- start + runif(n, 0, as.numeric(difftime(end, start, units = "days")))
  return(random_dates)
}



#Generate Reference data.frame with 10 million ParticipantIds
Reference <- data.frame(
  ParticipantId = 1:(10^7),
  DateA = generate_random_dates(10^7, "2000-01-01", "2024-12-31")
)

#Generate Target data.frame with 100 million rows
Target <- data.frame(
  ParticipantId = sample(1:(10^7), 10^8, replace = TRUE),
  Date1 = generate_random_dates(10^8, "2000-01-01", "2024-12-31"),
  Date2 = generate_random_dates(10^8, "2000-01-01", "2024-12-31")
)

#Takes about 12-13 seconds on my computer. 
(Removed step to concert string dates to Dates 
# because the data.frames I made already are in that format).
system.time(Target |> 
  left_join(Reference, by = join_by(ParticipantId)) |> 
  filter(between(DateA, Date1, Date2)) |> 
  select(-DateA))



user  system elapsed 
12.19    0.63   12.82


#Save Reference and Target as arrow parquet files.
write_dataset(Reference, path = "reference", format = "parquet")
write_dataset(Target, path = "target", format = "parquet")

#Load arrow parquet files
target_arrow <- open_dataset("target")
reference_arrow <- open_dataset("reference")


system.time(target_arrow |> 
              left_join(reference_arrow, by = join_by(ParticipantId)) |> 
              filter(between(DateA, Date1, Date2)) |> 
              select(-DateA) |> 
              collect())

#About twice as fast
user  system elapsed 
7.58    0.37    6.29