I'm looking to filter the data in one dataframe (Target
) based of whether or not a value in another dataframe (Reference
) falls between two dates in the Target
dataframe.
For example, this would be my raw data (Target
):
ParticipantId | Date1 | Date2 |
---|---|---|
10001 | 1/02/2010 | 1/02/2015 |
10001 | 3/02/2016 | 1/02/2018 |
10001 | 1/02/2019 | 1/02/2020 |
10001 | 1/02/2021 | 1/02/2023 |
10002 | 1/02/2016 | 1/02/2018 |
10002 | 1/02/2019 | 1/02/2020 |
10002 | 1/02/2021 | 1/02/2023 |
10003 | 1/02/2013 | 1/02/2020 |
10003 | 1/02/2021 | 1/02/2023 |
And this would be my reference data (Reference
):
ParticipantId | DateA |
---|---|
10001 | 3/12/2013 |
10002 | 5/15/2022 |
10003 | 9/20/2022 |
What I would like is a filtered output of of Target
where DateA
in Reference
falls between Date1
and Date2
in Target
like so:
ParticipantId | Date1 | Date2 |
---|---|---|
10001 | 1/02/2010 | 1/02/2015 |
10002 | 1/02/2021 | 1/02/2023 |
10003 | 1/02/2021 | 1/02/2023 |
If anyone would be able to provide some input as to how this could be done, preferably without using a join as the real Target
is too big to load into memory using dplyr
with pipes, I would greatly appreciate it.
Code to make the dataframes may be found below but you may need to load the lubridate
library.
Reference <- structure(
list(
ParticipantId = 10001:10003,
DateA = c("3/12/2013", "5/15/2022", "9/20/2022")
),
class = "data.frame",
row.names = c(NA, -3L)
)
Target <- structure(
list(
ParticipantId = c(
10001L,
10001L,
10001L,
10001L,
10002L,
10002L,
10002L,
10003L,
10003L
),
Date1 = c(
"1/2/2010",
"1/2/2016",
"1/2/2019",
"1/2/2021",
"1/2/2016",
"1/2/2019",
"1/2/2021",
"1/2/2019",
"1/2/2021"
),
Date2 = c(
"1/2/2015",
"1/2/2018",
"1/2/2020",
"1/2/2023",
"1/2/2018",
"1/2/2020",
"1/2/2023",
"1/2/2020",
"1/2/2023"
)
),
class = "data.frame",
First you need to convert the Date columns from strings to dates. Since they are in the m-d-YYYY format, you can use
lubridate::mdy
to do that.
library(dplyr)
library(lubridate)
Reference <- structure(
list(
ParticipantId = 10001:10003,
DateA = c("3/12/2013", "5/15/2022", "9/20/2022")
),
class = "data.frame",
row.names = c(NA, -3L)
)
Target <- data.frame(
ParticipantId = c(10001L, 10001L,10001L,10001L,10002L,10002L,10002L,10003L,10003L),
Date1 = c("1/2/2010","1/2/2016","1/2/2019","1/2/2021","1/2/2016","1/2/2019","1/2/2021","1/2/2019","1/2/2021"),
Date2 = c("1/2/2015","1/2/2018","1/2/2020","1/2/2023","1/2/2018","1/2/2020","1/2/2023","1/2/2020","1/2/2023"))
Target$Date1 <- mdy(Target$Date1)
Target$Date2 <- mdy(Target$Date2)
Reference$DateA <- mdy(Reference$DateA)
Next, join the data so that you have DataA corresponding to each Partipicant_ID in your Target data.
joined_data <- left_join(Target, Reference, by = join_by(ParticipantId))
print(joined_data)
ParticipantId Date1 Date2 DateA
1 10001 2010-01-02 2015-01-02 2013-03-12
2 10001 2016-01-02 2018-01-02 2013-03-12
3 10001 2019-01-02 2020-01-02 2013-03-12
4 10001 2021-01-02 2023-01-02 2013-03-12
5 10002 2016-01-02 2018-01-02 2022-05-15
6 10002 2019-01-02 2020-01-02 2022-05-15
7 10002 2021-01-02 2023-01-02 2022-05-15
8 10003 2019-01-02 2020-01-02 2022-09-20
9 10003 2021-01-02 2023-01-02 2022-09-20
Lastly, use dplyr::filter and dplyr::between to keep only records where DateA is between Date1 and Date2
joined_data_between_dates_1_and_2 <- filter(joined_data, between(DateA, Date1, Date2))
print(joined_data_between_dates_1_and_2)
ParticipantId Date1 Date2 DateA
1 10001 2010-01-02 2015-01-02 2013-03-12
2 10002 2021-01-02 2023-01-02 2022-05-15
3 10003 2021-01-02 2023-01-02 2022-09-20
Oh and if you don't want DateA in your final output you could add one more step:
joined_data_between_dates_1_and_2 <- select(joined_data_between_dates_1_and_2, -DateA)
If you're a bit more familiar with dplyr and 'the pipe', you could do it in one simple sequence like this:
library(dplyr)
library(lubridate)
Reference <- structure(
list(
ParticipantId = 10001:10003,
DateA = c("3/12/2013", "5/15/2022", "9/20/2022")
),
class = "data.frame",
row.names = c(NA, -3L)
)
Target <- data.frame(
ParticipantId = c(10001L, 10001L,10001L,10001L,10002L,10002L,10002L,10003L,10003L),
Date1 = c("1/2/2010","1/2/2016","1/2/2019","1/2/2021","1/2/2016","1/2/2019","1/2/2021","1/2/2019","1/2/2021"),
Date2 = c("1/2/2015","1/2/2018","1/2/2020","1/2/2023","1/2/2018","1/2/2020","1/2/2023","1/2/2020","1/2/2023"))
Reference <- Reference |> mutate(DateA = mdy(DateA))
joined_data_between_dates_1_and_2 <- Target |>
mutate(Date1 = mdy(Date1), Date2 = mdy(Date2)) |>
left_join(Reference, by = join_by(ParticipantId)) |>
filter(between(DateA, Date1, Date2)) |>
select(-DateA)
If Target and Reference are very big, you can use the R arrow package and the paruqet format to speed things up and only load the full results into memory once it's finished.
library(dplyr)
library(lubridate)
library(arrow)
set.seed(123)
# Function to generate 'n' random dates between a start and end date
generate_random_dates <- function(n, start_date, end_date) {
start <- as.Date(start_date)
end <- as.Date(end_date)
random_dates <- start + runif(n, 0, as.numeric(difftime(end, start, units = "days")))
return(random_dates)
}
#Generate Reference data.frame with 10 million ParticipantIds
Reference <- data.frame(
ParticipantId = 1:(10^7),
DateA = generate_random_dates(10^7, "2000-01-01", "2024-12-31")
)
#Generate Target data.frame with 100 million rows
Target <- data.frame(
ParticipantId = sample(1:(10^7), 10^8, replace = TRUE),
Date1 = generate_random_dates(10^8, "2000-01-01", "2024-12-31"),
Date2 = generate_random_dates(10^8, "2000-01-01", "2024-12-31")
)
#Takes about 12-13 seconds on my computer.
(Removed step to concert string dates to Dates
# because the data.frames I made already are in that format).
system.time(Target |>
left_join(Reference, by = join_by(ParticipantId)) |>
filter(between(DateA, Date1, Date2)) |>
select(-DateA))
user system elapsed
12.19 0.63 12.82
#Save Reference and Target as arrow parquet files.
write_dataset(Reference, path = "reference", format = "parquet")
write_dataset(Target, path = "target", format = "parquet")
#Load arrow parquet files
target_arrow <- open_dataset("target")
reference_arrow <- open_dataset("reference")
system.time(target_arrow |>
left_join(reference_arrow, by = join_by(ParticipantId)) |>
filter(between(DateA, Date1, Date2)) |>
select(-DateA) |>
collect())
#About twice as fast
user system elapsed
7.58 0.37 6.29