I've written the following R code, but it takes a long time to run on large datasets (1 million records). Is there a solution that would be faster?
# Define start and end dates
start <- as.Date("2023-01-01")
end <- as.Date("2023-03-31")
create_dataframe <- function(input){
createcolumns <- function(row) {
#Create date range of report
report_range = seq.Date(from = start,
to = end,
by = "month")
#Create date range of caseprsn2
input_row_range = seq.Date(from = input[row,3],
to = input[row,4],
by = "month")
start_day = as.Date(intersect(as.character(report_range),as.character(input_row_range)))
end_day = ceiling_date(start_day, "month") - days(1)
df = data.frame(start_day, end_day)
replicated_rows <- as.data.frame(lapply(input[row,], rep, nrow(df)))
cbind(df, replicated_rows)
} #helper func to create BegMo & EndMo
output <- data.frame() # creates an empty data frame
output <- do.call(rbind, lapply(1:nrow(input), createcolumns))
return(output)
}
The function createcolumns
is run on every row in the input dataframe to create a new dataframe. The logic is that based on the report_range, the function checks to see the overlap of report_range and input_row_range. One new row is created for every overlap that exists. All other columns are replicated.
Code to reproduce input dataset:
example_df = structure(list(CaseNumber = c("00000001", "00000001", "00000002",
"00000002", "00000003", "00000004", "00000005", "00000006", "00000006",
"00000006"), ProgramID = c("FS", "FS", "FS", "FS", "FS", "FS",
"FS", "FS", "FS", "FS"), BenefitMonth = structure(c(19358, 19389,
19297, 19417, 19358, 19358, 19358, 19358, 19389, 19417), class = "Date"),
DerivedEndDate = structure(c(19388, 19508, 19416, 19447,
19508, 19388, 19508, 19388, 19416, 19508), class = "Date")), class = "data.frame", row.names = c(NA,
-10L))
Code to reproduce output dataset:
example_df_transformed = structure(list(start_day = structure(c(19358, 19389, 19417, 19358,
19389, 19417, 19358, 19389, 19417, 19358, 19358, 19389, 19417,
19358, 19389, 19417), class = "Date"), end_day = structure(c(19388,
19416, 19447, 19388, 19416, 19447, 19388, 19416, 19447, 19388,
19388, 19416, 19447, 19388, 19416, 19447), class = "Date"), CaseNumber = c("00000001",
"00000001", "00000001", "00000002", "00000002", "00000002", "00000003",
"00000003", "00000003", "00000004", "00000005", "00000005", "00000005",
"00000006", "00000006", "00000006"), ProgramID = c("FS", "FS",
"FS", "FS", "FS", "FS", "FS", "FS", "FS", "FS", "FS", "FS", "FS",
"FS", "FS", "FS"), BenefitMonth = structure(c(19358, 19389, 19389,
19297, 19297, 19417, 19358, 19358, 19358, 19358, 19358, 19358,
19358, 19358, 19389, 19417), class = "Date"), DerivedEndDate = structure(c(19388,
19508, 19508, 19416, 19416, 19447, 19508, 19508, 19508, 19388,
19508, 19508, 19508, 19388, 19416, 19508), class = "Date")), row.names = c(NA,
-16L), class = "data.frame")
Not sure what to think of that for-loop, while it doesn't add anything, for 1 million records it generates 1e6(from lapply
) * 1e6(from for
) iterations and that indeed takes a long time.
Here's a dplyr-based attempt, performance-wise it probably makes sense to go with data.table
, but with few minutes for 1m records it should still be usable. Test runs with single-shot tic-toc timing included.
library(dplyr)
library(tidyr)
library(purrr)
library(lubridate)
# Define start and end dates
start <- as.Date("2023-01-01")
end <- as.Date("2023-03-31")
create_dataframe <- function(df_, start_, end_){
report_range = seq.Date(from = start_,
to = end_,
by = "month")
# rowwise for intersect(seq.Date( ... ))
df_ %>% rowwise() %>%
# add intersection of "report_range" and "input_row" to a list-column
mutate(start_day = list(intersect(seq.Date(from = BenefitMonth,
to = DerivedEndDate,
by = "month"),
report_range)), .before = 1) %>%
ungroup() %>%
# unnest_longer no a list column, each line will be repeated for each
# item in a list
unnest_longer(start_day) %>%
mutate(
# unnest somehow drops the correct type, set it back
start_day = as.Date(start_day, origin = "1970-01-01"),
end_day = ceiling_date(start_day, "month") - days(1), .after = 1)
}
# test with 10x1000 rows
example_xl <- rep_len(list(example_df),1000) %>% list_rbind()
nrow(example_xl)
#> [1] 10000
tictoc::tic("updated, 10 000 rows")
out1 <- create_dataframe(example_xl, start, end)
tictoc::toc()
#> updated, 10 000 rows: 1.56 sec elapsed
out1[1:16,]
#> # A tibble: 16 × 6
#> start_day end_day CaseNumber ProgramID BenefitMonth DerivedEndDate
#> <date> <date> <chr> <chr> <date> <date>
#> 1 2023-01-01 2023-01-31 00000001 FS 2023-01-01 2023-01-31
#> 2 2023-02-01 2023-02-28 00000001 FS 2023-02-01 2023-05-31
#> 3 2023-03-01 2023-03-31 00000001 FS 2023-02-01 2023-05-31
#> 4 2023-01-01 2023-01-31 00000002 FS 2022-11-01 2023-02-28
#> 5 2023-02-01 2023-02-28 00000002 FS 2022-11-01 2023-02-28
#> 6 2023-03-01 2023-03-31 00000002 FS 2023-03-01 2023-03-31
#> 7 2023-01-01 2023-01-31 00000003 FS 2023-01-01 2023-05-31
#> 8 2023-02-01 2023-02-28 00000003 FS 2023-01-01 2023-05-31
#> 9 2023-03-01 2023-03-31 00000003 FS 2023-01-01 2023-05-31
#> 10 2023-01-01 2023-01-31 00000004 FS 2023-01-01 2023-01-31
#> 11 2023-01-01 2023-01-31 00000005 FS 2023-01-01 2023-05-31
#> 12 2023-02-01 2023-02-28 00000005 FS 2023-01-01 2023-05-31
#> 13 2023-03-01 2023-03-31 00000005 FS 2023-01-01 2023-05-31
#> 14 2023-01-01 2023-01-31 00000006 FS 2023-01-01 2023-01-31
#> 15 2023-02-01 2023-02-28 00000006 FS 2023-02-01 2023-02-28
#> 16 2023-03-01 2023-03-31 00000006 FS 2023-03-01 2023-05-31
tictoc::tic("original, 10 000 rows")
# (with outer loop disabled)
out2 <- create_dataframe_orig(example_xl)
tictoc::toc()
#> original, 10 000 rows: 31.39 sec elapsed
# test with 10x100000 rows
example_xl <- rep_len(list(example_df),1e5) %>% list_rbind()
nrow(example_xl)
#> [1] 1000000
tictoc::tic("updated, 1000 000 rows")
out3 <- create_dataframe(example_xl, start, end)
tictoc::toc()
#> updated, 1000 000 rows: 154.61 sec elapsed
nrow(out3)
#> [1] 1600000
Input data & function from question with for-loop disabled:
example_df = structure(list(CaseNumber = c("00000001", "00000001", "00000002",
"00000002", "00000003", "00000004", "00000005", "00000006", "00000006",
"00000006"), ProgramID = c("FS", "FS", "FS", "FS", "FS", "FS",
"FS", "FS", "FS", "FS"), BenefitMonth = structure(c(19358, 19389,
19297, 19417, 19358, 19358, 19358, 19358, 19389, 19417), class = "Date"),
DerivedEndDate = structure(c(19388, 19508, 19416, 19447,
19508, 19388, 19508, 19388, 19416, 19508), class = "Date")), class = "data.frame", row.names = c(NA,
-10L))
example_df_transformed = structure(list(start_day = structure(c(19358, 19389, 19417, 19358,
19389, 19417, 19358, 19389, 19417, 19358, 19358, 19389, 19417,
19358, 19389, 19417), class = "Date"), end_day = structure(c(19388,
19416, 19447, 19388, 19416, 19447, 19388, 19416, 19447, 19388,
19388, 19416, 19447, 19388, 19416, 19447), class = "Date"), CaseNumber = c("00000001",
"00000001", "00000001", "00000002", "00000002", "00000002", "00000003",
"00000003", "00000003", "00000004", "00000005", "00000005", "00000005",
"00000006", "00000006", "00000006"), ProgramID = c("FS", "FS",
"FS", "FS", "FS", "FS", "FS", "FS", "FS", "FS", "FS", "FS", "FS",
"FS", "FS", "FS"), BenefitMonth = structure(c(19358, 19389, 19389,
19297, 19297, 19417, 19358, 19358, 19358, 19358, 19358, 19358,
19358, 19358, 19389, 19417), class = "Date"), DerivedEndDate = structure(c(19388,
19508, 19508, 19416, 19416, 19447, 19508, 19508, 19508, 19388,
19508, 19508, 19508, 19388, 19416, 19508), class = "Date")), row.names = c(NA,
-16L), class = "data.frame")
create_dataframe_orig <- function(input){
createcolumns <- function(row) {
#Create date range of report
report_range = seq.Date(from = start,
to = end,
by = "month")
#Create date range of caseprsn2
input_row_range = seq.Date(from = input[row,3],
to = input[row,4],
by = "month")
start_day = as.Date(intersect(as.character(report_range),as.character(input_row_range)))
end_day = ceiling_date(start_day, "month") - days(1)
df = data.frame(start_day, end_day)
replicated_rows <- as.data.frame(lapply(input[row,], rep, nrow(df)))
cbind(df, replicated_rows)
} #helper func to create BegMo & EndMo
output <- data.frame() # creates an empty data frame
output <- do.call(rbind, lapply(1:nrow(input), createcolumns))
# for (row in 1:nrow(input)){
# }
return(output)
}
Created on 2023-05-12 with reprex v2.0.2