Replicate Rows in R Based on Condition

I've written the following R code, but it takes a long time to run on large datasets (1 million records). Is there a solution that would be faster?

# Define start and end dates
start <- as.Date("2023-01-01")
end <- as.Date("2023-03-31")

create_dataframe <- function(input){
  createcolumns <- function(row) { 
    #Create date range of report
    report_range = seq.Date(from = start, 
                            to = end,
                            by = "month")
    #Create date range of caseprsn2
    input_row_range = seq.Date(from = input[row,3],
                               to = input[row,4],
                               by = "month")
    
    start_day = as.Date(intersect(as.character(report_range),as.character(input_row_range)))
    
    end_day = ceiling_date(start_day, "month") - days(1)
    
    df = data.frame(start_day, end_day)
    replicated_rows <- as.data.frame(lapply(input[row,], rep, nrow(df)))  
    cbind(df, replicated_rows)
  } #helper func to create BegMo & EndMo
  output <- data.frame() # creates an empty data frame
  output <- do.call(rbind, lapply(1:nrow(input), createcolumns))
  
  return(output)
}

The function createcolumns is run on every row in the input dataframe to create a new dataframe. The logic is that based on the report_range, the function checks to see the overlap of report_range and input_row_range. One new row is created for every overlap that exists. All other columns are replicated.

Code to reproduce input dataset:

example_df = structure(list(CaseNumber = c("00000001", "00000001", "00000002", 
"00000002", "00000003", "00000004", "00000005", "00000006", "00000006", 
"00000006"), ProgramID = c("FS", "FS", "FS", "FS", "FS", "FS", 
"FS", "FS", "FS", "FS"), BenefitMonth = structure(c(19358, 19389, 
19297, 19417, 19358, 19358, 19358, 19358, 19389, 19417), class = "Date"), 
    DerivedEndDate = structure(c(19388, 19508, 19416, 19447, 
    19508, 19388, 19508, 19388, 19416, 19508), class = "Date")), class = "data.frame", row.names = c(NA, 
-10L))

Code to reproduce output dataset:

example_df_transformed = structure(list(start_day = structure(c(19358, 19389, 19417, 19358, 
19389, 19417, 19358, 19389, 19417, 19358, 19358, 19389, 19417, 
19358, 19389, 19417), class = "Date"), end_day = structure(c(19388, 
19416, 19447, 19388, 19416, 19447, 19388, 19416, 19447, 19388, 
19388, 19416, 19447, 19388, 19416, 19447), class = "Date"), CaseNumber = c("00000001", 
"00000001", "00000001", "00000002", "00000002", "00000002", "00000003", 
"00000003", "00000003", "00000004", "00000005", "00000005", "00000005", 
"00000006", "00000006", "00000006"), ProgramID = c("FS", "FS", 
"FS", "FS", "FS", "FS", "FS", "FS", "FS", "FS", "FS", "FS", "FS", 
"FS", "FS", "FS"), BenefitMonth = structure(c(19358, 19389, 19389, 
19297, 19297, 19417, 19358, 19358, 19358, 19358, 19358, 19358, 
19358, 19358, 19389, 19417), class = "Date"), DerivedEndDate = structure(c(19388, 
19508, 19508, 19416, 19416, 19447, 19508, 19508, 19508, 19388, 
19508, 19508, 19508, 19388, 19416, 19508), class = "Date")), row.names = c(NA, 
-16L), class = "data.frame")

Solution

Not sure what to think of that for-loop, while it doesn't add anything, for 1 million records it generates 1e6(from lapply) * 1e6(from for) iterations and that indeed takes a long time.

Here's a dplyr-based attempt, performance-wise it probably makes sense to go with data.table, but with few minutes for 1m records it should still be usable. Test runs with single-shot tic-toc timing included.

library(dplyr)
library(tidyr)
library(purrr)
library(lubridate)

# Define start and end dates
start <- as.Date("2023-01-01")
end <- as.Date("2023-03-31")


create_dataframe <- function(df_, start_, end_){
  report_range = seq.Date(from = start_, 
                          to   = end_,
                          by   = "month")
  # rowwise for intersect(seq.Date( ... ))
  df_ %>% rowwise() %>%
  # add intersection of "report_range" and "input_row" to a list-column  
  mutate(start_day = list(intersect(seq.Date(from = BenefitMonth, 
                                             to = DerivedEndDate, 
                                             by = "month"),
                                    report_range)), .before = 1) %>% 
  ungroup() %>% 
  # unnest_longer no a list column, each line will be repeated for each
  # item in a list
  unnest_longer(start_day) %>% 
  mutate(
    # unnest somehow drops the correct type, set it back
    start_day = as.Date(start_day, origin = "1970-01-01"),
    end_day = ceiling_date(start_day, "month") - days(1), .after = 1)
}

# test with 10x1000 rows
example_xl <- rep_len(list(example_df),1000) %>% list_rbind()
nrow(example_xl)
#> [1] 10000

tictoc::tic("updated, 10 000 rows")
out1 <- create_dataframe(example_xl, start, end)
tictoc::toc()
#> updated, 10 000 rows: 1.56 sec elapsed
out1[1:16,]
#> # A tibble: 16 × 6
#>    start_day  end_day    CaseNumber ProgramID BenefitMonth DerivedEndDate
#>    <date>     <date>     <chr>      <chr>     <date>       <date>        
#>  1 2023-01-01 2023-01-31 00000001   FS        2023-01-01   2023-01-31    
#>  2 2023-02-01 2023-02-28 00000001   FS        2023-02-01   2023-05-31    
#>  3 2023-03-01 2023-03-31 00000001   FS        2023-02-01   2023-05-31    
#>  4 2023-01-01 2023-01-31 00000002   FS        2022-11-01   2023-02-28    
#>  5 2023-02-01 2023-02-28 00000002   FS        2022-11-01   2023-02-28    
#>  6 2023-03-01 2023-03-31 00000002   FS        2023-03-01   2023-03-31    
#>  7 2023-01-01 2023-01-31 00000003   FS        2023-01-01   2023-05-31    
#>  8 2023-02-01 2023-02-28 00000003   FS        2023-01-01   2023-05-31    
#>  9 2023-03-01 2023-03-31 00000003   FS        2023-01-01   2023-05-31    
#> 10 2023-01-01 2023-01-31 00000004   FS        2023-01-01   2023-01-31    
#> 11 2023-01-01 2023-01-31 00000005   FS        2023-01-01   2023-05-31    
#> 12 2023-02-01 2023-02-28 00000005   FS        2023-01-01   2023-05-31    
#> 13 2023-03-01 2023-03-31 00000005   FS        2023-01-01   2023-05-31    
#> 14 2023-01-01 2023-01-31 00000006   FS        2023-01-01   2023-01-31    
#> 15 2023-02-01 2023-02-28 00000006   FS        2023-02-01   2023-02-28    
#> 16 2023-03-01 2023-03-31 00000006   FS        2023-03-01   2023-05-31

tictoc::tic("original, 10 000 rows")
# (with outer loop disabled)
out2 <- create_dataframe_orig(example_xl)
tictoc::toc()
#> original, 10 000 rows: 31.39 sec elapsed

# test with 10x100000 rows
example_xl <- rep_len(list(example_df),1e5) %>% list_rbind()
nrow(example_xl)
#> [1] 1000000
tictoc::tic("updated, 1000 000 rows")
out3 <- create_dataframe(example_xl, start, end)
tictoc::toc()
#> updated, 1000 000 rows: 154.61 sec elapsed
nrow(out3)
#> [1] 1600000

Input data & function from question with for-loop disabled:

example_df = structure(list(CaseNumber = c("00000001", "00000001", "00000002", 
"00000002", "00000003", "00000004", "00000005", "00000006", "00000006", 
"00000006"), ProgramID = c("FS", "FS", "FS", "FS", "FS", "FS", 
"FS", "FS", "FS", "FS"), BenefitMonth = structure(c(19358, 19389, 
19297, 19417, 19358, 19358, 19358, 19358, 19389, 19417), class = "Date"), 
DerivedEndDate = structure(c(19388, 19508, 19416, 19447, 
19508, 19388, 19508, 19388, 19416, 19508), class = "Date")), class = "data.frame", row.names = c(NA, 
  -10L))
example_df_transformed = structure(list(start_day = structure(c(19358, 19389, 19417, 19358, 
19389, 19417, 19358, 19389, 19417, 19358, 19358, 19389, 19417, 
19358, 19389, 19417), class = "Date"), end_day = structure(c(19388, 
19416, 19447, 19388, 19416, 19447, 19388, 19416, 19447, 19388, 
19388, 19416, 19447, 19388, 19416, 19447), class = "Date"), CaseNumber = c("00000001", 
"00000001", "00000001", "00000002", "00000002", "00000002", "00000003", 
"00000003", "00000003", "00000004", "00000005", "00000005", "00000005", 
"00000006", "00000006", "00000006"), ProgramID = c("FS", "FS", 
"FS", "FS", "FS", "FS", "FS", "FS", "FS", "FS", "FS", "FS", "FS", 
"FS", "FS", "FS"), BenefitMonth = structure(c(19358, 19389, 19389, 
19297, 19297, 19417, 19358, 19358, 19358, 19358, 19358, 19358, 
19358, 19358, 19389, 19417), class = "Date"), DerivedEndDate = structure(c(19388, 
19508, 19508, 19416, 19416, 19447, 19508, 19508, 19508, 19388, 
19508, 19508, 19508, 19388, 19416, 19508), class = "Date")), row.names = c(NA, 
-16L), class = "data.frame")

create_dataframe_orig <- function(input){
  createcolumns <- function(row) { 
    #Create date range of report
    report_range = seq.Date(from = start, 
                            to = end,
                            by = "month")
    #Create date range of caseprsn2
    input_row_range = seq.Date(from = input[row,3],
                               to = input[row,4],
                               by = "month")
    
    start_day = as.Date(intersect(as.character(report_range),as.character(input_row_range)))
    
    end_day = ceiling_date(start_day, "month") - days(1)
    
    df = data.frame(start_day, end_day)
    replicated_rows <- as.data.frame(lapply(input[row,], rep, nrow(df)))  
    cbind(df, replicated_rows)
  } #helper func to create BegMo & EndMo
  output <- data.frame() # creates an empty data frame
  output <- do.call(rbind, lapply(1:nrow(input), createcolumns))
  # for (row in 1:nrow(input)){
  # }
  return(output)
}

^{Created on 2023-05-12 with reprex v2.0.2}