Search code examples
rdplyrdatatabletidyverse

Replicate Rows in R Based on Condition


I've written the following R code, but it takes a long time to run on large datasets (1 million records). Is there a solution that would be faster?

# Define start and end dates
start <- as.Date("2023-01-01")
end <- as.Date("2023-03-31")

create_dataframe <- function(input){
  createcolumns <- function(row) { 
    #Create date range of report
    report_range = seq.Date(from = start, 
                            to = end,
                            by = "month")
    #Create date range of caseprsn2
    input_row_range = seq.Date(from = input[row,3],
                               to = input[row,4],
                               by = "month")
    
    start_day = as.Date(intersect(as.character(report_range),as.character(input_row_range)))
    
    end_day = ceiling_date(start_day, "month") - days(1)
    
    df = data.frame(start_day, end_day)
    replicated_rows <- as.data.frame(lapply(input[row,], rep, nrow(df)))  
    cbind(df, replicated_rows)
  } #helper func to create BegMo & EndMo
  output <- data.frame() # creates an empty data frame
  output <- do.call(rbind, lapply(1:nrow(input), createcolumns))
  
  return(output)
}

The function createcolumns is run on every row in the input dataframe to create a new dataframe. The logic is that based on the report_range, the function checks to see the overlap of report_range and input_row_range. One new row is created for every overlap that exists. All other columns are replicated.

Code to reproduce input dataset:

example_df = structure(list(CaseNumber = c("00000001", "00000001", "00000002", 
"00000002", "00000003", "00000004", "00000005", "00000006", "00000006", 
"00000006"), ProgramID = c("FS", "FS", "FS", "FS", "FS", "FS", 
"FS", "FS", "FS", "FS"), BenefitMonth = structure(c(19358, 19389, 
19297, 19417, 19358, 19358, 19358, 19358, 19389, 19417), class = "Date"), 
    DerivedEndDate = structure(c(19388, 19508, 19416, 19447, 
    19508, 19388, 19508, 19388, 19416, 19508), class = "Date")), class = "data.frame", row.names = c(NA, 
-10L))

Code to reproduce output dataset:

example_df_transformed = structure(list(start_day = structure(c(19358, 19389, 19417, 19358, 
19389, 19417, 19358, 19389, 19417, 19358, 19358, 19389, 19417, 
19358, 19389, 19417), class = "Date"), end_day = structure(c(19388, 
19416, 19447, 19388, 19416, 19447, 19388, 19416, 19447, 19388, 
19388, 19416, 19447, 19388, 19416, 19447), class = "Date"), CaseNumber = c("00000001", 
"00000001", "00000001", "00000002", "00000002", "00000002", "00000003", 
"00000003", "00000003", "00000004", "00000005", "00000005", "00000005", 
"00000006", "00000006", "00000006"), ProgramID = c("FS", "FS", 
"FS", "FS", "FS", "FS", "FS", "FS", "FS", "FS", "FS", "FS", "FS", 
"FS", "FS", "FS"), BenefitMonth = structure(c(19358, 19389, 19389, 
19297, 19297, 19417, 19358, 19358, 19358, 19358, 19358, 19358, 
19358, 19358, 19389, 19417), class = "Date"), DerivedEndDate = structure(c(19388, 
19508, 19508, 19416, 19416, 19447, 19508, 19508, 19508, 19388, 
19508, 19508, 19508, 19388, 19416, 19508), class = "Date")), row.names = c(NA, 
-16L), class = "data.frame")

Solution

  • Not sure what to think of that for-loop, while it doesn't add anything, for 1 million records it generates 1e6(from lapply) * 1e6(from for) iterations and that indeed takes a long time.

    Here's a dplyr-based attempt, performance-wise it probably makes sense to go with data.table, but with few minutes for 1m records it should still be usable. Test runs with single-shot tic-toc timing included.

    library(dplyr)
    library(tidyr)
    library(purrr)
    library(lubridate)
    
    # Define start and end dates
    start <- as.Date("2023-01-01")
    end <- as.Date("2023-03-31")
    
    
    create_dataframe <- function(df_, start_, end_){
      report_range = seq.Date(from = start_, 
                              to   = end_,
                              by   = "month")
      # rowwise for intersect(seq.Date( ... ))
      df_ %>% rowwise() %>%
      # add intersection of "report_range" and "input_row" to a list-column  
      mutate(start_day = list(intersect(seq.Date(from = BenefitMonth, 
                                                 to = DerivedEndDate, 
                                                 by = "month"),
                                        report_range)), .before = 1) %>% 
      ungroup() %>% 
      # unnest_longer no a list column, each line will be repeated for each
      # item in a list
      unnest_longer(start_day) %>% 
      mutate(
        # unnest somehow drops the correct type, set it back
        start_day = as.Date(start_day, origin = "1970-01-01"),
        end_day = ceiling_date(start_day, "month") - days(1), .after = 1)
    }
    
    # test with 10x1000 rows
    example_xl <- rep_len(list(example_df),1000) %>% list_rbind()
    nrow(example_xl)
    #> [1] 10000
    
    tictoc::tic("updated, 10 000 rows")
    out1 <- create_dataframe(example_xl, start, end)
    tictoc::toc()
    #> updated, 10 000 rows: 1.56 sec elapsed
    out1[1:16,]
    #> # A tibble: 16 × 6
    #>    start_day  end_day    CaseNumber ProgramID BenefitMonth DerivedEndDate
    #>    <date>     <date>     <chr>      <chr>     <date>       <date>        
    #>  1 2023-01-01 2023-01-31 00000001   FS        2023-01-01   2023-01-31    
    #>  2 2023-02-01 2023-02-28 00000001   FS        2023-02-01   2023-05-31    
    #>  3 2023-03-01 2023-03-31 00000001   FS        2023-02-01   2023-05-31    
    #>  4 2023-01-01 2023-01-31 00000002   FS        2022-11-01   2023-02-28    
    #>  5 2023-02-01 2023-02-28 00000002   FS        2022-11-01   2023-02-28    
    #>  6 2023-03-01 2023-03-31 00000002   FS        2023-03-01   2023-03-31    
    #>  7 2023-01-01 2023-01-31 00000003   FS        2023-01-01   2023-05-31    
    #>  8 2023-02-01 2023-02-28 00000003   FS        2023-01-01   2023-05-31    
    #>  9 2023-03-01 2023-03-31 00000003   FS        2023-01-01   2023-05-31    
    #> 10 2023-01-01 2023-01-31 00000004   FS        2023-01-01   2023-01-31    
    #> 11 2023-01-01 2023-01-31 00000005   FS        2023-01-01   2023-05-31    
    #> 12 2023-02-01 2023-02-28 00000005   FS        2023-01-01   2023-05-31    
    #> 13 2023-03-01 2023-03-31 00000005   FS        2023-01-01   2023-05-31    
    #> 14 2023-01-01 2023-01-31 00000006   FS        2023-01-01   2023-01-31    
    #> 15 2023-02-01 2023-02-28 00000006   FS        2023-02-01   2023-02-28    
    #> 16 2023-03-01 2023-03-31 00000006   FS        2023-03-01   2023-05-31
    
    tictoc::tic("original, 10 000 rows")
    # (with outer loop disabled)
    out2 <- create_dataframe_orig(example_xl)
    tictoc::toc()
    #> original, 10 000 rows: 31.39 sec elapsed
    
    # test with 10x100000 rows
    example_xl <- rep_len(list(example_df),1e5) %>% list_rbind()
    nrow(example_xl)
    #> [1] 1000000
    tictoc::tic("updated, 1000 000 rows")
    out3 <- create_dataframe(example_xl, start, end)
    tictoc::toc()
    #> updated, 1000 000 rows: 154.61 sec elapsed
    nrow(out3)
    #> [1] 1600000
    

    Input data & function from question with for-loop disabled:

    example_df = structure(list(CaseNumber = c("00000001", "00000001", "00000002", 
    "00000002", "00000003", "00000004", "00000005", "00000006", "00000006", 
    "00000006"), ProgramID = c("FS", "FS", "FS", "FS", "FS", "FS", 
    "FS", "FS", "FS", "FS"), BenefitMonth = structure(c(19358, 19389, 
    19297, 19417, 19358, 19358, 19358, 19358, 19389, 19417), class = "Date"), 
    DerivedEndDate = structure(c(19388, 19508, 19416, 19447, 
    19508, 19388, 19508, 19388, 19416, 19508), class = "Date")), class = "data.frame", row.names = c(NA, 
      -10L))
    example_df_transformed = structure(list(start_day = structure(c(19358, 19389, 19417, 19358, 
    19389, 19417, 19358, 19389, 19417, 19358, 19358, 19389, 19417, 
    19358, 19389, 19417), class = "Date"), end_day = structure(c(19388, 
    19416, 19447, 19388, 19416, 19447, 19388, 19416, 19447, 19388, 
    19388, 19416, 19447, 19388, 19416, 19447), class = "Date"), CaseNumber = c("00000001", 
    "00000001", "00000001", "00000002", "00000002", "00000002", "00000003", 
    "00000003", "00000003", "00000004", "00000005", "00000005", "00000005", 
    "00000006", "00000006", "00000006"), ProgramID = c("FS", "FS", 
    "FS", "FS", "FS", "FS", "FS", "FS", "FS", "FS", "FS", "FS", "FS", 
    "FS", "FS", "FS"), BenefitMonth = structure(c(19358, 19389, 19389, 
    19297, 19297, 19417, 19358, 19358, 19358, 19358, 19358, 19358, 
    19358, 19358, 19389, 19417), class = "Date"), DerivedEndDate = structure(c(19388, 
    19508, 19508, 19416, 19416, 19447, 19508, 19508, 19508, 19388, 
    19508, 19508, 19508, 19388, 19416, 19508), class = "Date")), row.names = c(NA, 
    -16L), class = "data.frame")
    
    create_dataframe_orig <- function(input){
      createcolumns <- function(row) { 
        #Create date range of report
        report_range = seq.Date(from = start, 
                                to = end,
                                by = "month")
        #Create date range of caseprsn2
        input_row_range = seq.Date(from = input[row,3],
                                   to = input[row,4],
                                   by = "month")
        
        start_day = as.Date(intersect(as.character(report_range),as.character(input_row_range)))
        
        end_day = ceiling_date(start_day, "month") - days(1)
        
        df = data.frame(start_day, end_day)
        replicated_rows <- as.data.frame(lapply(input[row,], rep, nrow(df)))  
        cbind(df, replicated_rows)
      } #helper func to create BegMo & EndMo
      output <- data.frame() # creates an empty data frame
      output <- do.call(rbind, lapply(1:nrow(input), createcolumns))
      # for (row in 1:nrow(input)){
      # }
      return(output)
    }
    

    Created on 2023-05-12 with reprex v2.0.2