Search code examples
rdataframedplyrdata.tableoverlap

Finding overlapped units based on start and end times


I am wondering if there was an efficient solution for the following problem.

Suppose, that I have the following dataset:

library(data.table)


DT <- data.table(emp = c(1,2,3),
                 start_time = c(90,90,540),
                 duration = c(480, 480,480 ))

DT[, end_time := start_time + duration]

which looks like:

     emp   start_time duration end_time
   <num>      <num>    <num>    <num>
1:     1         90      480      570
2:     2         90      480      570
3:     3        540      480     1020

Here, emp is the employee id, and the start time, duration, and end times of each employee's shift are given by the three columns. I am attempting to determine the amount of overlap that each employee has with each other in minutes. Thus, the output should look something like:

     emp emp_1 emp_2 emp_3
   <num> <num> <num> <num>
1:     1   480   480    30
2:     2   480   480    30
3:     3    30    30   480

where the columns are based on the full set of employees.

I am looking for a data.table solution since the number of employees is quite large.


Solution

  • Using pmin and pmax with my method of cross-join is the fastest; however, the other answer handles the exceptions better, and it is not far behind mine in terms of performance.

    dcast(setkey(DT[,c(k=1,.SD)],k)[
      setNames(DT, paste0(names(DT), '_2'))[,c(k=1,.SD)],allow.cartesian=TRUE][,k:=NULL][ , 
              overlap :=  pmin(end_time,end_time_2)-pmax(start_time,start_time_2)],
      emp~ emp_2, value.var = "overlap")
    

    Benchmark:

    Here I made a larger dataset to test different approaches, including tidyverse ones. I had to create copies of the datatable to account for setting the key in each of the solutions, although it doesn't affect the benchmark considerably;

    library(data.table)
    library(DescTools)
    library(dplyr)
    library(tidyr)
    
    set.seed(123)
    DT <- data.table(emp = 1:100,
                     start_time = sample.int(1000, 100),
                     duration = sample.int(1000, 100) + 1000)
    
    DT[, end_time := start_time + duration]
    
    DT2 <- copy(DT)
    DT3 <- copy(DT)
    DT4 <- copy(DT)
    
    
    WvM <- microbenchmark::microbenchmark(
    
    M_DT_Desc = dcast(setkey(DT2[,c(k=1,.SD)],k)[
          setNames(DT2, paste0(names(DT2), '_2'))[,c(k=1,.SD)],allow.cartesian=TRUE][,k:=NULL][ , 
             overlap :=  Overlap(c(start_time , end_time), c(start_time_2, end_time_2)), 
                                                                                by = 1:NROW(DT2)^2],
              emp~ emp_2, value.var = "overlap"),
    
    
    M_DT_pminmax = dcast(setkey(DT2[,c(k=1,.SD)],k)[
      setNames(DT2, paste0(names(DT2), '_2'))[,c(k=1,.SD)],allow.cartesian=TRUE][,k:=NULL][ , 
              overlap :=  pmin(end_time,end_time_2)-pmax(start_time,start_time_2)],
      emp~ emp_2, value.var = "overlap"),
    
    
    M_foverlap_Desc = {setkey(DT3,start_time,end_time);dcast(foverlaps(DT3,DT3)[, irow := .I][,
                        overlap :=  Overlap(c(start_time , end_time), c(i.start_time, i.end_time)), 
                                                                                  by = irow],
                emp~i.emp,value.var = "overlap")},
    
    
    M_dplyr_Desc = DT4 %>% 
      setNames(paste0(names(.), '_2')) %>% 
      crossing(DT4, .) %>% 
      rowwise() %>%
      mutate(overlap = Overlap(c(start_time , end_time), c(start_time_2, end_time_2))) %>%
      ungroup() %>% 
      pivot_wider(id_cols = "emp", names_from = "emp_2", values_from = "overlap"),
    
    
    M_dplyr_pminmax = DT4 %>% 
      setNames(paste0(names(.), '_2')) %>% 
      crossing(DT4, .) %>% 
      rowwise() %>%
      mutate(overlap = pmin(end_time,end_time_2)-pmax(start_time,start_time_2)) %>%
      ungroup() %>% 
      pivot_wider(id_cols = "emp", names_from = "emp_2", values_from = "overlap"),
    
    
    Waldi ={setkey(DT,start_time,end_time); dcast(foverlaps(DT,DT)[,
                      ol:=pmin(end_time,i.end_time)-pmax(start_time,i.start_time)],
          emp~i.emp,value.var = "ol")},
    
    times = 10)
    
    Unit: milliseconds
                expr       min        lq       mean    median        uq       max neval
           M_DT_Desc  967.6728  992.4321 1063.98096 1053.6871 1093.5663 1258.7258    10
        M_DT_pminmax    7.3910    8.3103    8.86385    8.4347    9.8666   10.5503    10
     M_foverlap_Desc  966.2051 1001.8745 1043.72299 1034.6016 1095.6339 1128.2970    10
        M_dplyr_Desc 1040.0847 1060.8663 1132.24239 1101.4212 1150.1816 1444.9537    10
     M_dplyr_pminmax  168.4051  172.5951  185.10149  179.1346  197.1055  223.4941    10
               Waldi    8.5117    9.3202   10.54267    9.6550   10.2424   17.6923    10
    

    Original Answer:

    Here's another approach for cross join and getting the overlaps using DescTools package.

    library(data.table)
    library(DescTools)
    
    dcast(setkey(DT[,c(k=1,.SD)],k)[
            setNames(DT, paste0(names(DT), '_2'))[,c(k=1,.SD)],allow.cartesian=TRUE][,k:=NULL][ , 
                  overlap :=  DescTools::Overlap(c(start_time , end_time), c(start_time_2, end_time_2)), 
                    by = 1:NROW(DT)^2],
          emp~ emp_2, value.var = "overlap")