
Get elapsed time to most recent condition by group (benchmark)


I would like to speed up this data.table code; I think I can do better. It is a typical pattern: look backwards for the last TRUE predicate and account the elapsed time to the current row, all by group. I am running this on 300,000 rows with 200,000 groups, and I need to calculate many columns (metrics) like this, so speed is important to me.

I made a smaller example dataset:

data <- data.table(
  SYSKEY = c(
    12, 13, 14, 15, 20,
    22, 21, 24, 25, 26
  ),
  Customer = c(
    "John", "John", "John", "Tom", "Tom",
    "Tom", "Sally", "Sally", "Sally", "Sally"
  ),
  TRAN_DATTIM = as.Date(
    c(
      "28-02-2024", "28-02-2024", "02-03-2024", "02-03-2024", "02-03-2024",
      "02-03-2024", "02-03-2024", "02-03-2024", "03-03-2024", "03-03-2024"
    ),
    format = "%d-%m-%Y"
  ),
  Product = c(
    "Eggs", "Milk", "Bread", "Butter", "Eggs",
    "Milk", "Bread", "Butter", "Eggs", "Wine"
  )
)

My function and execution code:

library(data.table)
library(rlang)  # for enexpr()
library(dplyr)  # for if_else(), used in the predicate below

build_recency <- function(
    data,
    name,
    predicate,
    aggregated_fun = "TIME",
    gamma = 0.01,
    rolling_over
){
  
  UseMethod("build_recency")
  
}


build_recency.data.table <- function(
    data,
    name,
    predicate,
    aggregated_fun = "TIME",
    gamma = 0.01,
    rolling_over = "PAN"
){

  predicate <- enexpr(predicate)

  # flag the rows where the predicate holds
  data[, PRED := fifelse(eval(predicate), 1L, 0L)]
  
  setorderv(
    data,
    c(rolling_over, "TRAN_DATTIM", "SYSKEY")
  )
  
 # left join that, within each rolling_over group, carries forward
 # the timestamp of the last TRUE predicate (PRED)

data[, c(
  .SD[
    # rows where the predicate held
    PRED == 1, .(TRAN_DATTIM, SYSKEY)
  ][
    .SD,
    # non-equi join; SYSKEY < SYSKEY ensures a row never matches itself
    on = .(TRAN_DATTIM <= TRAN_DATTIM, SYSKEY < SYSKEY),
    # of all matches, keep the latest;
    # the data must be ordered by TRAN_DATTIM!
    mult = "last",
    # j: return all .SD columns plus the new column x.TRAN_DATTIM;
    # the env argument substitutes the column list, see
    # https://cran.r-project.org/web/packages/data.table/vignettes/datatable-programming.html#:~:text=Substituting%20lists%20of%20arbitrary%20length
    cols,
    env = list(
      cols = I(c(colnames(.SD), "x.TRAN_DATTIM"))
    )
  ]
),
by = rolling_over
][,
  # build the recency column and drop the auxiliary variables
  `:=`(
    name = {
      x <- as.numeric(
        difftime(TRAN_DATTIM, x.TRAN_DATTIM, units = "days")
      )
      # optionally decay the elapsed time exponentially
      if (aggregated_fun == "EXP")
        x <- exp(-gamma * x)
      x
    },
    x.TRAN_DATTIM = NULL,
    PRED = NULL
  ),
  env = list(
    gamma = I(gamma),
    name = name
  )
]

}

 data2 <- build_recency(
    data = data,
    name = "RECENCY",
    predicate = if_else(
      Product == 'Eggs', TRUE, FALSE
    ),
    gamma = 0.001,
    rolling_over = 'Customer'
  )
  
  data2[]

Expected result:

     Customer SYSKEY TRAN_DATTIM Product RECENCY
      <char>  <num>      <Date>  <char>   <num>
 1:     John     12  2024-02-28    Eggs      NA
 2:     John     13  2024-02-28    Milk       0
 3:     John     14  2024-03-02   Bread       3
 4:    Sally     21  2024-03-02   Bread      NA
 5:    Sally     24  2024-03-02  Butter      NA
 6:    Sally     25  2024-03-03    Eggs      NA
 7:    Sally     26  2024-03-03    Wine       0
 8:      Tom     15  2024-03-02  Butter      NA
 9:      Tom     20  2024-03-02    Eggs      NA
10:      Tom     22  2024-03-02    Milk       0
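
For clarity, the heart of my function is the non-equi self-join with mult = "last". In isolation, the pattern looks like this (a minimal sketch on hypothetical single-group data, not part of my pipeline):

library(data.table)

dt <- data.table(
  SYSKEY      = c(1L, 2L, 3L, 4L),
  TRAN_DATTIM = as.Date(c("2024-02-28", "2024-02-28", "2024-03-02", "2024-03-03")),
  PRED        = c(1L, 0L, 0L, 1L)
)
setorder(dt, TRAN_DATTIM, SYSKEY)

# for every row of dt, look up the most recent earlier row where PRED was 1
dt[PRED == 1L, .(TRAN_DATTIM, SYSKEY)][
  dt,
  # non-equi join: timestamps at or before the current row,
  # SYSKEY strictly smaller so a row never matches itself
  on = .(TRAN_DATTIM <= TRAN_DATTIM, SYSKEY < SYSKEY),
  # with both tables ordered by TRAN_DATTIM, "last" is the most recent match
  mult = "last",
  .(SYSKEY = i.SYSKEY, TRAN_DATTIM = i.TRAN_DATTIM, LAST_TRUE = x.TRAN_DATTIM)
]
#    SYSKEY TRAN_DATTIM  LAST_TRUE
# 1:      1  2024-02-28       <NA>
# 2:      2  2024-02-28 2024-02-28
# 3:      3  2024-03-02 2024-02-28
# 4:      4  2024-03-03 2024-02-28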

Solution

  • A speedy data.table-only version would be

    data[  , PREDICATE_Date := fifelse(Product == "Eggs", TRAN_DATTIM, NA)
       ][ , PREDICATE_Date := nafill(PREDICATE_Date, "locf"), by = Customer
       ][ , RECENCY := TRAN_DATTIM - PREDICATE_Date
       ][ , RECENCY := fifelse(Product == "Eggs", NA, RECENCY)
       ]
    

    Short explanation:

    1. Store the date of the predicate rows in a separate column PREDICATE_Date.
    2. Fill the NAs in PREDICATE_Date with the most recent such date of that customer (last observation carried forward).
    3. Calculate the time difference.
    4. Set the time difference to NA in the predicate rows themselves.
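
    A quick sanity check of these steps against the expected output (a sketch; it assumes a fresh copy of the 10-row toy data and converts the difftime to numeric so the column matches the expected RECENCY):

    check <- copy(data)  # fresh copy of the toy data from the question
    # the toy rows are already in date order within each customer, which locf needs
    check[  , PREDICATE_Date := fifelse(Product == "Eggs", TRAN_DATTIM, NA)
       ][ , PREDICATE_Date := nafill(PREDICATE_Date, "locf"), by = Customer  # carry the last Eggs date forward
       ][ , RECENCY := as.numeric(TRAN_DATTIM - PREDICATE_Date)
       ][ , RECENCY := fifelse(Product == "Eggs", NA_real_, RECENCY)
       ][ , PREDICATE_Date := NULL]
    check[order(Customer, TRAN_DATTIM, SYSKEY)]  # same row order as the expected result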

    But collapse and timeplyr are indeed insanely powerful, as @NicChr already explained. I think the most efficient way is to use collapse inside a data.table, i.e. to copy @NicChr's approach into a data.table.
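
    To see why that approach works: fcumsum(Product == "Eggs", g = Customer) starts a new run id at every predicate row, so grouping by (Customer, id) puts each row together with the most recent preceding TRUE; id == 0 marks rows before the first TRUE, and frowid(...) == 1 marks the predicate row itself, which is why both get NA. A minimal illustration (hypothetical two-customer data, not the benchmark set):

    library(data.table)
    library(collapse)

    toy <- data.table(
      Customer = rep(c("John", "Tom"), each = 3),
      Product  = c("Eggs", "Milk", "Bread", "Butter", "Eggs", "Milk")
    )
    toy[, id := fcumsum(Product == "Eggs", g = Customer)][]
    #    Customer Product id
    # 1:     John    Eggs  1   <- starts run 1; first row of its run, so RECENCY = NA
    # 2:     John    Milk  1   <- grouped with John's Eggs row above
    # 3:     John   Bread  1
    # 4:      Tom  Butter  0   <- id == 0: no TRUE yet, so RECENCY = NA
    # 5:      Tom    Eggs  1
    # 6:      Tom    Milk  1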

    Let's do some benchmarking :)
    I replicated your toy data set N = 5e4 times, giving 500,000 rows and 150,000 unique customers:

    library(data.table)
    library(dplyr)
    library(collapse)
    library(timeplyr)
    
    N <- 5e4 ## Number of replicates of each customer
    Data1 <- data[rep(1:10,N)] ## replicate data
    Data1[,Customer:=paste0(Customer,rep(1:N,each=10))] ## make customer names unique
    Data2 <- copy(Data1)
    Data3 <- copy(Data1)
    bench::mark(
    dplyr.collapse={
      out <- Data1 %>%
        # arrange(Customer) %>%
        mutate(id = fcumsum(Product == "Eggs", g = Customer)) %>%
        mutate(RECENCY = time_elapsed(TRAN_DATTIM, "days", rolling = FALSE,
                                      g = pick(Customer, id))) %>%
        mutate(RECENCY = if_else(id == 0, NA, RECENCY)) %>%
        mutate(RECENCY = if_else(frowid(pick(Customer, id)) == 1, NA, RECENCY))
    },
    datatable={
      Data2[ , PREDICATE_Date := fifelse(Product=="Eggs",TRAN_DATTIM,NA)
          ][ , PREDICATE_Date := nafill(PREDICATE_Date,"locf"), by=Customer
          ][ , RECENCY := TRAN_DATTIM-PREDICATE_Date
          ][ , RECENCY := fifelse(Product=="Eggs", NA, RECENCY)
          ]
    },
    datatable.collapse={
      Data3[, id:=fcumsum(Product=="Eggs",g=Customer)
          ][, RECENCY:=time_elapsed(TRAN_DATTIM, "days", rolling=FALSE,
                                    g=GRP(list(Customer,id)) )
          ][, RECENCY:=fifelse(id==0L, NA, RECENCY)
          ][, RECENCY:=fifelse(frowid(GRP(list(Customer, id))) == 1L, NA, RECENCY)]
    }, check=FALSE )
    
    
    
    # A tibble: 3 × 13
      expression              min   median `itr/sec` mem_alloc `gc/sec` n_itr  n_gc total_time result memory    
      <bch:expr>         <bch:tm> <bch:tm>     <dbl> <bch:byt>    <dbl> <int> <dbl>   <bch:tm> <list> <list>    
    1 dplyr.collapse       49.81s   49.81s    0.0201   72.56MB     0        1     0     49.81s <NULL> <Rprofmem>
    2 datatable             9.67s    9.67s    0.103     2.34GB     1.45     1    14      9.67s <NULL> <Rprofmem>
    3 datatable.collapse 224.11ms 252.68ms    3.96     53.53MB     1.98     2     1   505.36ms <NULL> <Rprofmem>
    # ℹ 2 more variables: time <list>, gc <list>
    

    So there are gigantic differences in speed.
    The data.table + collapse + timeplyr combination should manage your 300,000-row use case in well under a second :)



    EDIT:
    The main bottleneck in the dplyr approach is the usage of pick(...). When we replace it with GRP(list(...)), we get speeds much closer to those of the data.table versions.
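
    For reference, a sketch of that substitution: the dplyr.collapse pipeline from the benchmark above, with pick(...) swapped for collapse's GRP(list(...)):

    out <- Data1 %>%
      mutate(id = fcumsum(Product == "Eggs", g = Customer)) %>%
      mutate(RECENCY = time_elapsed(TRAN_DATTIM, "days", rolling = FALSE,
                                    g = GRP(list(Customer, id)))) %>%
      mutate(RECENCY = if_else(id == 0, NA, RECENCY)) %>%
      mutate(RECENCY = if_else(frowid(GRP(list(Customer, id))) == 1, NA, RECENCY))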