
Get elapsed time to most recent condition by group (benchmark)


I would like to speed up this data.table code; I think I can do better. It is a typical pattern: look backwards for the last TRUE predicate and account the elapsed time to the current row, all by group. I am running this on 300,000 rows with 200,000 groups, and I need to calculate many columns (metrics) like this, so speed is important to me.

I made a smaller example dataset:

data <- data.table(
  SYSKEY = c(
    12, 13, 14, 15, 20,
    22, 21, 24, 25, 26
  ),
  Customer = c(
    "John", "John", "John", "Tom", "Tom",
    "Tom", "Sally", "Sally", "Sally", "Sally"
  ),
  TRAN_DATTIM = as.Date(
    c(
      "28-02-2024", "28-02-2024", "02-03-2024", "02-03-2024", "02-03-2024",
      "02-03-2024", "02-03-2024", "02-03-2024", "03-03-2024", "03-03-2024"
    ),
    format = "%d-%m-%Y"
  ),
  Product = c(
    "Eggs", "Milk", "Bread", "Butter", "Eggs",
    "Milk", "Bread", "Butter", "Eggs", "Wine"
  )
)

My function and execution code:

library(data.table)
library(rlang)  # for enexpr()
library(dplyr)  # for if_else(), used in the predicate below

build_recency <- function(
    data,
    name,
    predicate,
    aggregated_fun = "TIME",
    gamma = 0.01,
    rolling_over
){
  
  UseMethod("build_recency")
  
}


build_recency.data.table <- function(
    data,
    name,
    predicate,
    aggregated_fun = "TIME",
    gamma = 0.01,
    rolling_over = "PAN"
){

  predicate <- enexpr(predicate)

  # flag the rows where the predicate holds
  data[, PRED := fifelse(eval(predicate), 1L, 0L)]
  
  setorderv(
    data,
    c(rolling_over, "TRAN_DATTIM", "SYSKEY")
  )
  
 # left join that, within each rolling_over group, carries forward
 # the timestamp of the last TRUE predicate (PRED)

data[, c(
  .SD[
    # rows where the predicate held
    PRED == 1, .(TRAN_DATTIM, SYSKEY)
  ][
    .SD,
    # non-equi join; SYSKEY < SYSKEY ensures a row never matches itself
    on = .(TRAN_DATTIM <= TRAN_DATTIM, SYSKEY < SYSKEY),
    # of all matches, keep the latest;
    # the data must be ordered by TRAN_DATTIM!
    mult = "last",
    # j: return all .SD columns plus the new column x.TRAN_DATTIM;
    # the env argument substitutes the column list, see
    # https://cran.r-project.org/web/packages/data.table/vignettes/datatable-programming.html#:~:text=Substituting%20lists%20of%20arbitrary%20length
    cols,
    env = list(
      cols = I(c(colnames(.SD), "x.TRAN_DATTIM"))
    )
  ]
),
by = rolling_over
][,
  # build the recency column and drop the auxiliary variables
  `:=`(
    name = {
      x <- as.numeric(
        difftime(TRAN_DATTIM, x.TRAN_DATTIM, units = "days")
      )
      # optionally decay the elapsed time exponentially
      if (aggregated_fun == "EXP")
        x <- exp(-gamma * x)
      x
    },
    x.TRAN_DATTIM = NULL,
    PRED = NULL
  ),
  env = list(
    gamma = I(gamma),
    name = name
  )
]

}

 data2 <- build_recency(
    data = data,
    name = "RECENCY",
    predicate = if_else(
      Product == 'Eggs', TRUE, FALSE
    ),
    gamma = 0.001,
    rolling_over = 'Customer'
  )
  
  data2[]

Expected result:

     Customer SYSKEY TRAN_DATTIM Product RECENCY
      <char>  <num>      <Date>  <char>   <num>
 1:     John     12  2024-02-28    Eggs      NA
 2:     John     13  2024-02-28    Milk       0
 3:     John     14  2024-03-02   Bread       3
 4:    Sally     21  2024-03-02   Bread      NA
 5:    Sally     24  2024-03-02  Butter      NA
 6:    Sally     25  2024-03-03    Eggs      NA
 7:    Sally     26  2024-03-03    Wine       0
 8:      Tom     15  2024-03-02  Butter      NA
 9:      Tom     20  2024-03-02    Eggs      NA
10:      Tom     22  2024-03-02    Milk       0
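
For clarity, the heart of my function is the non-equi self-join with mult = "last". In isolation, the pattern looks like this (a minimal sketch on hypothetical single-group data, not part of my pipeline):

library(data.table)

dt <- data.table(
  SYSKEY      = c(1L, 2L, 3L, 4L),
  TRAN_DATTIM = as.Date(c("2024-02-28", "2024-02-28", "2024-03-02", "2024-03-03")),
  PRED        = c(1L, 0L, 0L, 1L)
)
setorder(dt, TRAN_DATTIM, SYSKEY)

# for every row of dt, look up the most recent earlier row where PRED was 1
dt[PRED == 1L, .(TRAN_DATTIM, SYSKEY)][
  dt,
  # non-equi join: timestamps at or before the current row,
  # SYSKEY strictly smaller so a row never matches itself
  on = .(TRAN_DATTIM <= TRAN_DATTIM, SYSKEY < SYSKEY),
  # with both tables ordered by TRAN_DATTIM, "last" is the most recent match
  mult = "last",
  .(SYSKEY = i.SYSKEY, TRAN_DATTIM = i.TRAN_DATTIM, LAST_TRUE = x.TRAN_DATTIM)
]
#    SYSKEY TRAN_DATTIM  LAST_TRUE
# 1:      1  2024-02-28       <NA>
# 2:      2  2024-02-28 2024-02-28
# 3:      3  2024-03-02 2024-02-28
# 4:      4  2024-03-03 2024-02-28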

Solution

  • A speedy data.table-only version would be

    data[  , PREDICATE_Date := fifelse(Product == "Eggs", TRAN_DATTIM, NA)
       ][ , PREDICATE_Date := nafill(PREDICATE_Date, "locf"), by = Customer
       ][ , RECENCY := TRAN_DATTIM - PREDICATE_Date
       ][ , RECENCY := fifelse(Product == "Eggs", NA, RECENCY)
       ]
    

    Short explanation:

    1. Store the date of the predicate rows in a separate column PREDICATE_Date.
    2. Fill the NAs in PREDICATE_Date with the most recent such date of that customer (last observation carried forward).
    3. Calculate the time difference.
    4. Set the time difference to NA in the predicate rows themselves.
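
    A quick sanity check of these steps against the expected output (a sketch; it assumes a fresh copy of the 10-row toy data and converts the difftime to numeric so the column matches the expected RECENCY):

    check <- copy(data)  # fresh copy of the toy data from the question
    # the toy rows are already in date order within each customer, which locf needs
    check[  , PREDICATE_Date := fifelse(Product == "Eggs", TRAN_DATTIM, NA)
       ][ , PREDICATE_Date := nafill(PREDICATE_Date, "locf"), by = Customer  # carry the last Eggs date forward
       ][ , RECENCY := as.numeric(TRAN_DATTIM - PREDICATE_Date)
       ][ , RECENCY := fifelse(Product == "Eggs", NA_real_, RECENCY)
       ][ , PREDICATE_Date := NULL]
    check[order(Customer, TRAN_DATTIM, SYSKEY)]  # same row order as the expected result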

    But collapse and timeplyr are indeed insanely powerful, as @NicChr already explained. I think the most efficient way is to use collapse inside a data.table, i.e. to copy @NicChr's approach into a data.table.
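
    To see why that approach works: fcumsum(Product == "Eggs", g = Customer) starts a new run id at every predicate row, so grouping by (Customer, id) puts each row together with the most recent preceding TRUE; id == 0 marks rows before the first TRUE, and frowid(...) == 1 marks the predicate row itself, which is why both get NA. A minimal illustration (hypothetical two-customer data, not the benchmark set):

    library(data.table)
    library(collapse)

    toy <- data.table(
      Customer = rep(c("John", "Tom"), each = 3),
      Product  = c("Eggs", "Milk", "Bread", "Butter", "Eggs", "Milk")
    )
    toy[, id := fcumsum(Product == "Eggs", g = Customer)][]
    #    Customer Product id
    # 1:     John    Eggs  1   <- starts run 1; first row of its run, so RECENCY = NA
    # 2:     John    Milk  1   <- grouped with John's Eggs row above
    # 3:     John   Bread  1
    # 4:      Tom  Butter  0   <- id == 0: no TRUE yet, so RECENCY = NA
    # 5:      Tom    Eggs  1
    # 6:      Tom    Milk  1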

    Let's do some benchmarking :)
    I replicated your toy data set N = 5e4 times, giving 500,000 rows and 150,000 unique customers:

    library(data.table)
    library(dplyr)
    library(collapse)
    library(timeplyr)
    
    N <- 5e4 ## Number of replicates of each customer
    Data1 <- data[rep(1:10,N)] ## replicate data
    Data1[,Customer:=paste0(Customer,rep(1:N,each=10))] ## make customer names unique
    Data2 <- copy(Data1)
    Data3 <- copy(Data1)
    bench::mark(
    dplyr.collapse={
      out <- Data1 %>%
        # arrange(Customer) %>%
        mutate(id = fcumsum(Product == "Eggs", g = Customer)) %>%
        mutate(RECENCY = time_elapsed(TRAN_DATTIM, "days", rolling = FALSE,
                                      g = pick(Customer, id))) %>%
        mutate(RECENCY = if_else(id == 0, NA, RECENCY)) %>%
        mutate(RECENCY = if_else(frowid(pick(Customer, id)) == 1, NA, RECENCY))
    },
    datatable={
      Data2[ , PREDICATE_Date := fifelse(Product=="Eggs",TRAN_DATTIM,NA)
          ][ , PREDICATE_Date := nafill(PREDICATE_Date,"locf"), by=Customer
          ][ , RECENCY := TRAN_DATTIM-PREDICATE_Date
          ][ , RECENCY := fifelse(Product=="Eggs", NA, RECENCY)
          ]
    },
    datatable.collapse={
      Data3[, id:=fcumsum(Product=="Eggs",g=Customer)
          ][, RECENCY:=time_elapsed(TRAN_DATTIM, "days", rolling=FALSE,
                                    g=GRP(list(Customer,id)) )
          ][, RECENCY:=fifelse(id==0L, NA, RECENCY)
          ][, RECENCY:=fifelse(frowid(GRP(list(Customer, id))) == 1L, NA, RECENCY)]
    }, check=FALSE )
    
    
    
    # A tibble: 3 × 13
      expression              min   median `itr/sec` mem_alloc `gc/sec` n_itr  n_gc total_time result memory    
      <bch:expr>         <bch:tm> <bch:tm>     <dbl> <bch:byt>    <dbl> <int> <dbl>   <bch:tm> <list> <list>    
    1 dplyr.collapse       49.81s   49.81s    0.0201   72.56MB     0        1     0     49.81s <NULL> <Rprofmem>
    2 datatable             9.67s    9.67s    0.103     2.34GB     1.45     1    14      9.67s <NULL> <Rprofmem>
    3 datatable.collapse 224.11ms 252.68ms    3.96     53.53MB     1.98     2     1   505.36ms <NULL> <Rprofmem>
    # ℹ 2 more variables: time <list>, gc <list>
    

    So there are gigantic differences in speed.
    The data.table + collapse + timeplyr combination should manage your 300,000-row use case in well under a second :)



    EDIT:
    The main bottleneck in the dplyr approach is the usage of pick(...). When we replace it with GRP(list(...)), we get speeds much closer to those of the data.table versions.
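
    For reference, a sketch of that substitution: the dplyr.collapse pipeline from the benchmark above, with pick(...) swapped for collapse's GRP(list(...)):

    out <- Data1 %>%
      mutate(id = fcumsum(Product == "Eggs", g = Customer)) %>%
      mutate(RECENCY = time_elapsed(TRAN_DATTIM, "days", rolling = FALSE,
                                    g = GRP(list(Customer, id)))) %>%
      mutate(RECENCY = if_else(id == 0, NA, RECENCY)) %>%
      mutate(RECENCY = if_else(frowid(GRP(list(Customer, id))) == 1, NA, RECENCY))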