rperformance

calculate a sequence of expressions efficiently


I have some data

set.seed(1)
n <- 100
df <- data.frame(
  x = sample(1:30, n, replace = T),
  y = sample(1:30, n, replace = T),
  z = sample(1:30, n, replace = T)
)

and vector with expressions, they may be different.

rules <- c("df$x[i] < df$y[i-2] - df$x[i]", 
           "df$y[i] >= mean(df$x)",
           "df$y[i] == 20",
           "df$z[i-30] >= df$x[5]",
           "df$y[i-5] == 16",
           "df$x[10] > sd(as.matrix(df[(i-5):i,]))")

Next, I have a function that sequentially searches for the triggering of the first expression, then the second, and so on

seq_rules <- function(df, rules, show=T){
  
ln <- length(rules)
res <- matrix(0,nrow = ln, ncol = 2, dimnames = list(NULL, c("row","res")))
n <- 1

  for(i in 30:nrow(df)){
    if(eval(str2expression(rules[n]))){
      res[n,"row"] <- i
      res[n,"res"] <- 1
      if(show) print( cbind.data.frame(df[i,], rule=rules[n], row=i))
      n <- n+1
    }
    if(n>ln) break
  }
res
}

I would like to speed up my code. How would you write this code to make it as fast as possible? I also like your solution to be identical to mine on different seeds

=======================================

if the rules are represented as already evaluated functions

Frules <- lapply(rules,\(x) eval(str2expression(paste("function(i) {", x ,"}"))))

Then i can gain a little speed due to the absence of eval(str2expression..)) in the loop

New function

Fseq_rules <- function(df, rules){
  ln <- length(rules)
  res <- matrix(0,nrow = ln, ncol = 2, dimnames = list(NULL, c("row","res")))
  n <- 1
for(i in 30:nrow(df)){
    if(rules[[n]](i)){
        res[n,"row"] <- i
        res[n,"res"] <- 1
        n <- n+1
      }
    if(n>ln) break
  }
  res
}

microbenchmark::microbenchmark(Fseq_rules(df, Frules),
                                seq_rules(df, rules,show = F),times = 100)
Unit: milliseconds
                           expr      min       lq     mean   median       uq      max neval
          Fseq_rules(df, Frules) 1.083315 1.118951 1.283135 1.156011 1.247808 5.601309   100
 seq_rules(df, rules, show = F) 2.495045 2.545790 2.779712 2.607938 2.861662 6.243315   100

Solution

  • Not much faster than your original:

    rules2 <- c(
      "x[i] < y[i-2] - x[i]", 
      "y[i] >= mean(x)",
      "y[i] == 20",
      "z[i-30] >= x[5]",
      "y[i-5] == 16",
      "x[10] > sd(as.matrix(df[(i-5):i,]))"
    )
    
    seq_rules2 <- function(df, rules) {
      rules <- sapply(rules, str2expression)
      M <- length(rules)
      res <- matrix(0L, nrow = M, ncol = 2L, dimnames = list(NULL, c("row", "res")))
      j <- 1L
      
      for (i in 30:nrow(df)) {
        if (eval(rules[[j]], envir = df)) {
          res[j, ] <- c(i, 1L)
          j <- j + 1L
        }
        if(j > M) break
      }
      res
    }
    
    bench::mark(seq_rules(df, rules), seq_rules2(df, rules2))
    
    

    enter image description here

    You will gain a lot of speed if you replace df by a matrix. And change the rules accordingly:

    M <- as.matrix(df)
    
    rules_matrix <- c(
      "df[i, 'x'] < y[i-2] - x[i]", 
      "df[i, 'y'] >= mean(df[, 'x'])",
      "df[i, 'y'] == 20",
      "df[i-30, 'z'] >= df[5, 'x']",
      "df[i-5, 'y'] == 16",
      "df[10, 'x'] > sd(df[(i-5):i, ])"
    )
    
    seq_rules_matrix <- function(df, rules) {
      rules <- sapply(rules, str2expression)
      M <- length(rules)
      res <- matrix(0L, nrow = M, ncol = 2L, dimnames = list(NULL, c("row", "res")))
      j <- 1L
      
      for (i in 30:nrow(df)) {
        if (eval(rules[[j]])) {
          res[j, ] <- c(i, 1L)
          j <- j + 1L
        }
        if(j > M) break
      }
      res
    }
    
    bench::mark(
      mat = seq_rules_matrix(M, rules_matrix),
      df = seq_rules2(df, rules2)
    )
    
    

    enter image description here