Search code examples
rlinear-regressionanovarcpparmadillo

Faster anova of regression models


I have the following toy data -

data<-data.frame(y=rnorm(1000),x1=rnorm(1000), x2=rnorm(1000), x3=rnorm(1000), x4=rnorm(1000))

On this data, I am creating two models as below -

fit1 = lm(y ~ x1 + x3, data)
fit2 = lm(y ~ x2 + x3 + x4, data)

Finally, I am comparing these models using anova

anova(fit1, fit2)

Since I am running these tests on ~1 million separate datasets, I want to improve the performance and don't want to use lm and anova functions.

To speed up computation, I am using RcppArmadillo::fastLM, instead of lm but there is no available anova function to compare models. Is there a faster function (compared to lm), which also has a corresponding anova function?

Any suggestions to improve performance will be appreciated.

Here are the performance results of various lm versions used below. For linear regression fastLM is an obvious winner. Function anova2 by @Jay is much faster than anova function and works on fastLM. Function f by @Donald is also faster than anova function and works on fastLM.

microbenchmark::microbenchmark(
  base_lm_1 = fit1 <- lm(y ~ x1 + x3, data),
  base_lm_2 = fit2 <-lm(y ~ x2 + x3 + x4, data),
  lm.fit_1 = lm.fit1 <- with(data, .lm.fit(y = y, x = cbind(1, x1, x3))),
  lm.fit_2 = lm.fit2 <- with(data, .lm.fit(y = y, x = cbind(1, x2, x3, x4))),
  lm.fit_3 = lm.fit3 <- lm_fun(y ~ x1 + x3, data),
  lm.fit_4 = lm.fit4 <- lm_fun(y ~ x2 + x3 + x4, data),
  fastLm1 = fastLM1 <- with(data, RcppArmadillo::fastLm(y = y, X = cbind(1, x1, x3))),
  fastLm2 = fastLM2 <- with(data, RcppArmadillo::fastLm(y = y, X = cbind(1, x2, x3, x4))),
  anova_base = anova(fit1, fit2),
  Jay_fastLM = anova2(fastLM1, fastLM1),
  Jay_lm.fit = anova2(lm.fit3, lm.fit4),
  Donald = f(fastLM1, fastLM1),
  times = 100L,
  control=list(warmup=100L)
)
Unit: microseconds
       expr      min        lq      mean    median        uq       max neval    cld
  base_lm_1 1472.480 1499.2560 1817.2659 1532.3840 1582.2615 28370.870   100     e 
  base_lm_2 1657.745 1706.5505 1796.3631 1744.3945 1825.7435  4761.020   100     e 
   lm.fit_1   94.212  106.9020  112.3093  111.2235  116.7010   147.192   100 a     
   lm.fit_2  124.220  129.8080  134.4455  132.9830  138.2510   156.166   100 a     
   lm.fit_3  853.906  873.9035  902.5856  889.9715  917.9375  1028.415   100   cd  
   lm.fit_4  991.238 1006.7015 1213.7061 1021.5325 1045.8980 19379.399   100    d  
    fastLm1  368.289  390.7805  434.1467  422.0855  476.9085   584.761   100 a c   
    fastLm2  416.645  441.8660  481.0027  462.8850  514.0755   617.619   100 a c   
 anova_base 2021.982 2099.8755 2322.2707 2190.3340 2246.7800 15345.093   100      f
 Jay_fastLM  202.026  218.2580  229.6244  226.3405  238.9490   303.964   100 ab    
 Jay_lm.fit  200.028  216.0805  234.0143  229.7580  246.1870   292.268   100 ab    
     Donald  549.425  582.8105  612.6990  605.4400  625.5340  1079.989   100  bc 

Solution

  • We could hack stats:::anova.lmlist so that it works for lists produced by .lm.fit (notice the dot) and RcppArmadillo::fastLm. Please check stats:::anova.lmlist, if I didn't delete stuff you need!

    anova2 <- function(object, ...) {
      objects <- list(object, ...)
      ns <- vapply(objects, function(x) length(x$residuals), 0L)
      stopifnot(!any(ns != ns[1L])) 
      resdf <- vapply(objects, df.residual, 0L)
      resdev <- vapply(objects, function(x) crossprod(residuals(x)), 0)
      bigmodel <- order(resdf)[1L]
      dfs <- c(NA, -diff(resdf))[bigmodel]
      ssq <- c(NA, -diff(resdev))[bigmodel]
      df.scale <- resdf[bigmodel]
      scale <- resdev[bigmodel]/resdf[bigmodel]
      fstat <- (ssq/dfs)/scale
      p <- pf(fstat, abs(dfs), df.scale, lower.tail=FALSE)
      return(c(F=fstat, p=p))
    }
    

    These wrappers make .lm.fit and RcppArmadillo::fastLm a little more convenient:

    lm_fun <- function(fo, dat) {
      X <- model.matrix(fo, dat)
      fit <- .lm.fit(X, dat[[all.vars(fo)[1]]])
      fit$df.residual <- dim(X)[1] - dim(X)[2]
      return(fit)
    }
    
    fastLm_fun <- function(fo, dat) {
      fit <- RcppArmadillo::fastLm(model.matrix(fo, dat), dat[[all.vars(fo)[1]]])
      return(fit)
    }
    

    Use it

    fit1 <- lm_fun(y ~ x1 + x3, data)
    fit2 <- lm_fun(y ~ x2 + x3 + x4, data)
    anova2(fit1, fit2)
    #         F         p 
    # 0.3609728 0.5481032 
    
    ## Or use `fastLm_fun` here, but it's slower.
    

    Compare

    anova(lm(y ~ x1 + x3, data), lm(y ~ x2 + x3 + x4, data))
    # Analysis of Variance Table
    # 
    # Model 1: y ~ x1 + x3
    # Model 2: y ~ x2 + x3 + x4
    #   Res.Df    RSS Df Sum of Sq     F Pr(>F)
    # 1    997 1003.7                          
    # 2    996 1003.4  1   0.36365 0.361 0.5481
    

    lm_fun (which outperforms fastLm_fun) combined with anova2 appears to be around 60% faster than the conventional approach:

    microbenchmark::microbenchmark(
      conventional={
        fit1 <- lm(y ~ x1 + x3, data)
        fit2 <- lm(y ~ x2 + x3 + x4, data)
        anova(fit1, fit2)
      },
      anova2={  ## using `.lm.fit`
        fit1 <- lm_fun(y ~ x1 + x3, data)
        fit2 <- lm_fun(y ~ x2 + x3 + x4, data)
        anova2(fit1, fit2)
      },
      anova2_Fast={  ## using `RcppArmadillo::fastLm`
        fit1 <- fastLm_fun(y ~ x1 + x3, data)
        fit2 <- fastLm_fun(y ~ x2 + x3 + x4, data)
        anova2(fit1, fit2)
      },
      anova_Donald={  
        fit1 <- lm_fun(y ~ x1 + x3, data)
        fit2 <- lm_fun(y ~ x2 + x3 + x4, data)
        anova_Donald(fit1, fit2)
      }, 
      times=1000L,
      control=list(warmup=100L)
    )
    # Unit: milliseconds
    #         expr      min       lq     mean   median       uq      max neval  cld
    # conventional 2.885718 2.967053 3.205947 3.018668 3.090954 40.15720  1000    d
    #       anova2 1.180683 1.218121 1.285131 1.233335 1.267833 23.81955  1000 a   
    #  anova2_Fast 1.961897 2.012756 2.179458 2.037854 2.087893 26.65279  1000   c 
    # anova_Donald 1.368699 1.409198 1.561751 1.430562 1.472148 33.12247  1000  b  
    

    Data:

    set.seed(42)
    data <- data.frame(y=rnorm(1000), x1=rnorm(1000), x2=rnorm(1000), x3=rnorm(1000),
                       x4=rnorm(1000))