I have the following toy data -
data<-data.frame(y=rnorm(1000),x1=rnorm(1000), x2=rnorm(1000), x3=rnorm(1000), x4=rnorm(1000))
On this data, I am creating two models as below -
fit1 = lm(y ~ x1 + x3, data)
fit2 = lm(y ~ x2 + x3 + x4, data)
Finally, I am comparing these models using anova
anova(fit1, fit2)
Since I am running these tests on ~1 million separate datasets, I want to improve the performance and don't want to use lm
and anova
functions.
To speed up computation, I am using RcppArmadillo::fastLM
, instead of lm
but there is no available anova
function to compare models. Is there a faster function (compared to lm
), which also has a corresponding anova
function?
Any suggestions to improve performance will be appreciated.
Here are the performance results of various lm
versions used below. For linear regression fastLM
is an obvious winner. Function anova2
by @Jay is much faster than anova
function and works on fastLM. Function f
by @Donald is also faster than anova
function and works on fastLM.
microbenchmark::microbenchmark(
base_lm_1 = fit1 <- lm(y ~ x1 + x3, data),
base_lm_2 = fit2 <-lm(y ~ x2 + x3 + x4, data),
lm.fit_1 = lm.fit1 <- with(data, .lm.fit(y = y, x = cbind(1, x1, x3))),
lm.fit_2 = lm.fit2 <- with(data, .lm.fit(y = y, x = cbind(1, x2, x3, x4))),
lm.fit_3 = lm.fit3 <- lm_fun(y ~ x1 + x3, data),
lm.fit_4 = lm.fit4 <- lm_fun(y ~ x2 + x3 + x4, data),
fastLm1 = fastLM1 <- with(data, RcppArmadillo::fastLm(y = y, X = cbind(1, x1, x3))),
fastLm2 = fastLM2 <- with(data, RcppArmadillo::fastLm(y = y, X = cbind(1, x2, x3, x4))),
anova_base = anova(fit1, fit2),
Jay_fastLM = anova2(fastLM1, fastLM1),
Jay_lm.fit = anova2(lm.fit3, lm.fit4),
Donald = f(fastLM1, fastLM1),
times = 100L,
control=list(warmup=100L)
)
Unit: microseconds
expr min lq mean median uq max neval cld
base_lm_1 1472.480 1499.2560 1817.2659 1532.3840 1582.2615 28370.870 100 e
base_lm_2 1657.745 1706.5505 1796.3631 1744.3945 1825.7435 4761.020 100 e
lm.fit_1 94.212 106.9020 112.3093 111.2235 116.7010 147.192 100 a
lm.fit_2 124.220 129.8080 134.4455 132.9830 138.2510 156.166 100 a
lm.fit_3 853.906 873.9035 902.5856 889.9715 917.9375 1028.415 100 cd
lm.fit_4 991.238 1006.7015 1213.7061 1021.5325 1045.8980 19379.399 100 d
fastLm1 368.289 390.7805 434.1467 422.0855 476.9085 584.761 100 a c
fastLm2 416.645 441.8660 481.0027 462.8850 514.0755 617.619 100 a c
anova_base 2021.982 2099.8755 2322.2707 2190.3340 2246.7800 15345.093 100 f
Jay_fastLM 202.026 218.2580 229.6244 226.3405 238.9490 303.964 100 ab
Jay_lm.fit 200.028 216.0805 234.0143 229.7580 246.1870 292.268 100 ab
Donald 549.425 582.8105 612.6990 605.4400 625.5340 1079.989 100 bc
We could hack stats:::anova.lmlist
so that it works for lists produced by .lm.fit
(notice the dot) and RcppArmadillo::fastLm
. Please check stats:::anova.lmlist
, if I didn't delete stuff you need!
anova2 <- function(object, ...) {
objects <- list(object, ...)
ns <- vapply(objects, function(x) length(x$residuals), 0L)
stopifnot(!any(ns != ns[1L]))
resdf <- vapply(objects, df.residual, 0L)
resdev <- vapply(objects, function(x) crossprod(residuals(x)), 0)
bigmodel <- order(resdf)[1L]
dfs <- c(NA, -diff(resdf))[bigmodel]
ssq <- c(NA, -diff(resdev))[bigmodel]
df.scale <- resdf[bigmodel]
scale <- resdev[bigmodel]/resdf[bigmodel]
fstat <- (ssq/dfs)/scale
p <- pf(fstat, abs(dfs), df.scale, lower.tail=FALSE)
return(c(F=fstat, p=p))
}
These wrappers make .lm.fit
and RcppArmadillo::fastLm
a little more convenient:
lm_fun <- function(fo, dat) {
X <- model.matrix(fo, dat)
fit <- .lm.fit(X, dat[[all.vars(fo)[1]]])
fit$df.residual <- dim(X)[1] - dim(X)[2]
return(fit)
}
fastLm_fun <- function(fo, dat) {
fit <- RcppArmadillo::fastLm(model.matrix(fo, dat), dat[[all.vars(fo)[1]]])
return(fit)
}
Use it
fit1 <- lm_fun(y ~ x1 + x3, data)
fit2 <- lm_fun(y ~ x2 + x3 + x4, data)
anova2(fit1, fit2)
# F p
# 0.3609728 0.5481032
## Or use `fastLm_fun` here, but it's slower.
Compare
anova(lm(y ~ x1 + x3, data), lm(y ~ x2 + x3 + x4, data))
# Analysis of Variance Table
#
# Model 1: y ~ x1 + x3
# Model 2: y ~ x2 + x3 + x4
# Res.Df RSS Df Sum of Sq F Pr(>F)
# 1 997 1003.7
# 2 996 1003.4 1 0.36365 0.361 0.5481
lm_fun
(which outperforms fastLm_fun
) combined with anova2
appears to be around 60% faster than the conventional approach:
microbenchmark::microbenchmark(
conventional={
fit1 <- lm(y ~ x1 + x3, data)
fit2 <- lm(y ~ x2 + x3 + x4, data)
anova(fit1, fit2)
},
anova2={ ## using `.lm.fit`
fit1 <- lm_fun(y ~ x1 + x3, data)
fit2 <- lm_fun(y ~ x2 + x3 + x4, data)
anova2(fit1, fit2)
},
anova2_Fast={ ## using `RcppArmadillo::fastLm`
fit1 <- fastLm_fun(y ~ x1 + x3, data)
fit2 <- fastLm_fun(y ~ x2 + x3 + x4, data)
anova2(fit1, fit2)
},
anova_Donald={
fit1 <- lm_fun(y ~ x1 + x3, data)
fit2 <- lm_fun(y ~ x2 + x3 + x4, data)
anova_Donald(fit1, fit2)
},
times=1000L,
control=list(warmup=100L)
)
# Unit: milliseconds
# expr min lq mean median uq max neval cld
# conventional 2.885718 2.967053 3.205947 3.018668 3.090954 40.15720 1000 d
# anova2 1.180683 1.218121 1.285131 1.233335 1.267833 23.81955 1000 a
# anova2_Fast 1.961897 2.012756 2.179458 2.037854 2.087893 26.65279 1000 c
# anova_Donald 1.368699 1.409198 1.561751 1.430562 1.472148 33.12247 1000 b
Data:
set.seed(42)
data <- data.frame(y=rnorm(1000), x1=rnorm(1000), x2=rnorm(1000), x3=rnorm(1000),
x4=rnorm(1000))