Here is my sample code that I want to speed up
df <- as.data.frame(cbind(1:100,cumsum(rnorm(100))))
colnames(df) <- c("time","val")
n <- 10
roll_lm <- rep(NA,nrow(df))
for(i in n:nrow(df)){
roll_df <- df[(i-(n-1)):i,]
mod <- lm(val ~ time , roll_df)
roll_lm[i] <- tail(predict(mod, roll_df),1)
}
plot(df)
lines(roll_lm, col=4,lwd=2)
How can this code be made as fast as possible?
Here are different functions for different implementations
==========================UPD==========================
fu1 <- function(df,n=10){
roll_lm <- rep(NA,nrow(df))
for(i in n:nrow(df)){
roll_df <- df[(i-(n-1)):i,]
mod <- lm(val ~ time , roll_df)
roll_lm[i] <- tail(predict(mod, roll_df),1)
}
return(roll_lm)
}
fu2 <- function(df,n=10){
roll2 <- apply(embed(df$val, n), #creates a moving-window matrix
1, #apply by row
function(x) head(predict(lm(x ~ seq_along(x))), 1))
roll2 <- c(rep(NA, 9), roll2)
return(roll2)
}
fu3 <- function(df,n=10){
roll_lm <- rep(NA_real_, nrow(df))
for (i in n:nrow(df)) {
x <- df$time[(i-(n-1)):i]
y <- df$val[(i-(n-1)):i]
b1 <- cov(x, y) / var(x, x)
b0 <- mean(y) - b1*mean(x)
roll_lm[i] <- b0 + df$time[i] * b1
}
return(roll_lm)
}
fu4 <- function(df,n=10){
library(RcppRoll)
roll_lm <- rep(NA,nrow(df))
x <- df$time
y <- df$val
for(i in n:nrow(df)){
roll_x <- x[(i-(n-1)):i]
roll_y <- y[(i-(n-1)):i]
roll_mean_x <- roll_mean(roll_x, n)
roll_mean_y <- roll_mean(roll_y, n)
roll_cov_xy <- roll_sum((roll_x - roll_mean_x) * (roll_y - roll_mean_y), n) / (n - 1)
roll_var_x <- roll_var(roll_x, n)
beta_hat <- roll_cov_xy / roll_var_x
alpha_hat <- roll_mean_y - beta_hat * roll_mean_x
roll_lm[i] <- tail(alpha_hat + beta_hat * roll_x, 1)
}
return(roll_lm)
}
fu5 <- function(df,n=10){
library(rollRegres)
fit <- roll_regres(val ~ time, data = df, width = n)
return(fit$coefs[,1] + fit$coefs[,2] * df$time)
}
================================================
Here is a comparison of the performance of different implementations
microbenchmark::microbenchmark(fu1(df),fu2(df),fu3(df),fu4(df),fu5(df))
Unit: milliseconds
expr min lq mean median uq max neval
fu1(df) 315.878570 338.35775 379.62677 359.243318 390.317795 831.82362 100
fu2(df) 233.707004 245.90377 273.49740 256.383181 288.005876 697.46965 100
fu3(df) 13.957729 15.37773 18.01751 16.442809 18.554429 67.16131 100
fu4(df) 25.305796 27.50037 32.42579 29.063492 32.757900 100.82235 100
fu5(df) 1.782347 2.02923 2.79950 2.179755 3.030447 16.40774 100
Still slow... but not using the data.frame and calculate by hand instead of lm() seems to cut the time by more than 90%:
fu3 <- function(df, n=10){
roll_lm <- rep(NA_real_, nrow(df))
for (i in n:nrow(df)) {
r <- (i-n+1):i
x <- df$time[r]
y <- df$val [r]
b1 <- cov(x, y) / var(x, x)
b0 <- mean(y) - b1*mean(x)
roll_lm[i] <- b0 + b1*df$time[i]
}
return(roll_lm)
}
> microbenchmark::microbenchmark(fu1(df),fu2(df),fu3(df))
Unit: milliseconds
expr min lq mean median uq max neval
fu1(df) 50.9371 57.68270 74.039318 63.32395 84.15985 205.5834 100
fu2(df) 42.8663 47.35755 61.422303 53.56985 67.13825 201.6872 100
fu3(df) 2.4844 2.62870 3.745455 2.85190 3.65145 17.2057 100