# Does it make sense to use parallel with a Bayesian regression?

Tags: r, performance, parallel-processing, regression

I am running a Bayesian regression in R with 1000 chains and it is taking too long. In an attempt to improve performance I tried `parallel`. I read that this technique is useful only when there are tasks that can be performed independently of each other. Since I see the Markov chains as independent iterations, I believed this was the case. However, I am getting worse speed than without `parallel`.

If the problem is that `parallel` is not useful in this case, is there another way to make it faster?

These are the libraries and replicable datasets:

```
library(rstan)
library(rstanarm)
library(tidyverse)
library(stringr)
library(openxlsx)
library(tictoc)
library(parallel)

tasas_ori <- data.frame(
  gedad = rep(c("A", "B", "C", "D", "E", "F"), 6),
  employment = c(rep(20:25, 6),
                 rep(20:25 + 1, 6),
                 rep(20:25 + 2, 6),
                 rep(20:25 + 3, 6),
                 rep(20:25 + 4, 6),
                 rep(20:25 + 5, 6)),
  anio = c(rep(2006, 6), rep(2012, 6), rep(2014, 6),
           rep(2018, 6), rep("2018 - aj.", 6), rep(2023, 6))
)
# Note: the filter strings must match the level "2018 - aj." exactly
tasas <- tasas_ori %>%
  filter(!(anio %in% c("2018 - aj.", "2018"))) %>%
  mutate(anio = as.numeric(anio) - 2006)
tasas2 <- tasas_ori %>%
  filter(!(anio %in% c("2018 - aj."))) %>%
  mutate(anio = as.numeric(anio) - 2006)
```

This is the code with parallel:

```
set.seed(1984)
library(foreach)     # provides foreach() and %dopar%
library(doParallel)  # provides registerDoParallel()
coefficients_by <- vector("list")
predictions_by <- vector("list")
tic()
num_cores <- parallel::detectCores()
cl <- makeCluster(num_cores - 2)
registerDoParallel(cl)
for (i in 1:3) {
  # Parallel loop to fit stan_glm models
  bayes_reg <- foreach(i = 1:1, .combine = "c") %dopar% {
    library(rstan)    # load rstan within the parallel worker
    library(rstanarm) # load rstanarm within the parallel worker
    library(dplyr)
    stan_glm(employment ~ anio,
             data = tasas %>% filter(gedad == unique(tasas$gedad)[1]),
             chains = 50,
             # prior = normal(normal_reg$coefficients[2], summary(normal_reg)$coefficients[4]),
             refresh = 0
    )
  }
  summary(bayes_reg, digits = 5) %>% print()
  coefficients_by[[i]] <- as.data.frame(bayes_reg$coefficients)
  predictions_by[[i]] <- as.data.frame(predict(bayes_reg, newdata = tasas2 %>%
    filter(gedad == unique(tasas$gedad)[i])))
}
# Stop the parallel processing
stopCluster(cl)
toc()
```

And this is the code without parallel:

```
# Regresión por grupos etarios (regression by age group)
coefficients_by <- vector("list")
predictions_by <- vector("list")
bayes <- vector("list")
set.seed(1984)
tic()
for (i in 1:3) {
  cat(paste("\n", "**** Regresión bayesiana - Grupo de edad:", unique(tasas$gedad)[i], "****", "\n"))
  bayes_reg <- stan_glm(employment ~ anio,
                        data = tasas %>%
                          filter(gedad == unique(tasas$gedad)[i]),
                        chains = 50,
                        # prior_intercept = normal(normal_reg$coefficients[1], summary(normal_reg)$coefficients[3]),
                        # prior = normal(normal_reg$coefficients[2], summary(normal_reg)$coefficients[4]),
                        refresh = 0
  )
  summary(bayes_reg, digits = 5) %>% print()
  coefficients_by[[i]] <- as.data.frame(bayes_reg$coefficients)
  predictions_by[[i]] <- as.data.frame(predict(bayes_reg, newdata = tasas2 %>%
    filter(gedad == unique(tasas$gedad)[i])))
}
toc()
```

# Solution

I think the problem is that you're not actually using `foreach()` as it's intended. The `foreach()` function should execute the loop itself. Looping over a single value (`i = 1:1`) means nothing actually runs in parallel; in fact, you're paying all the startup cost of making a cluster without getting any of the benefit. If you let `foreach()` do the looping, you'll see a substantial speedup, though not one proportional to the number of cores.
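To make the distinction concrete, here is a standalone sketch with toy one-second tasks instead of Stan models (the function name `slow_square` is made up for illustration). Because the index inside `foreach()` covers all three tasks, each one is shipped to a different worker:

```
library(foreach)
library(doParallel)

cl <- makeCluster(3)
registerDoParallel(cl)

# Three one-second tasks run concurrently, so this takes ~1 second,
# not ~3 seconds: foreach() itself distributes the iterations.
slow_square <- function(x) { Sys.sleep(1); x^2 }
res <- foreach(i = 1:3, .combine = "c") %dopar% slow_square(i)
res
#> [1] 1 4 9

stopCluster(cl)
```

With `foreach(i = 1:1, ...)` instead, all the work would land on a single worker and the cluster setup cost would be pure overhead.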

```
library(rstan)
library(rstanarm)
library(dplyr)
library(tictoc)

tasas_ori <- data.frame(
  gedad = rep(c("A", "B", "C", "D", "E", "F"), 6),
  employment = c(rep(20:25, 6),
                 rep(20:25 + 1, 6),
                 rep(20:25 + 2, 6),
                 rep(20:25 + 3, 6),
                 rep(20:25 + 4, 6),
                 rep(20:25 + 5, 6)),
  anio = c(rep(2006, 6), rep(2012, 6), rep(2014, 6),
           rep(2018, 6), rep(2019, 6), rep(2023, 6))
)
tasas <- tasas_ori %>%
  filter(!(anio %in% c(2018, 2019))) %>%
  mutate(anio = as.numeric(anio) - 2006)
```

In the parallel code below, I save each result as a list, so the output from `foreach(i = 1:3, ...) %dopar% ...` is a list with three elements, one for each of the models run. I set up a cluster with three cores (since the loop is over three values, more cores wouldn't help). The result takes 4.6 seconds to compute.
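The `list(res)` wrapper is what makes this work: with `.combine = "c"`, `foreach()` concatenates the per-iteration results, and `c()` applied to one-element lists yields a single flat list of fitted models. A small sketch with plain strings standing in for the models:

```
library(foreach)

# %do% runs sequentially, which is enough to show the combining behaviour
out <- foreach(i = 1:3, .combine = "c") %do% list(paste0("model_", i))
str(out)
#> List of 3
#>  $ : chr "model_1"
#>  $ : chr "model_2"
#>  $ : chr "model_3"
```

Without the `list()` wrapper, `c()` would try to concatenate the model objects themselves, flattening their internals into one long list.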

```
set.seed(1984)
library(foreach)
library(doParallel)
#> Loading required package: iterators
#> Loading required package: parallel
pcl <- makeCluster(3)
registerDoParallel(pcl)
tic()
bayes_reg <- foreach(i = 1:3, .combine = "c") %dopar% {
  library(rstanarm)
  library(dplyr)
  # Recreate the data on each worker, since workers start with a fresh session
  tasas_ori <- data.frame(
    gedad = rep(c("A", "B", "C", "D", "E", "F"), 6),
    employment = c(rep(20:25, 6),
                   rep(20:25 + 1, 6),
                   rep(20:25 + 2, 6),
                   rep(20:25 + 3, 6),
                   rep(20:25 + 4, 6),
                   rep(20:25 + 5, 6)),
    anio = c(rep(2006, 6), rep(2012, 6), rep(2014, 6),
             rep(2018, 6), rep(2019, 6), rep(2023, 6))
  )
  tasas <- tasas_ori %>%
    filter(!(anio %in% c(2018, 2019))) %>%
    mutate(anio = as.numeric(anio) - 2006)
  res <- stan_glm(employment ~ anio,
                  data = tasas %>% filter(gedad == unique(tasas$gedad)[i]),
                  chains = 50,
                  # prior = normal(normal_reg$coefficients[2], summary(normal_reg)$coefficients[4]),
                  refresh = 0
  )
  list(res)  # wrap in a list so .combine = "c" returns a list of models
}
stopCluster(pcl)
toc()
#> 4.62 sec elapsed
```

In the code below, I use a `for()` loop without parallelization, just like you did. The result takes 7.4 seconds to compute, so the `for()` loop is 7.4/4.6 ≈ 1.6 times, or about 60%, slower. This is about what you might expect: the setup and communication overhead means you're not going to get 3x the performance from 3x the cores.

```
tic()
bayes_reg2 <- vector(mode = "list", length = 3)
for (i in 1:3) {
  bayes_reg2[[i]] <- stan_glm(employment ~ anio,
                              data = tasas %>% filter(gedad == unique(tasas$gedad)[i]),
                              chains = 50,
                              # prior = normal(normal_reg$coefficients[2], summary(normal_reg)$coefficients[4]),
                              refresh = 0)
}
toc()
#> 7.386 sec elapsed
```
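As an aside on the original question ("is there another way to make it faster?"): rstanarm can also parallelize the chains within a single model fit via the `cores` argument, which is often simpler than wrapping the call in `foreach()`. A sketch, assuming the `tasas` data from above and at least four available cores:

```
library(rstanarm)
library(dplyr)

fit <- stan_glm(employment ~ anio,
                data    = tasas %>% filter(gedad == unique(tasas$gedad)[1]),
                chains  = 4,
                cores   = 4,  # chains run in parallel within this one fit
                refresh = 0)
```

Note the two strategies compete for the same cores: if you run model fits in parallel with `foreach()`, leave `cores = 1` inside each worker, as nested parallelism tends to oversubscribe the machine.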

Created on 2023-09-03 with reprex v2.0.2
