Search code examples
rforeachparallel-processingdoparallel

Foreach and doparallel instead of for loop in R


I need to speed up the for loop through multithreading. I would like to use the libraries for this: foreach and doParallel. I used these packages before but only for processes where one result table was needed. I don't know how to use them to export multiple tables (here results tables). My problem is much more complex and requires exporting many result sets. Here, for simplicity, I use iris data.

library(randomForest)
library(caret)

# Repeatedly split iris into training and test halves with
# caret::createDataPartition(), fit a random forest on the training half and
# collect the confusion-matrix statistics computed on the test half.
n_iter <- 50L

# Preallocate one slot per iteration. The pieces are bound together once after
# the loop instead of growing a data frame with rbind() on every pass, which
# copies the accumulated result each time and depends on how rbind() treats
# the initial empty data frame.
class_stats <- vector("list", n_iter)
overall_stats <- vector("list", n_iter)

for (i in seq_len(n_iter)) {
  trainIndex <- caret::createDataPartition(iris$Species, p = 0.5, list = FALSE)
  irisTrain <- iris[trainIndex, ]
  irisTest <- iris[-trainIndex, ]

  model <- randomForest(x = irisTrain[, 1:4], y = irisTrain[, 5],
                        importance = TRUE, replace = TRUE, mtry = 4,
                        ntree = 500, na.action = na.omit,
                        do.trace = 100, type = "classification")

  pred_test <- predict(model, irisTest[, 1:4])
  con.mat_test <- confusionMatrix(pred_test, irisTest[, 5], mode = "everything")

  # "byClass" is the per-class statistics table; "overall" is the set of
  # overall statistics for this iteration.
  class_stats[[i]] <- as.data.frame(con.mat_test[["byClass"]])
  overall_stats[[i]] <- con.mat_test[["overall"]]
}

# One rbind() per result table: per-class rows stacked across iterations, and
# one row of overall statistics per iteration.
results_class <- do.call(rbind, class_stats)
results_overall <- as.data.frame(do.call(rbind, overall_stats))

Solution

  • As far as I know it's not easy (or even possible) to modify variables outside of the foreach loop, so what about storing multiple results in one nested tibble?

    library(randomForest)
    library(caret)
    library(foreach)
    library(doParallel)
    
    # Set up parallel computing: one worker per logical core, falling back to
    # a single worker when the core count cannot be determined (NA).
    n_cores <- detectCores(logical = TRUE)
    if (is.na(n_cores)) {
      n_cores <- 1L
    }
    cl <- makeCluster(n_cores)
    registerDoParallel(cl)

    # Each iteration returns a one-row tibble whose list columns hold that
    # iteration's result tables; .combine = rbind stacks those rows into `res`.
    # tryCatch(finally = ) shuts the cluster down even if an iteration fails.
    res <- tryCatch(
      foreach(i = seq_len(50L),
              .packages = c("caret", "randomForest"),
              .combine = rbind) %dopar% {
        trainIndex <- caret::createDataPartition(iris$Species, p = 0.5,
                                                 list = FALSE)
        irisTrain <- iris[trainIndex, ]
        irisTest <- iris[-trainIndex, ]

        model <- randomForest(x = irisTrain[, 1:4], y = irisTrain[, 5],
                              importance = TRUE, replace = TRUE, mtry = 4,
                              ntree = 500, na.action = na.omit,
                              do.trace = 100, type = "classification")

        pred_test <- predict(model, irisTest[, 1:4])
        con.mat_test <- confusionMatrix(pred_test, irisTest[, 5],
                                        mode = "everything")

        # Per-class statistics: the row names carry a "Class: " prefix, which
        # is stripped to obtain the plain class label.
        by_class <- data.frame(con.mat_test[["byClass"]])
        by_class$class <- sub("^Class: ", "", rownames(by_class))

        # Overall statistics: the names are metric names, not class labels, so
        # they are kept verbatim instead of having a prefix cut off.
        overall_stats <- con.mat_test[["overall"]]
        overall <- data.frame(value = unname(overall_stats),
                              metric = names(overall_stats),
                              row.names = names(overall_stats))

        # Save the output data frames in a tibble as list columns
        tibble::tibble(iteration = i,
                       class = list(by_class),
                       overall = list(overall))
      },
      finally = {
        # Stop the cluster and restore the sequential backend
        stopCluster(cl)
        registerDoSEQ()
      }
    )
    

    The output is then as follows:

    > print(res)
    # A tibble: 50 x 3
       iteration class              overall         
           <int> <list>             <list>          
     1         1 <df[,12] [3 x 12]> <df[,2] [7 x 2]>
     2         2 <df[,12] [3 x 12]> <df[,2] [7 x 2]>
     3         3 <df[,12] [3 x 12]> <df[,2] [7 x 2]>
     4         4 <df[,12] [3 x 12]> <df[,2] [7 x 2]>
     5         5 <df[,12] [3 x 12]> <df[,2] [7 x 2]>
     6         6 <df[,12] [3 x 12]> <df[,2] [7 x 2]>
     7         7 <df[,12] [3 x 12]> <df[,2] [7 x 2]>
     8         8 <df[,12] [3 x 12]> <df[,2] [7 x 2]>
     9         9 <df[,12] [3 x 12]> <df[,2] [7 x 2]>
    10        10 <df[,12] [3 x 12]> <df[,2] [7 x 2]>
    # ... with 40 more rows