Search code examples
Tags: r, foreach, parallel-processing, r-caret, doparallel

How to get same results using loop and parallel in R?


I am testing the influence of the training data on classification accuracy, using the iris data as an example. I noticed that I get the best accuracy in iteration 33, and I would like to use the training set (irisTrain) from that iteration for further analysis, but I don't know how to reproduce it. I do not want to save the training set in every iteration because it is large; I would only like to recover the set from iteration 33. I tried clusterSetRNGStream() and then used the same seed in a sequential loop, but it did not give the same results.

library(randomForest)
library(caret)
library(foreach)
library(doParallel)

# Question script: runs 50 iterations in parallel; each iteration draws a
# random partition of iris with p = 0.5, fits a random forest on one part and
# evaluates it on the other.
# No RNG seed is set anywhere in this block, so the partitions (and therefore
# the accuracies) are not expected to repeat between runs.
results_overall <- data.frame()

# Start a cluster with one worker fewer than the number of detected cores.
cores = detectCores()
cl = makeCluster(cores - 1)

registerDoParallel(cl)
# Per-iteration return values are combined with rbind into `res`.
res <- foreach(i = 1:50, .packages = c("caret", "randomForest"), .combine = rbind) %dopar% {

# Row indices for the training part; the remaining rows form the test part.
trainIndex <- caret::createDataPartition(iris$Species, p = 0.5, list = FALSE)
irisTrain <- iris[ trainIndex,]
irisTest  <- iris[-trainIndex,]

# Columns 1:4 are used as predictors and column 5 as the response.
model <- randomForest(x = irisTrain[,c(1:4)], y = irisTrain[,5], importance = TRUE,
                                            replace = TRUE, mtry = 4, ntree = 500, na.action=na.omit,
                                            do.trace = 100, type = "classification")

pred_test <- predict(model, irisTest[,c(1:4)])
con.mat_test <- confusionMatrix(pred_test, irisTest[,5], mode ="everything")

# NOTE(review): this reassignment runs inside the %dopar% body, i.e. on a
# worker; the `results_overall` created above in the calling session is
# presumably never updated, and what accumulates here may depend on which
# worker handles which iterations -- verify before relying on it.
results_overall <- rbind(results_overall, con.mat_test[["overall"]])

# Each iteration returns a one-row tibble holding the iteration number and the
# current `results_overall` object.
return(tibble::tribble(~iteration, ~overall, 
                       i, results_overall))
}
stopCluster(cl)

Solution

  • Your iterations give different results because of the random sampling performed by caret::createDataPartition. To make this reproducible, you can use the doRNG package, which was written for exactly this purpose - a big thanks to @HenrikB for enlightening me about this!

    Edit: fixed the foreach function (did not change the result)

    # Reproducible parallel evaluation: 50 random partitions of iris (p = 0.5),
    # one random forest per partition, one row of overall metrics per iteration.
    #
    # Packages are attached with library() so that a missing package stops the
    # script immediately; require() would only return FALSE and the failure
    # would surface later as a confusing "could not find function" error.
    suppressPackageStartupMessages({
      library(data.table)
      library(randomForest)
      library(caret)
      library(foreach)
      library(doRNG)
      library(rngtools)
      library(doParallel)
    })

    # detectCores() can return 1 or NA; always ask for at least one worker so
    # makeCluster() is never called with 0 or NA nodes.
    cores <- detectCores()
    cl <- makeCluster(max(1L, cores - 1L, na.rm = TRUE))
    registerDoParallel(cl)

    # %dorng% together with .options.RNG = 1234 makes the random draws of every
    # iteration reproducible; the RNG states used are attached to the result
    # as its "rng" attribute.
    res <- tryCatch(
      foreach(
        i = 1:50,
        .packages = c("caret", "randomForest", "data.table"),
        .combine = rbind,
        .options.RNG = 1234
      ) %dorng% {
        # Row indices of the training part; all other rows are the test part.
        trainIndex <- caret::createDataPartition(iris$Species, p = 0.5, list = FALSE)
        irisTrain <- iris[trainIndex, ]
        irisTest <- iris[-trainIndex, ]

        # Columns 1:4 are the predictors, column 5 is the response.
        model <- randomForest(
          x = irisTrain[, c(1:4)], y = irisTrain[, 5], importance = TRUE,
          replace = TRUE, mtry = 4, ntree = 500, na.action = na.omit,
          do.trace = 100, type = "classification"
        )
        pred_test <- predict(model, irisTest[, c(1:4)])
        con.mat_test <- confusionMatrix(pred_test, irisTest[, 5], mode = "everything")

        # One row per iteration: the iteration number plus the overall metrics.
        data.table(Iteration = i, t(con.mat_test[["overall"]]))
      },
      # Shut the workers down even when an iteration fails.
      finally = stopCluster(cl)
    )
    # Pull the "rng" attribute off the foreach result; one of its elements is
    # handed to rngtools::setRNG() further down to replay a single iteration.
    seeds <- attr(res, "rng")

    # NOTE(review): which.min() selects the row with the LOWEST accuracy, even
    # though the result is stored as `best.seed`; use which.max(Accuracy) if
    # the highest-accuracy iteration is wanted. Left as is so that the recorded
    # output below stays valid.
    picked_row <- res[which.min(Accuracy), ]
    picked_row
    #>    Iteration  Accuracy Kappa AccuracyLower AccuracyUpper AccuracyNull
    #> 1:         6 0.9066667  0.86     0.8171065     0.9616461    0.3333333
    #>    AccuracyPValue McnemarPValue
    #> 1:    4.39803e-25           NaN

    # Iteration numbers start at 1, so the number doubles as an index into `seeds`.
    best.seed <- picked_row$Iteration
    
    # Replay the selected iteration in the main session: restore the RNG state
    # stored for it, then repeat the partition and model fit in the same order
    # as inside the foreach body so the random draws line up.
    rngtools::setRNG(seeds[[best.seed]])
    trainIndex <- caret::createDataPartition(y = iris$Species, p = 0.5, list = FALSE)
    irisTest  <- iris[-trainIndex, ]
    irisTrain <- iris[trainIndex, ]

    # Columns 1:4 are the predictors, column 5 is the response.
    predictor_cols <- 1:4
    model <- randomForest(
      x = irisTrain[, predictor_cols],
      y = irisTrain[, 5],
      ntree = 500,
      mtry = 4,
      replace = TRUE,
      importance = TRUE,
      do.trace = 100,
      na.action = na.omit,
      type = "classification"
    )
    #> ntree      OOB      1      2      3
    #>   100:   4.00%  0.00%  4.00%  8.00%
    #>   200:   2.67%  0.00%  4.00%  4.00%
    #>   300:   2.67%  0.00%  4.00%  4.00%
    #>   400:   2.67%  0.00%  4.00%  4.00%
    #>   500:   4.00%  0.00%  4.00%  8.00%
    pred_test <- predict(model, newdata = irisTest[, predictor_cols])
    con.mat_test <- confusionMatrix(
      data = pred_test,
      reference = irisTest[, 5],
      mode = "everything"
    )
    # The metrics below equal the row printed for this iteration from `res`.
    con.mat_test[["overall"]]
    #>       Accuracy          Kappa  AccuracyLower  AccuracyUpper   AccuracyNull 
    #>   9.066667e-01   8.600000e-01   8.171065e-01   9.616461e-01   3.333333e-01 
    #> AccuracyPValue  McnemarPValue 
    #>   4.398030e-25            NaN
    

    Created on 2020-05-05 by the reprex package (v0.3.0)