Search code examples
r parallel-processing mlr reproducible-research

mlr: why does reproducibility of hyperparameter tuning fail using parallelization?


I use code based on the Quickstart example in the mlr cheatsheet. I added parallelization and ran the parameter tuning several times.

Question: Why does reproducibility fail (why aren't the results identical) even if I set the seed every time before tuning? What is missing in my code? How should I modify the code to achieve reproducibility?

The code (on my PC it runs up to 1 min.):

# Reprex: tune the xgboost learning rate `eta` twice on a socket cluster,
# resetting the worker RNG streams before each run, so that the two tuning
# results (tr1, tr2) can be compared afterwards.
library(mlr)
#> Loading required package: ParamHelpers
library(parallel)
library(parallelMap)

# Load data
data(Soybean, package = "mlbench") 

# Initialize parallelization (socket mode, 2 worker processes)
parallelStartSocket(cpus = 2)
#> Starting parallelization in mode=socket with cpus=2.

# Prepare data, task, learner
soy = createDummyFeatures(Soybean, target = "Class")
tsk = makeClassifTask(data = soy, target = "Class")
# Draw a holdout split and keep only its training part for tuning
ho = makeResampleInstance("Holdout", tsk)
tsk.train = subsetTask(tsk, ho$train.inds[[1]])

lrn = makeLearner("classif.xgboost", nrounds = 10)
#> Warning in makeParam(id = id, type = "numeric", learner.param = TRUE, lower = lower, : NA used as a default value for learner parameter missing.
#> ParamHelpers uses NA as a special value for dependent parameters.

# Prepare for hyperparameter tuning: search `eta` in [0, 1] with MBO
ps = makeParamSet(makeNumericParam("eta", 0, 1))
tc = makeTuneControlMBO(budget = 1)

# Turn off excessive output
configureMlr(show.info = FALSE, show.learner.output = FALSE)

# Tune parameters twice with the same settings.
# NOTE: only the RNG streams of the cluster workers are reset before each
# run; the set.seed() calls for the master session are commented out.
suppressMessages({

    # set.seed(123456, "L'Ecuyer-CMRG")
    clusterSetRNGStream(iseed = 123456)
    tr1  = tuneParams(lrn, tsk.train, cv2, acc, ps, tc)

    # set.seed(123456, "L'Ecuyer-CMRG")
    clusterSetRNGStream(iseed = 123456)
    tr2  = tuneParams(lrn, tsk.train, cv2, acc, ps, tc)

})

# Stop parallelization
parallelStop()
#> Stopped parallelization. All cleaned up.

The results are not identical:

# Compare the two tuning results; all.equal() lists every component that differs
all.equal(tr1, tr2)
#>  [1] "Component \"x\": Component \"eta\": Mean relative difference: 0.1849302"                                                                                                             
#>  [2] "Component \"y\": Mean relative difference: 1.074668e-05"                                                                                                                             
#>  [3] "Component \"resampling\": Component \"train.inds\": Component 1: Numeric: lengths (228, 227) differ"                                                                                 
#>  [4] "Component \"resampling\": Component \"train.inds\": Component 2: Numeric: lengths (227, 228) differ"                                                                                 
#>  [5] "Component \"resampling\": Component \"test.inds\": Component 1: Numeric: lengths (227, 228) differ"                                                                                  
#>  [6] "Component \"resampling\": Component \"test.inds\": Component 2: Numeric: lengths (228, 227) differ"                                                                                  
#>  [7] "Component \"mbo.result\": Component \"x\": Component \"eta\": Mean relative difference: 0.1849302"                                                                                   
#>  [8] "Component \"mbo.result\": Component \"y\": Mean relative difference: 1.074668e-05"                                                                                                   
#>  [9] "Component \"mbo.result\": Component \"opt.path\": Component \"env\": Component \"exec.time\": Mean relative difference: 0.1548913"                                                   
#> [10] "Component \"mbo.result\": Component \"opt.path\": Component \"env\": Component \"path\": Component \"eta\": Mean relative difference: 0.773126"                                      
#> [11] "Component \"mbo.result\": Component \"opt.path\": Component \"env\": Component \"path\": Component \"y\": Mean relative difference: 0.03411588"                                      
#> [12] "Component \"mbo.result\": Component \"final.opt.state\": Component \"loop.starttime\": Mean absolute difference: 1.810968"                                                           
#> [13] "Component \"mbo.result\": Component \"final.opt.state\": Component \"opt.path\": Component \"env\": Component \"exec.time\": Mean relative difference: 0.1548913"                    
#> [14] "Component \"mbo.result\": Component \"final.opt.state\": Component \"opt.path\": Component \"env\": Component \"path\": Component \"eta\": Mean relative difference: 0.773126"       
#> [15] "Component \"mbo.result\": Component \"final.opt.state\": Component \"opt.path\": Component \"env\": Component \"path\": Component \"y\": Mean relative difference: 0.03411588"       
#> [16] "Component \"mbo.result\": Component \"final.opt.state\": Component \"opt.problem\": Component \"design\": Component \"eta\": Mean relative difference: 0.773126"                     
#> [17] "Component \"mbo.result\": Component \"final.opt.state\": Component \"opt.result\": Component \"mbo.result\": Component \"x\": Component \"eta\": Mean relative difference: 0.1849302"
#> [18] "Component \"mbo.result\": Component \"final.opt.state\": Component \"opt.result\": Component \"mbo.result\": Component \"y\": Mean relative difference: 1.074668e-05"                
#> [19] "Component \"mbo.result\": Component \"final.opt.state\": Component \"random.seed\": Mean relative difference: 1.28965"                                                               
#> [20] "Component \"mbo.result\": Component \"final.opt.state\": Component \"time.created\": Mean absolute difference: 5.489337"                                                             
#> [21] "Component \"mbo.result\": Component \"final.opt.state\": Component \"time.last.saved\": Mean absolute difference: 5.489337"                                                          
#> [22] "Component \"mbo.result\": Component \"final.opt.state\": Component \"time.used\": Mean relative difference: 0.6841712"

I also tried

set.seed(123456, "L'Ecuyer-CMRG")

instead of

parallel::clusterSetRNGStream(iseed = 123456)

and this did not lead to reproducibility.

But when parallelization is turned off, the results are identical with set.seed(123456, "L'Ecuyer-CMRG"), except for the start/end times and the duration.


Solution

  • The following code produces identical results for both tuning runs (except for the timings):

    # Reproducible hyperparameter tuning with mlr on a parallelMap socket
    # cluster: the master RNG and the worker RNG streams are both seeded
    # before every step that draws random numbers.
    library(mlr)
    library(parallel)
    library(parallelMap)

    # Single seed shared by the master session and the worker RNG streams
    seed = 123456

    # Load data
    data(Soybean, package = "mlbench")

    # Initialize parallelization
    parallelStartSocket(cpus = 2)

    # Prepare data, task, learner
    soy = createDummyFeatures(Soybean, target = "Class")
    tsk = makeClassifTask(data = soy, target = "Class")

    # The holdout split is drawn on the master, so seed the master first;
    # otherwise tsk.train (and with it every result) differs between sessions.
    set.seed(seed, "L'Ecuyer-CMRG")
    ho = makeResampleInstance("Holdout", tsk)
    tsk.train = subsetTask(tsk, ho$train.inds[[1]])

    lrn = makeLearner("classif.xgboost", nrounds = 10)

    # Prepare for hyperparameter tuning
    ps = makeParamSet(makeNumericParam("eta", 0, 1))
    tc = makeTuneControlMBO(budget = 1)

    # Turn off excessive output
    configureMlr(show.info = FALSE, show.learner.output = FALSE)

    # Run one tuning pass from a fixed RNG state.
    #   seed: integer seed applied to the master session (set.seed) and to
    #         the worker RNG streams (clusterSetRNGStream).
    # Returns the result of tuneParams().
    # Both seeds are needed: the master draws the resampling instance, the
    # workers draw whatever randomness the parallelized evaluations use.
    tuneSeeded = function(seed) {
      set.seed(seed, "L'Ecuyer-CMRG")
      clusterSetRNGStream(iseed = seed)
      tuneParams(lrn, tsk.train, cv2, acc, ps, tc)
    }

    # Tune parameters twice from the same RNG state
    suppressMessages({
      tr1 = tuneSeeded(seed)
      tr2 = tuneSeeded(seed)
    })

    # Stop parallelization
    parallelStop()
    

    What did I change? I also set the seed of the local (master) session. Why? Because reproducibility is not only about seeding the parallel worker processes. The seed on the main machine matters as well, since it influences e.g. the resampling, which is drawn on the master.