I would like to use R's mlr3* packages to build ML models in a reproducible manner. I tried the regr.glmboost learner with the mbo tuner and a run_time terminator. I have experimented with the HPO setup, but I have not managed to make it reproducible at higher runtimes. Where did I go wrong?
Here is a reprex demonstrating the phenomenon:
library(mlr3verse)
library(mlr3mbo)
library(mlr3misc)
library(magrittr)
library(nycflights13)
dt <- as.data.table(weather)
dt <- dt[order(time_hour), .(
  origin = as.factor(origin),
  month = as.factor(month),
  hour = as.factor(hour),
  temp, dewp, humid, wind_dir, wind_speed, precip, visib, pressure,
  time_hour = as.numeric(time_hour)
)]
dt <- na.omit(dt)
best_ones <- map_dtr(
  1L:3L,
  function(i) {
    my_learner <- lrn("regr.glmboost",
      family = to_tune(p_fct(levels = c("Gaussian", "Laplace", "Huber"))),
      nuirange = to_tune(p_dbl(lower = 0, upper = 1000, logscale = FALSE)),
      mstop = to_tune(p_int(lower = 1, upper = 3, trafo = function(x) 10**x)),
      nu = to_tune(p_dbl(lower = 0.01, upper = 0.3, logscale = TRUE)),
      risk = to_tune(p_fct(levels = c("inbag", "oobag", "none"))),
      trace = to_tune(c(TRUE, FALSE)),
      stopintern = to_tune(c(TRUE, FALSE))
    )
    my_task <- as_task_regr(
      x = dt,
      target = "pressure",
      id = "weather_data"
    )
    my_instance <- ti(
      task = my_task,
      learner = my_learner,
      resampling = rsmp("cv", folds = 3),
      measure = msr("regr.mae"),
      terminator = trm("run_time", secs = 300)
    )
    my_tuner <- tnr("mbo")
    set.seed(1234L, kind = "L'Ecuyer-CMRG")
    my_tuner$optimize(my_instance)
    my_instance$archive$best()
  }
)
best_ones[]
These are the somewhat diverse hyperparameters that I got:
| family | nuirange | mstop | nu | risk | trace | stopintern | regr.mae | warnings | errors | runtime_learners | uhash | timestamp | batch_nr | acq_ei | .already_evaluated |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| Huber | 841.3256 | 3 | -2.794395 | inbag | FALSE | FALSE | 5.090834 | 0 | 0 | 9.656 | 01cf38ab-3dc6-4490-b36e-1c14325e42ad | 2023-01-10 17:08:15 | 26 | 0.0010821 | FALSE |
| Huber | 849.4117 | 3 | -2.774291 | oobag | FALSE | FALSE | 5.094204 | 0 | 0 | 9.646 | 6579c965-9184-4fe3-8e01-c1b10df21782 | 2023-01-10 17:11:56 | 18 | 0.0021940 | FALSE |
| Huber | 855.7414 | 3 | -2.878846 | oobag | FALSE | FALSE | 5.096876 | 0 | 0 | 9.497 | 458122cc-f51c-4d81-a6d2-93dc024baa58 | 2023-01-10 17:16:22 | 15 | 0.0090615 | FALSE |
I guess the issue is around seeding, but I do not know how to handle it properly in this case. Any help would be appreciated!
So I think there are two possible sources of error in your code:
1. The run_time terminator: with a wall-clock budget, slightly different runtimes mean a different number of evaluated configurations, so the archives (and therefore the best configurations) can differ between runs even with identical seeding. I replaced it with an evals terminator.
2. The placement of set.seed(): I moved it to the very beginning of the function passed to map_dtr(...), just to be sure it comes before anything that consumes random numbers.
Addressing both seems to yield reproducible results (some of the code is slightly adjusted to keep the runtime lower).
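To see the terminator change in isolation (a minimal sketch; both constructors appear in your code and mine and are re-exported by mlr3verse from bbotk):

# wall-clock budget: how many configurations fit into 300 s varies from run
# to run, so the archives differ even with identical seeding
trm("run_time", secs = 300)
# fixed evaluation budget: every run evaluates the same number of configurations
trm("evals", n_evals = 2)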
I hope this also works for you.
library(mlr3verse)
#> Loading required package: mlr3
library(mlr3mbo)
#> Loading required package: mlr3tuning
#> Loading required package: paradox
library(mlr3misc)
library(magrittr)
#>
#> Attaching package: 'magrittr'
#> The following objects are masked from 'package:mlr3misc':
#>
#> set_class, set_names
library(nycflights13)
dt <- as.data.table(weather)
dt <- dt[order(time_hour), .(
  origin = as.factor(origin),
  month = as.factor(month),
  hour = as.factor(hour),
  temp, dewp, humid, wind_dir, wind_speed, precip, visib, pressure,
  time_hour = as.numeric(time_hour)
)]
dt <- na.omit(dt)
best_ones <- map_dtr(
  1L:3L,
  function(i) {
    # change 2: seed at the very top of the function, before any RNG use
    set.seed(1234L, kind = "L'Ecuyer-CMRG")
    my_learner <- lrn("regr.glmboost",
      family = to_tune(p_fct(levels = c("Gaussian", "Laplace", "Huber"))),
      nuirange = to_tune(p_dbl(lower = 0, upper = 1000, logscale = FALSE)),
      mstop = to_tune(p_int(lower = 1, upper = 3, trafo = function(x) 10**x)),
      nu = to_tune(p_dbl(lower = 0.01, upper = 0.3, logscale = TRUE)),
      risk = to_tune(p_fct(levels = c("inbag", "oobag", "none"))),
      trace = to_tune(c(TRUE, FALSE)),
      stopintern = to_tune(c(TRUE, FALSE))
    )
    my_task <- as_task_regr(
      x = dt,
      target = "pressure",
      id = "weather_data"
    )
    my_instance <- ti(
      task = my_task,
      learner = my_learner,
      resampling = rsmp("holdout"), # cheaper than 3-fold CV, keeps runtime low
      measure = msr("regr.mae"),
      # change 1: deterministic budget instead of a wall-clock budget
      terminator = trm("evals", n_evals = 2)
    )
    my_tuner <- tnr("mbo")
    my_tuner$optimize(my_instance)
    my_instance$archive$best()
  }
)
#> INFO [22:33:53.565] [bbotk] Starting to optimize 7 parameter(s) with '<OptimizerMbo>' and '<TerminatorEvals> [n_evals=2, k=0]'
#> ... (A LOT OF LOG OUTPUT THAT IS OMITTED)
best_ones[]
#> family nuirange mstop nu risk trace stopintern regr.mae warnings
#> 1: Huber 85.45087 3 -2.572761 none FALSE FALSE 4.74076 0
#> 2: Huber 85.45087 3 -2.572761 none FALSE FALSE 4.74076 0
#> 3: Huber 85.45087 3 -2.572761 none FALSE FALSE 4.74076 0
#> errors runtime_learners uhash x_domain
#> 1: 0 3.406 653adf83-6fbc-4ef5-b6dd-7e12e97b49d6 <list[7]>
#> 2: 0 3.773 f70dec6a-073a-45c8-b795-29ef5b662625 <list[7]>
#> 3: 0 3.673 dc18e13f-2e7d-4fe9-9845-187ebf0db3b3 <list[7]>
#> timestamp batch_nr
#> 1: 2023-01-10 22:34:17 1
#> 2: 2023-01-10 22:34:41 1
#> 3: 2023-01-10 22:35:05 1
Created on 2023-01-10 by the reprex package (v2.0.1)
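As a quick sanity check (not part of the reprex above; assuming the column names shown in the printed output), you could assert the reproducibility programmatically instead of eyeballing the table:

# all three repetitions should report the identical best score and configuration
stopifnot(length(unique(best_ones$regr.mae)) == 1L)
stopifnot(nrow(unique(best_ones[, .(family, nuirange, mstop, nu, risk)])) == 1L)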