machine-learning scikit-learn random-forest skopt bayessearchcv

Error using BayesSearchCV from skopt on RandomForestClassifier

This is the code to reproduce the error:

from sklearn.ensemble        import RandomForestClassifier
from sklearn.linear_model    import LogisticRegression
from scipy.stats import loguniform
from skopt import BayesSearchCV
from sklearn.datasets import load_iris
import numpy as np

# Load the iris dataset as a (features, target) pair used by both searches below.
X, y = load_iris(return_X_y=True)

# Candidate hyperparameter values, keyed by estimator class name.
# The numeric candidates are sampled at random when this code runs, so the
# lists differ between runs and a list may contain repeated values.
# NOTE(review): repeated values in these lists are the suspected trigger of
# the KeyError seen with RandomForestClassifier -- confirm.
grid = {}

# Logistic regression: 50 log-uniform draws for C plus fixed option lists.
grid['LogisticRegression'] = {
    'C': loguniform.rvs(0.1, 10000, size=50),
    'solver': ['lbfgs', 'saga'],
    'penalty': ['l2'],
    'warm_start': [False, True],
    'class_weight': [None, 'balanced'],
    'max_iter': [100, 1000],
    'n_jobs': [10],
}

# Random forest: 10 random integers per integer-valued parameter and
# 5 log-uniform draws for max_features.
grid['RandomForestClassifier'] = {
    'n_estimators': np.random.randint(5, 200, size=10),
    'criterion': ['gini', 'entropy'],
    'max_depth': np.random.randint(5, 50, size=10),
    'min_samples_split': np.random.randint(5, 50, size=10),
    'min_samples_leaf': np.random.randint(5, 50, size=10),
    'max_features': loguniform.rvs(0.2, 1.0, size=5),
    'n_jobs': [10],
}

# Keyword arguments forwarded unchanged to every BayesSearchCV constructed below.
tuner_params = dict(
    cv=2,
    n_jobs=10,
    scoring='roc_auc_ovr',
    return_train_score=True,
    refit=True,
    n_iter=3,
)

# First run: LogisticRegression. This fit is reported to finish without error.
clf = 'LogisticRegression'
# eval() resolves the hard-coded class name to the imported estimator class,
# which is then instantiated with default arguments.
search_cv = BayesSearchCV( estimator = eval(clf)(), search_spaces = grid[clf],  **tuner_params) 
search_cv.fit(X,y)

# Second run: same setup with RandomForestClassifier. This is the fit that is
# reported to raise the KeyError.
clf = 'RandomForestClassifier'
search_cv = BayesSearchCV( estimator = eval(clf)(), search_spaces = grid[clf],  **tuner_params) 
search_cv.fit(X,y)

Using BayesSearchCV with LogisticRegression as the classifier gives no error, while using RandomForestClassifier gives the following error:

---------------------------------------------------------------------------
KeyError                                  Traceback (most recent call last)
Input In [8], in <cell line: 2>()
      1 search_cv = BayesSearchCV( estimator = eval(clf)(), search_spaces = grid[clf],  **tuner_params) 
----> 2 search_cv.fit(X,y)

File ~/.conda/envs/meth/lib/python3.9/site-packages/skopt/searchcv.py:466, in BayesSearchCV.fit(self, X, y, groups, callback, **fit_params)
    463 else:
    464     self.optimizer_kwargs_ = dict(self.optimizer_kwargs)
--> 466 super().fit(X=X, y=y, groups=groups, **fit_params)
    468 # BaseSearchCV never ranked train scores,
    469 # but apparently we used to ship this (back-compat)
    470 if self.return_train_score:

File ~/.conda/envs/meth/lib/python3.9/site-packages/sklearn/model_selection/_search.py:875, in BaseSearchCV.fit(self, X, y, groups, **fit_params)
    869     results = self._format_results(
    870         all_candidate_params, n_splits, all_out, all_more_results
    871     )
    873     return results
--> 875 self._run_search(evaluate_candidates)
    877 # multimetric is determined here because in the case of a callable
    878 # self.scoring the return type is only known after calling
    879 first_test_score = all_out[0]["test_scores"]

File ~/.conda/envs/meth/lib/python3.9/site-packages/skopt/searchcv.py:512, in BayesSearchCV._run_search(self, evaluate_candidates)
    508 while n_iter > 0:
    509     # when n_iter < n_points points left for evaluation
    510     n_points_adjusted = min(n_iter, n_points)
--> 512     optim_result = self._step(
    513         search_space, optimizer,
    514         evaluate_candidates, n_points=n_points_adjusted
    515     )
    516     n_iter -= n_points
    518     if eval_callbacks(callbacks, optim_result):

File ~/.conda/envs/meth/lib/python3.9/site-packages/skopt/searchcv.py:400, in BayesSearchCV._step(self, search_space, optimizer, evaluate_candidates, n_points)
    397 """Generate n_jobs parameters and evaluate them in parallel.
    398 """
    399 # get parameter values to evaluate
--> 400 params = optimizer.ask(n_points=n_points)
    402 # convert parameters to python native types
    403 params = [[np.array(v).item() for v in p] for p in params]

File ~/.conda/envs/meth/lib/python3.9/site-packages/skopt/optimizer/optimizer.py:395, in Optimizer.ask(self, n_points, strategy)
    393 X = []
    394 for i in range(n_points):
--> 395     x = opt.ask()
    396     X.append(x)
    398     ti_available = "ps" in self.acq_func and len(opt.yi) > 0

File ~/.conda/envs/meth/lib/python3.9/site-packages/skopt/optimizer/optimizer.py:367, in Optimizer.ask(self, n_points, strategy)
    336 """Query point or multiple points at which objective should be evaluated.
    337 
    338 n_points : int or None, default: None
   (...)
    364 
    365 """
    366 if n_points is None:
--> 367     return self._ask()
    369 supported_strategies = ["cl_min", "cl_mean", "cl_max"]
    371 if not (isinstance(n_points, int) and n_points > 0):

File ~/.conda/envs/meth/lib/python3.9/site-packages/skopt/optimizer/optimizer.py:434, in Optimizer._ask(self)
    430 if self._n_initial_points > 0 or self.base_estimator_ is None:
    431     # this will not make a copy of `self.rng` and hence keep advancing
    432     # our random state.
    433     if self._initial_samples is None:
--> 434         return self.space.rvs(random_state=self.rng)[0]
    435     else:
    436         # The samples are evaluated starting form initial_samples[0]
    437         return self._initial_samples[
    438             len(self._initial_samples) - self._n_initial_points]

File ~/.conda/envs/meth/lib/python3.9/site-packages/skopt/space/space.py:900, in Space.rvs(self, n_samples, random_state)
    897 columns = []
    899 for dim in self.dimensions:
--> 900     columns.append(dim.rvs(n_samples=n_samples, random_state=rng))
    902 # Transpose
    903 return _transpose_list_array(columns)

File ~/.conda/envs/meth/lib/python3.9/site-packages/skopt/space/space.py:698, in Categorical.rvs(self, n_samples, random_state)
    696     return self.inverse_transform([(choices)])
    697 elif self.transform_ == "normalize":
--> 698     return self.inverse_transform(list(choices))
    699 else:
    700     return [self.categories[c] for c in choices]

File ~/.conda/envs/meth/lib/python3.9/site-packages/skopt/space/space.py:685, in Categorical.inverse_transform(self, Xt)
    680 """Inverse transform samples from the warped space back into the
    681    original space.
    682 """
    683 # The concatenation of all transformed dimensions makes Xt to be
    684 # of type float, hence the required cast back to int.
--> 685 inv_transform = super(Categorical, self).inverse_transform(Xt)
    686 if isinstance(inv_transform, list):
    687     inv_transform = np.array(inv_transform)

File ~/.conda/envs/meth/lib/python3.9/site-packages/skopt/space/space.py:168, in Dimension.inverse_transform(self, Xt)
    164 def inverse_transform(self, Xt):
    165     """Inverse transform samples from the warped space back into the
    166        original space.
    167     """
--> 168     return self.transformer.inverse_transform(Xt)

File ~/.conda/envs/meth/lib/python3.9/site-packages/skopt/space/transformers.py:309, in Pipeline.inverse_transform(self, X)
    307 def inverse_transform(self, X):
    308     for transformer in self.transformers[::-1]:
--> 309         X = transformer.inverse_transform(X)
    310     return X

File ~/.conda/envs/meth/lib/python3.9/site-packages/skopt/space/transformers.py:216, in LabelEncoder.inverse_transform(self, Xt)
    214 else:
    215     Xt = np.asarray(Xt)
--> 216 return [
    217     self.inverse_mapping_[int(np.round(i))] for i in Xt
    218 ]

File ~/.conda/envs/meth/lib/python3.9/site-packages/skopt/space/transformers.py:217, in <listcomp>(.0)
    214 else:
    215     Xt = np.asarray(Xt)
    216 return [
--> 217     self.inverse_mapping_[int(np.round(i))] for i in Xt
    218 ]

KeyError: 9

My versions:

python: 3.9.12 sklearn: 1.1.1 skopt: 0.9.0

The same error happens when using XGBClassifier or GradientBoostingClassifier, while there is no error when using SVC or KNeighborsClassifier.

Solution

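I believe this is related to how skopt encodes the hyperparameter space: it seems that duplicate values in your randomly generated lists are required to trigger the error, though sometimes the fit succeeds regardless. Either there are collisions between the duplicate values, or the duplicates cause the grid to be processed erroneously. (The traceback ends in the categorical `LabelEncoder.inverse_transform` lookup, which is consistent with this.)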

At least the issue stopped reproducing for me after changing all random lists to list(range(...)).

Might be worth a bug report.