I want to tune the hyperparameters of my LightGBM model using Bayesian optimization (the bayes_opt package). Unfortunately, the optimization fails with an error before it converges.
MRE
import warnings
import numpy as np
import pandas as pd
warnings.filterwarnings("ignore")
import lightgbm as lgb
from bayes_opt import BayesianOptimization
from sklearn.datasets import fetch_california_housing
housing = fetch_california_housing()
train = pd.DataFrame(housing['data'], columns=housing['feature_names'])
train_y = train.pop('MedInc')
params = {
"objective" : "regression", "bagging_fraction" : 0.8, "bagging_freq": 1,
"min_child_samples": 20, "reg_alpha": 1, "reg_lambda": 1,"boosting": "gbdt",
"learning_rate" : 0.01, "subsample" : 0.8, "colsample_bytree" : 0.8, "verbosity": -1, "metric" : 'rmse'
}
train_data = lgb.Dataset(train, train_y,free_raw_data=False)
def lgb_eval(num_leaves, feature_fraction, max_depth, min_gain_to_split, min_data_in_leaf):
    params = {
        "objective": "regression", "bagging_fraction": 0.8, "bagging_freq": 1,
        "min_child_samples": 20, "reg_alpha": 1, "reg_lambda": 1, "boosting": "gbdt",
        "learning_rate": 0.01, "subsample": 0.8, "colsample_bytree": 0.8, "verbosity": -1, "metric": 'rmse'
    }
    params['feature_fraction'] = max(min(feature_fraction, 1), 0)
    params['max_depth'] = int(round(max_depth))
    params['num_leaves'] = int(round(num_leaves))
    params['min_gain_to_split'] = float(min_gain_to_split)
    params['min_data_in_leaf'] = int(np.round(min_data_in_leaf))
    cv_result = lgb.cv(params, train_data, nfold=5, seed=0, verbose_eval=200, stratified=False)
    return (np.array(cv_result['rmse-mean'])).max()

lgbBO = BayesianOptimization(lgb_eval, {'feature_fraction': (0.1, 0.9),
                                        'max_depth': (5, 9),
                                        'num_leaves': (1, 300),
                                        'min_gain_to_split': (0.001, 0.1),
                                        'min_data_in_leaf': (5, 50)}, random_state=0)
lgbBO.maximize(init_points=5, n_iter=5, acq='ei')
def bayes_parameter_opt_lgb(train, train_y, init_round=15, opt_round=25, n_folds=5, random_seed=0, n_estimators=10000, learning_rate=0.05, output_process=False):
    # prepare data
    train_data = lgb.Dataset(train, train_y, free_raw_data=False)
    # parameters
    def lgb_eval(num_leaves, feature_fraction, max_depth, min_gain_to_split, min_data_in_leaf):
        params = {
            "objective": "regression", "bagging_fraction": 0.8, "bagging_freq": 1,
            "min_child_samples": 20, "reg_alpha": 1, "reg_lambda": 1, "boosting": "gbdt",
            "learning_rate": 0.01, "subsample": 0.8, "colsample_bytree": 0.8, "verbosity": -1, "metric": 'rmse'
        }
        params['feature_fraction'] = max(min(feature_fraction, 1), 0)
        params['max_depth'] = int(round(max_depth))
        params['num_leaves'] = int(round(num_leaves))
        params['min_gain_to_split'] = float(min_gain_to_split)
        params['min_data_in_leaf'] = int(np.round(min_data_in_leaf))
        cv_result = lgb.cv(params, train_data, nfold=n_folds, seed=random_seed, verbose_eval=200, stratified=False)
        return (np.array(cv_result['rmse-mean'])).max()
    # range
    lgbBO = BayesianOptimization(lgb_eval, {'feature_fraction': (0.1, 0.9),
                                            'max_depth': (5, 9),
                                            'num_leaves': (200, 300),
                                            'min_gain_to_split': (0.001, 0.1),
                                            'min_data_in_leaf': (5, 50)}, random_state=0)
    # optimize
    lgbBO.maximize(init_points=init_round, n_iter=opt_round, acq='ei')
    # output optimization process
    lgbBO.points_to_csv("bayes_opt_result.csv")
    # return best parameters
    return lgbBO.res['max']['max_params']

opt_params = bayes_parameter_opt_lgb(train, train_y, init_round=200, opt_round=20, n_folds=5, random_seed=0, n_estimators=1000, learning_rate=0.01)
This leads to the following stack trace:
---------------------------------------------------------------------------
StopIteration Traceback (most recent call last)
File ~\AppData\Local\Programs\Python\Python310\lib\site-packages\bayes_opt\bayesian_optimization.py:179, in BayesianOptimization.maximize(self, init_points, n_iter, acq, kappa, kappa_decay, kappa_decay_delay, xi, **gp_params)
178 try:
--> 179 x_probe = next(self._queue)
180 except StopIteration:
File ~\AppData\Local\Programs\Python\Python310\lib\site-packages\bayes_opt\bayesian_optimization.py:25, in Queue.__next__(self)
24 if self.empty:
---> 25 raise StopIteration("Queue is empty, no more objects to retrieve.")
26 obj = self._queue[0]
StopIteration: Queue is empty, no more objects to retrieve.
During handling of the above exception, another exception occurred:
TypeError Traceback (most recent call last)
..\GitHub\Meister2\src\lgb_new.ipynb Cell 13' in <cell line: 35>()
32 # return best parameters
33 return lgbBO.res['max']['max_params']
---> 35 opt_params = bayes_parameter_opt_lgb(train, train_y, init_round=20, opt_round=20, n_folds=5, random_seed=0, n_estimators=1000, learning_rate=0.01)
..\GitHub\Meister2\src\lgb_new.ipynb Cell 13' in bayes_parameter_opt_lgb(train, train_y, init_round, opt_round, n_folds, random_seed, n_estimators, learning_rate, output_process)
21 lgbBO = BayesianOptimization(lgb_eval, {'feature_fraction': (0.1, 0.9),
22 'max_depth': (5, 9),
23 'num_leaves' : (200,300),
24 'min_gain_to_split': (0.001, 0.1),
25 'min_data_in_leaf': (5, 50)}, random_state=0)
26 # optimize
---> 27 lgbBO.maximize(init_points=init_round, n_iter=opt_round,acq='ei')
29 # output optimization process
30 lgbBO.points_to_csv("bayes_opt_result.csv")
File ~\AppData\Local\Programs\Python\Python310\lib\site-packages\bayes_opt\bayesian_optimization.py:182, in BayesianOptimization.maximize(self, init_points, n_iter, acq, kappa, kappa_decay, kappa_decay_delay, xi, **gp_params)
180 except StopIteration:
181 util.update_params()
--> 182 x_probe = self.suggest(util)
183 iteration += 1
185 self.probe(x_probe, lazy=False)
File ~\AppData\Local\Programs\Python\Python310\lib\site-packages\bayes_opt\bayesian_optimization.py:131, in BayesianOptimization.suggest(self, utility_function)
128 self._gp.fit(self._space.params, self._space.target)
130 # Finding argmax of the acquisition function.
--> 131 suggestion = acq_max(
132 ac=utility_function.utility,
133 gp=self._gp,
134 y_max=self._space.target.max(),
135 bounds=self._space.bounds,
136 random_state=self._random_state
137 )
139 return self._space.array_to_params(suggestion)
File ~\AppData\Local\Programs\Python\Python310\lib\site-packages\bayes_opt\util.py:65, in acq_max(ac, gp, y_max, bounds, random_state, n_warmup, n_iter)
62 continue
64 # Store it if better than previous minimum(maximum).
---> 65 if max_acq is None or -res.fun[0] >= max_acq:
66 x_max = res.x
67 max_acq = -res.fun[0]
TypeError: 'float' object is not subscriptable
EDIT: Running the MRE above should reproduce the error shown in the stack trace. As the stack trace implies, it looks like -res.fun[0] should be a list and therefore subscriptable (line 65, at the end of the stack trace), but it is not, and I can't understand why. That value is assigned to max_acq, which is part of the acquisition maximization function acq_max() (line 131 of the stack trace) of the Gaussian process, which is itself called from the BayesianOptimization run (line 27 of the stack trace).
Why am I getting TypeError: 'float' object is not subscriptable, and how can this be fixed?
This is related to a change in scipy 1.8.0: one should use -np.squeeze(res.fun) instead of -res.fun[0]. See https://github.com/fmfn/BayesianOptimization/issues/300
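To make the fix concrete, this is roughly what the affected comparison in acq_max() in bayes_opt/util.py looks like before and after the change. It is reconstructed from the stack trace above, so treat it as a sketch rather than the exact upstream source:
# Inside bayes_opt/util.py, acq_max(): res comes from scipy.optimize.minimize.
# With scipy >= 1.8, res.fun is a plain float here, so indexing it raises
# TypeError: 'float' object is not subscriptable.

# old (breaks with scipy >= 1.8):
if max_acq is None or -res.fun[0] >= max_acq:
    x_max = res.x
    max_acq = -res.fun[0]

# fixed (np.squeeze handles both a scalar and a 1-element array):
if max_acq is None or -np.squeeze(res.fun) >= max_acq:
    x_max = res.x
    max_acq = -np.squeeze(res.fun)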
The comments in the bug report indicate that reverting to scipy 1.7.0 also fixes this.
UPDATE: It seems the fix has been merged into the BayesianOptimization package, but the new maintainer is unable to push a release to PyPI (https://github.com/fmfn/BayesianOptimization/issues/300#issuecomment-1146903850), so you could either downgrade scipy to 1.7.x or install the patched package directly from GitHub:
pip install git+https://github.com/fmfn/BayesianOptimization
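If you want to quickly check which behavior your installed scipy has, a small standalone snippet (separate from the MRE, just for illustration) should show it:
import numpy as np
from scipy.optimize import minimize

# Minimize a trivial scalar objective with L-BFGS-B, the same method
# bayes_opt uses inside acq_max().
res = minimize(lambda x: float((x ** 2).sum()), x0=np.array([1.0]),
               bounds=[(-2.0, 2.0)], method="L-BFGS-B")

print(type(res.fun))          # scipy >= 1.8 typically gives a plain float here
print(-np.squeeze(res.fun))   # works whether res.fun is a float or a 1-element array
# print(-res.fun[0])          # with a float, this raises the TypeError from the question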