I am trying to tune a XGBRegressor model and I am getting below error only when I try to use the parameter tuning flow:
Input contains infinity or a value too large for dtype('float32')
I do not get this error if I do not try to tune parameters.
I have ensured my data does not have any NaN or np.inf - I replace +/- np.inf with np.nan and replace all NaN with 0 later. Before training, I have changed all columns to np.float64 type.
I suspect during parameter tuning, the target value may be causing overflow with float32 - how to ensure sklearn/xgboost/optuna uses float64 instead of float32?
My training code is roughly following:
def __fit_new_model(
df: pd.DataFrame, feature_cols: List[str], target_col: str, tuning_iterations: int
) -> XGBRegressor:
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_percentage_error
if df.empty:
raise AssertionError("input parameter error - empty input DataFrame")
if not feature_cols:
raise AssertionError("input parameter error - empty feature_cols")
if not target_col:
raise AssertionError("input parameter error - invalid target_col name")
X, y = df[feature_cols], df[target_col]
if X.isna().any().any() or y.isna().any():
raise AssertionError("input data error - NaN values exist")
X_train, X_validation, y_train, y_validation = train_test_split(X, y, test_size=0.20, random_state=42)
regressor: XGBRegressor = XGBRegressor(
random_state=42,
tree_method="hist",
n_estimators=100,
early_stopping_rounds=100,
objective="reg:tweedie",
tweedie_variance_power=1.5,
eval_metric=mean_absolute_percentage_error,
)
if tuning_iterations > 0:
tuned_parameters: Dict[str, Any] = __get_tuned_model_parames(
x_train=X_train,
y_train=y_train,
x_validation=X_validation,
y_validation=y_validation,
num_trials=tuning_iterations,
)
regressor = XGBRegressor(eval_metric=mean_absolute_percentage_error, **tuned_parameters)
regressor.fit(X=X_train, y=y_train, eval_set=[(X_train, y_train), (X_validation, y_validation)], verbose=False)
return regressor
def __get_tuned_model_parames(
x_train: pd.DataFrame,
y_train: pd.Series,
x_validation: pd.DataFrame,
y_validation: pd.Series,
num_trials: int = 200,
) -> Dict[str, Any]:
import optuna
def objective(trial: optuna.trial.Trial):
from sklearn.metrics import mean_absolute_percentage_error
param = {
"tree_method": "hist",
"booster": trial.suggest_categorical("booster", ["gbtree", "dart"]),
"lambda": trial.suggest_float("lambda", 1e-3, 10.0),
"alpha": trial.suggest_float("alpha", 1e-3, 10.0),
"colsample_bytree": trial.suggest_float("colsample_bytree", 0.05, 1.0),
"subsample": trial.suggest_float("subsample", 0.05, 1.0),
"learning_rate": trial.suggest_float("learning_rate", 1e-3, 0.1, log=True),
"n_estimators": 100,
"objective": trial.suggest_categorical(
# "objective", ["reg:tweedie", "reg:squarederror", "reg:squaredlogerror"]
"objective",
["reg:tweedie"],
),
"max_depth": trial.suggest_int("max_depth", 1, 12),
"min_child_weight": trial.suggest_int("min_child_weight", 1, 20),
"early_stopping_rounds": 100,
"random_state": 42,
"base_score": 0.5,
}
if param["objective"] == "reg:tweedie":
param["tweedie_variance_power"] = trial.suggest_categorical(
"tweedie_variance_power", [1, 1.2, 1.3, 1.4, 1.5, 1.6, 1.7, 1.8, 1.9]
)
regressor_model: XGBRegressor = XGBRegressor(**param)
regressor_model.fit(X=x_train, y=y_train, eval_set=[(x_validation, y_validation)], verbose=False)
predictions = regressor_model.predict(x_validation)
mape: float = mean_absolute_percentage_error(y_true=y_validation, y_pred=predictions)
return mape
sampler: optuna.samplers.TPESampler = optuna.samplers.TPESampler(seed=42)
study_xgb = optuna.create_study(direction="minimize", sampler=sampler)
optuna.logging.set_verbosity(optuna.logging.ERROR)
study_xgb.optimize(lambda trial: objective(trial), n_trials=num_trials)
model_params: Dict[str, Any] = study_xgb.best_params
return model_params
The distribution may not be a Poisson one with my data and hence removing the value 1 from the choices for tweedie_variance_power
resolved the issue. Maybe I made a typo earlier - instead of adding 1.1, I added 1.
The corresponding code section is now following:
if param["objective"] == "reg:tweedie":
param["tweedie_variance_power"] = trial.suggest_categorical(
"tweedie_variance_power",
[
1.1,
1.2,
1.3,
1.4,
1.5,
1.6,
1.7,
1.8,
1.9,
],
)