python, scikit-learn, xgboost, optuna, xgbregressor

optuna parameter tuning for tweedie - Input contains infinity or a value too large for dtype('float32') error


I am trying to tune an XGBRegressor model and I get the error below only when I use the parameter-tuning flow:

Input contains infinity or a value too large for dtype('float32')

I do not get this error if I do not try to tune parameters.

I have ensured my data does not contain any NaN or np.inf: I replace +/- np.inf with np.nan and later replace all NaN with 0. Before training, I cast all columns to np.float64.
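
For reference, the cleanup step looks roughly like this (a simplified sketch; df_raw is just a placeholder name for my raw frame):

import numpy as np

# Simplified sketch of the cleanup described above (df_raw is a placeholder)
df = df_raw.replace([np.inf, -np.inf], np.nan)   # drop +/- inf first
df = df.fillna(0)                                # then zero out all remaining NaN
df = df.astype(np.float64)                       # cast every column to float64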

I suspect that during parameter tuning the target values overflow float32. How can I ensure that sklearn/xgboost/optuna uses float64 instead of float32?
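
One sanity check I can run to rule out the data itself (a diagnostic only, not part of the training code; feature_cols and target_col are the same names passed to my function below):

import numpy as np

# Diagnostic: confirm everything is finite and also fits into float32,
# even though the columns are stored as float64.
values = df[feature_cols + [target_col]].to_numpy(dtype=np.float64)
assert np.isfinite(values).all(), "non-finite values present"
assert (np.abs(values) < np.finfo(np.float32).max).all(), "value too large for float32"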

My training code is roughly the following:

from typing import Any, Dict, List

import pandas as pd
from xgboost import XGBRegressor


def __fit_new_model(
        df: pd.DataFrame, feature_cols: List[str], target_col: str, tuning_iterations: int
    ) -> XGBRegressor:
        from sklearn.model_selection import train_test_split
        from sklearn.metrics import mean_absolute_percentage_error

        if df.empty:
            raise AssertionError("input parameter error - empty input DataFrame")

        if not feature_cols:
            raise AssertionError("input parameter error - empty feature_cols")

        if not target_col:
            raise AssertionError("input parameter error - invalid target_col name")

        X, y = df[feature_cols], df[target_col]

        if X.isna().any().any() or y.isna().any():
            raise AssertionError("input data error - NaN values exist")

        X_train, X_validation, y_train, y_validation = train_test_split(X, y, test_size=0.20, random_state=42)

        regressor: XGBRegressor = XGBRegressor(
            random_state=42,
            tree_method="hist",
            n_estimators=100,
            early_stopping_rounds=100,
            objective="reg:tweedie",
            tweedie_variance_power=1.5,
            eval_metric=mean_absolute_percentage_error,
        )

        if tuning_iterations > 0:
            tuned_parameters: Dict[str, Any] = __get_tuned_model_parames(
                x_train=X_train,
                y_train=y_train,
                x_validation=X_validation,
                y_validation=y_validation,
                num_trials=tuning_iterations,
            )
            regressor = XGBRegressor(eval_metric=mean_absolute_percentage_error, **tuned_parameters)

        regressor.fit(X=X_train, y=y_train, eval_set=[(X_train, y_train), (X_validation, y_validation)], verbose=False)

        return regressor

def __get_tuned_model_parames(
        x_train: pd.DataFrame,
        y_train: pd.Series,
        x_validation: pd.DataFrame,
        y_validation: pd.Series,
        num_trials: int = 200,
    ) -> Dict[str, Any]:
        import optuna

        def objective(trial: optuna.trial.Trial):
            from sklearn.metrics import mean_absolute_percentage_error

            param = {
                "tree_method": "hist",
                "booster": trial.suggest_categorical("booster", ["gbtree", "dart"]),
                "lambda": trial.suggest_float("lambda", 1e-3, 10.0),
                "alpha": trial.suggest_float("alpha", 1e-3, 10.0),
                "colsample_bytree": trial.suggest_float("colsample_bytree", 0.05, 1.0),
                "subsample": trial.suggest_float("subsample", 0.05, 1.0),
                "learning_rate": trial.suggest_float("learning_rate", 1e-3, 0.1, log=True),
                "n_estimators": 100,
                "objective": trial.suggest_categorical(
                    # "objective", ["reg:tweedie", "reg:squarederror", "reg:squaredlogerror"]
                    "objective",
                    ["reg:tweedie"],
                ),
                "max_depth": trial.suggest_int("max_depth", 1, 12),
                "min_child_weight": trial.suggest_int("min_child_weight", 1, 20),
                "early_stopping_rounds": 100,
                "random_state": 42,
                "base_score": 0.5,
            }

            if param["objective"] == "reg:tweedie":
                param["tweedie_variance_power"] = trial.suggest_categorical(
                    "tweedie_variance_power", [1, 1.2, 1.3, 1.4, 1.5, 1.6, 1.7, 1.8, 1.9]
                )

            regressor_model: XGBRegressor = XGBRegressor(**param)
            regressor_model.fit(X=x_train, y=y_train, eval_set=[(x_validation, y_validation)], verbose=False)

            predictions = regressor_model.predict(x_validation)
            mape: float = mean_absolute_percentage_error(y_true=y_validation, y_pred=predictions)

            return mape

        sampler: optuna.samplers.TPESampler = optuna.samplers.TPESampler(seed=42)
        study_xgb = optuna.create_study(direction="minimize", sampler=sampler)
        optuna.logging.set_verbosity(optuna.logging.ERROR)

        study_xgb.optimize(lambda trial: objective(trial), n_trials=num_trials)
        model_params: Dict[str, Any] = study_xgb.best_params

        return model_params


Solution

  • With my data the distribution is likely not Poisson, so removing the value 1 from the choices for tweedie_variance_power resolved the issue. XGBoost documents tweedie_variance_power with the range (1, 2), so 1 (the Poisson edge case) is not a valid value. I probably just made a typo earlier: I meant to add 1.1 but added 1 instead. A continuous alternative that avoids this kind of typo is sketched after the code below.

    The corresponding code section now looks like this:

                if param["objective"] == "reg:tweedie":
                    param["tweedie_variance_power"] = trial.suggest_categorical(
                        "tweedie_variance_power", [1.1, 1.2, 1.3, 1.4, 1.5, 1.6, 1.7, 1.8, 1.9]
                    )
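
    A variant I also considered, which avoids listing the values by hand (and therefore this kind of typo), is to sample the power as a continuous value strictly inside (1, 2) with Optuna's suggest_float. This is only a sketch of the alternative, not what I ended up using:

                if param["objective"] == "reg:tweedie":
                    # Sampling strictly inside (1, 2) means the Poisson edge case
                    # p = 1 can never be drawn by mistake.
                    param["tweedie_variance_power"] = trial.suggest_float(
                        "tweedie_variance_power", 1.1, 1.9
                    )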