Tags: python, machine-learning, xgboost, grid-search, shap

Retrieve Model Results for Shapley values from GridSearchCV


I have tuned a model using GridSearchCV and would now like to calculate the Shapley values and visualize them. The difficulty is that the shap package expects a model, not the grid-search results, and it also rejects the best_estimator_ attribute when I pass that in, saying the model is not supported. How can I get the Shapley values out of the GridSearchCV results? One of my columns is categorical, hence the need for preprocessing. Since I have the best_params_ from the grid search I could refit the model as a plain XGBRegressor, but it has been a while since doing this without the preprocessing pipeline (a rough sketch of that fallback is shown after the code below).

from sklearn.model_selection import KFold, GridSearchCV
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder
from sklearn.pipeline import Pipeline
from xgboost import XGBRegressor as xgr
model=xgr(booster ='gbtree', random_state = 13)
cv_inner = KFold(n_splits=5, shuffle=True)
params = {
        'model__n_estimators' : [1500,2000]
         ,'model__learning_rate' : [0.1,0.2,0.3]
         ,'model__gamma' : [0, 0.005,0.01]
         ,'model__lambda' : [0.1, 0.2,0.3]
         ,'model__alpha' : [0, 0.001, 0.05]
         ,'model__max_depth' : [6]
         ,'model__min_child_weight' : [1]
         ,'model__subsample' : [0.8]
    }
preprocessor = ColumnTransformer(
                    transformers=[
                        ('cat', OneHotEncoder(), [0])
                    ]
                    ,remainder = 'passthrough')
mymodel = Pipeline(steps = [
                        ('preprocessor',preprocessor),
                        ('model', model)
                        ])
optimize_hparams = GridSearchCV(
    estimator = mymodel, param_grid=params, n_jobs = -1,
    cv=cv_inner, scoring='neg_mean_absolute_error')
optimize_hparams.fit(X, y)
import shap
shap_values = shap.TreeExplainer(optimize_hparams.best_estimator_['model']).shap_values(X)
shap.summary_plot(shap_values, X, plot_type="bar")
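
For reference, the fallback mentioned in the question (refitting a plain XGBRegressor from the tuned parameters) might look roughly like the sketch below. The best_params_ keys carry the model__ prefix from the pipeline step name, so it has to be stripped first; the sketch assumes X is a pandas DataFrame whose categorical column can be one-hot encoded with pandas.get_dummies, and names like plain_model are only illustrative.

import pandas as pd
from xgboost import XGBRegressor

# strip the pipeline prefix, e.g. 'model__n_estimators' -> 'n_estimators'
best_params = {k.replace('model__', ''): v
               for k, v in optimize_hparams.best_params_.items()}

# one-hot encode the categorical column by hand instead of via the ColumnTransformer
X_dummies = pd.get_dummies(X)

# refit a bare XGBRegressor with the tuned parameters
plain_model = XGBRegressor(booster='gbtree', random_state=13, **best_params)
plain_model.fit(X_dummies, y)

shap_values = shap.TreeExplainer(plain_model).shap_values(X_dummies)
shap.summary_plot(shap_values, X_dummies, plot_type="bar")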

Solution

  • The shap TreeExplainer works on the bare tree model and the preprocessed feature matrix, not on the GridSearchCV object or the full pipeline. Pull the preprocessor and the best model out of best_estimator_, fit the preprocessor and transform X, fit the model on the encoded data, and then compute the Shap values on that encoded matrix; see the code below for an example.

    import shap
    import numpy as np
    from sklearn.datasets import make_regression
    from sklearn.model_selection import KFold, GridSearchCV
    from sklearn.compose import ColumnTransformer
    from sklearn.preprocessing import OneHotEncoder
    from sklearn.pipeline import Pipeline
    from xgboost import XGBRegressor as xgr 
    
    # generate the features and target
    X, y = make_regression(n_samples=100, n_features=5, random_state=100)
    
    # add a categorical feature in the first column
    X = np.hstack([np.random.choice(['a', 'b', 'c'], size=(X.shape[0], 1)), X])
    
    # set up the grid search
    model = xgr(booster='gbtree', random_state=13)
    
    cv_inner = KFold(n_splits=5, shuffle=True)
    
    params = {
        'model__n_estimators' : [1500, 2000],
        'model__learning_rate' : [0.1, 0.2, 0.3],
        'model__gamma' : [0, 0.005, 0.01],
        'model__lambda' : [0.1, 0.2, 0.3],
        'model__alpha' : [0, 0.001, 0.05],
        'model__max_depth' : [6],
        'model__min_child_weight' : [1],
        'model__subsample' : [0.8],
    }
    
    preprocessor = ColumnTransformer(transformers=[('cat', OneHotEncoder(), [0])], remainder='passthrough')
    
    mymodel = Pipeline(steps=[('preprocessor', preprocessor), ('model', model)])
    
    optimize_hparams = GridSearchCV(
        estimator=mymodel, 
        param_grid=params, 
        cv=cv_inner, 
        scoring='neg_mean_absolute_error',
        n_jobs=-1,
    )
    
    # run the grid search
    optimize_hparams.fit(X, y)
    
    # fit the preprocessor 
    X_encoded = optimize_hparams.best_estimator_['preprocessor'].fit_transform(X)
    
    # fit the model 
    best_model = optimize_hparams.best_estimator_['model'].fit(X_encoded, y)
    
    # calculate the Shap values
    shap_values = shap.TreeExplainer(best_model).shap_values(X_encoded)
    
    # plot the Shap values
    shap.summary_plot(shap_values, X_encoded, plot_type='bar')
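
  • Since GridSearchCV refits the winning pipeline on the full data by default (refit=True), the preprocessor and model inside best_estimator_ are already fitted, so the explicit re-fit above mainly serves to get hold of the encoded matrix. If you also want the plot to show column names rather than positional indices, the sketch below should work, assuming a scikit-learn version that provides ColumnTransformer.get_feature_names_out (1.0 or later).

    # names produced by the fitted ColumnTransformer (one-hot categories plus passthrough columns)
    feature_names = optimize_hparams.best_estimator_['preprocessor'].get_feature_names_out()

    # label the bars in the summary plot with those names
    shap.summary_plot(shap_values, X_encoded, feature_names=feature_names, plot_type='bar')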