Search code examples
python-3.x, shap

Why do all my predicted SHAP values come out as misclassified?


I am trying to understand why, with the code below, most of the lines in my SHAP decision plot are shown as misclassified. This is the plot of misclassified responses that I get:

enter image description here

Can you help me fix my error so that only the data that is actually misclassified is highlighted?

Here is my code:

from sklearn.datasets import make_classification
import seaborn as sns
import numpy as np
import pandas as pd
from matplotlib import pyplot as plt
import pickle
import joblib
import warnings

import shap
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import RandomizedSearchCV, GridSearchCV

# Synthetic 10-class training set: 500 samples, 20 features (9 informative).
X_train,y_train = make_classification(n_samples=500, 
                          n_features=20, 
                          n_informative=9, 
                          n_redundant=0, 
                          n_repeated=0, 
                          n_classes=10, 
                          n_clusters_per_class=1,
                          class_sep=9,
                          flip_y=0.2,
                          #weights=[0.5,0.5], 
                          random_state=17)

# Test set built by a second, separate make_classification call (100 samples).
# NOTE(review): this reuses random_state=17 with a different n_samples rather
# than splitting one dataset — confirm the two draws really share the same
# class structure before trusting test accuracy.
X_test,y_test = make_classification(n_samples=100, 
                          n_features=20, 
                          n_informative=9, 
                          n_redundant=0, 
                          n_repeated=0, 
                          n_classes=10, 
                          n_clusters_per_class=1,
                          class_sep=9,
                          flip_y=0.2,
                          #weights=[0.5,0.5], 
                          random_state=17)

model = RandomForestClassifier()

# Hyper-parameter grid handed to GridSearchCV below.
parameter_space = {
    'n_estimators': [10,50,100],
    'criterion': ['gini', 'entropy'],
    # np.linspace yields floats (10.0, 14.0, ..., 50.0).
    # NOTE(review): confirm the installed scikit-learn accepts a float max_depth.
    'max_depth': np.linspace(10,50,11),
}

# 5-fold cross-validated grid search scored on accuracy.
clf = GridSearchCV(model, parameter_space, cv = 5, scoring = "accuracy", verbose = True) # model
my_model = clf.fit(X_train,y_train)

# Build a SHAP explainer on the best estimator found by the search.
explainer = shap.Explainer(clf.best_estimator_)
expected_value = explainer.expected_value
# Only a list-typed expected_value is reduced to its element 1 here; any other
# type (e.g. an array) is kept whole.
# NOTE(review): the later expected_value[2] / expected_value[0] lookups assume
# this reduction did NOT happen — confirm the type shap returns.
if isinstance(expected_value, list):
    expected_value = expected_value[1]
print(f"Explainer expected value: {expected_value}")

# Explain the first 30 test rows only.
select = range(30)
features = X_test[select]

with warnings.catch_warnings():
    warnings.simplefilter("ignore")
    # Keeps only element 1 of whatever shap_values() returns.
    # NOTE(review): presumably this is the SHAP matrix for class 1 alone, which
    # discards the other nine classes — confirm against the shap version in use.
    shap_values = explainer.shap_values(features)[1]
    shap_interaction_values = explainer.shap_interaction_values(features)
if isinstance(shap_interaction_values, list):
    shap_interaction_values = shap_interaction_values[1]


# NOTE(review): the base value comes from index 2 while shap_values came from
# index 1 — confirm these are meant to refer to the same class.
shap.decision_plot(expected_value[2], shap_values, features)

# Model predictions for the full test set; y_pred is overwritten further down.
predictions = clf.best_estimator_.predict(X_test)
y_pred = predictions
y_true = y_test

# Our naive cutoff point is zero log odds (probability 0.5).
# NOTE(review): the threshold turns y_pred into a boolean array, but y_test
# holds labels from a 10-class problem (n_classes=10 above), so the != below
# compares booleans against class labels. That is a likely reason most rows are
# flagged as misclassified; the base value index (0) also differs from the
# index used for shap_values (1).
y_pred = (shap_values.sum(1) + expected_value[0]) > 0
misclassified = y_pred != y_test[select]
shap.decision_plot(expected_value[0], shap_values, features, highlight=misclassified)

Solution

  • You may wish to try the below:

    # numpy is required below (np.array / np.argmax); without this import the
    # snippet fails with NameError when run on its own.
    import numpy as np
    import shap
    from sklearn.datasets import make_classification
    from sklearn.ensemble import RandomForestClassifier
    from sklearn.model_selection import RandomizedSearchCV, GridSearchCV, train_test_split
    
    # Generate ONE synthetic 10-class dataset and split it afterwards, so the
    # train and test rows come from the same draw.
    X, y = make_classification(n_samples=500, 
                              n_features=20, 
                              n_informative=9, 
                              n_redundant=0, 
                              n_repeated=0, 
                              n_classes=10, 
                              n_clusters_per_class=1,
                              class_sep=9,
                              flip_y=0.2,
                              random_state=17)
    
    # 75% of the rows for training, the remainder for testing.
    X_train, X_test, y_train, y_test = train_test_split(X,y, train_size=.75, random_state=42)
    
    
    model = RandomForestClassifier()
    
    # Hyper-parameter grid; max_depth candidates are plain integers.
    parameter_space = {
        'n_estimators': [10,50,100],
        'criterion': ['gini', 'entropy'],
        'max_depth': [3,5,10]
    }
    
    # 5-fold cross-validated grid search scored on accuracy.
    clf = GridSearchCV(model, parameter_space, cv = 5, scoring = "accuracy", verbose = True) # model
    my_model = clf.fit(X_train,y_train)
    
    # Explain the best estimator on the whole test split, keeping every class.
    explainer = shap.Explainer(clf.best_estimator_)
    expected_value = explainer.expected_value
    shap_values = explainer.shap_values(X_test)
    
    # Stack the SHAP values into one array, sum over axis 2, transpose, add the
    # base values and take the arg-max along axis 1 as the predicted class.
    # NOTE(review): assumes shap_values is a per-class list, i.e. the stacked
    # array is (n_classes, n_samples, n_features) — confirm for the installed
    # shap version, since the return layout differs between releases.
    sv = np.array(shap_values)
    preds = np.argmax(sv.sum(2).T + expected_value,1)
    
    # 1 where the reconstructed prediction differs from the true label, else 0.
    misclassified = (preds != y_test).astype(int)
    idx = 9
    print("Misclassified: ", misclassified[idx])
    
    # Plot every class output for the single test row `idx`.
    # NOTE(review): `highlight` is forwarded to decision_plot, which presumably
    # indexes the plotted lines (one per class here) rather than test rows —
    # confirm that passing the per-row `misclassified` vector is intended.
    shap.multioutput_decision_plot(
        list(expected_value),
        shap_values, 
        idx, highlight=misclassified)
    

    enter image description here