I am trying to understand why, with the code below, most of the lines in my SHAP
decision plot are highlighted as misclassified. These are the misclassified responses that I get:
Can you help me fix my error so that only the data that is actually misclassified is highlighted?
Here is my code:
from sklearn.datasets import make_classification
import seaborn as sns
import numpy as np
import pandas as pd
from matplotlib import pyplot as plt
import pickle
import joblib
import warnings
import shap
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import RandomizedSearchCV, GridSearchCV
# Synthetic training set: 500 samples, 20 features (9 of them informative), 10 classes.
# flip_y=0.2 adds label noise (per the scikit-learn docs, the fraction of samples whose
# class is assigned at random), so some rows are expected to look "misclassified"
# even with a perfect model.
X_train,y_train = make_classification(n_samples=500,
n_features=20,
n_informative=9,
n_redundant=0,
n_repeated=0,
n_classes=10,
n_clusters_per_class=1,
class_sep=9,
flip_y=0.2,
#weights=[0.5,0.5],
random_state=17)
# Test set: a second, separate make_classification call with the same arguments
# except n_samples=100.
# NOTE(review): this is not a hold-out split of the training data. Whether a
# 100-sample draw shares the class structure of the 500-sample draw is not
# established here — confirm, or split a single dataset instead.
X_test,y_test = make_classification(n_samples=100,
n_features=20,
n_informative=9,
n_redundant=0,
n_repeated=0,
n_classes=10,
n_clusters_per_class=1,
class_sep=9,
flip_y=0.2,
#weights=[0.5,0.5],
random_state=17)
# Random forest tuned with an exhaustive grid search: 5-fold CV, scored on accuracy.
model = RandomForestClassifier()
parameter_space = {
'n_estimators': [10,50,100],
'criterion': ['gini', 'entropy'],
# NOTE(review): np.linspace returns floats (10.0, 14.0, ..., 50.0), whereas
# max_depth is normally an integer — confirm the installed scikit-learn
# accepts these values.
'max_depth': np.linspace(10,50,11),
}
clf = GridSearchCV(model, parameter_space, cv = 5, scoring = "accuracy", verbose = True) # model
# Run the search on the training data; the result is also kept as my_model.
my_model = clf.fit(X_train,y_train)
# Build a SHAP explainer for the best estimator found by the grid search.
explainer = shap.Explainer(clf.best_estimator_)
expected_value = explainer.expected_value
# If SHAP hands back the base values as a list, keep only the entry at index 1.
# NOTE(review): the expected_value[2] and expected_value[0] lookups further down
# require expected_value to still be indexable — confirm which type shap returns.
if isinstance(expected_value, list):
expected_value = expected_value[1]
print(f"Explainer expected value: {expected_value}")
# Explain only the first 30 test rows.
select = range(30)
features = X_test[select]
# Suppress warnings emitted while the SHAP values are computed.
with warnings.catch_warnings():
warnings.simplefilter("ignore")
# [1] keeps a single element of the result — presumably the SHAP values for
# class 1 out of the 10 classes; TODO confirm.
shap_values = explainer.shap_values(features)[1]
shap_interaction_values = explainer.shap_interaction_values(features)
if isinstance(shap_interaction_values, list):
shap_interaction_values = shap_interaction_values[1]
# NOTE(review): the base value is taken at index 2 while shap_values was taken at
# index 1, so the two may describe different classes.
shap.decision_plot(expected_value[2], shap_values, features)
# Model predictions for the whole test set; y_pred is overwritten a few lines below.
predictions = clf.best_estimator_.predict(X_test)
y_pred = predictions
y_true = y_test
# Our naive cutoff point is zero log odds (probability 0.5).
# NOTE(review): this is a binary-classification rule. The comparison makes y_pred a
# boolean array, while y_test holds labels from a 10-class problem (n_classes=10).
# A boolean can only equal 0 or 1, so every row whose label is 2 or higher is
# flagged as misclassified regardless of what the model predicted.
y_pred = (shap_values.sum(1) + expected_value[0]) > 0
misclassified = y_pred != y_test[select]
# Plot again, drawing the flagged rows in the highlight style.
shap.decision_plot(expected_value[0], shap_values, features, highlight=misclassified)
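You may wish to try the code below. Your data has 10 classes, so a single zero-log-odds cutoff does not apply; instead, the predicted class for each row is the one whose SHAP values plus base value give the largest total: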
import shap
from sklearn.datasets import make_classification
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import RandomizedSearchCV, GridSearchCV, train_test_split
# numpy is needed for the argmax-based prediction below; imported here so the
# snippet runs on its own.
import numpy as np

# Generate ONE dataset and split it, so that the training and test rows come from
# the same draw.
X, y = make_classification(
    n_samples=500,
    n_features=20,
    n_informative=9,
    n_redundant=0,
    n_repeated=0,
    n_classes=10,
    n_clusters_per_class=1,
    class_sep=9,
    flip_y=0.2,
    random_state=17,
)
X_train, X_test, y_train, y_test = train_test_split(
    X, y, train_size=.75, random_state=42
)

# Random forest tuned with a 5-fold grid search scored on accuracy.
model = RandomForestClassifier()
parameter_space = {
    'n_estimators': [10, 50, 100],
    'criterion': ['gini', 'entropy'],
    'max_depth': [3, 5, 10],
}
clf = GridSearchCV(model, parameter_space, cv=5, scoring="accuracy", verbose=True)
my_model = clf.fit(X_train, y_train)

explainer = shap.Explainer(clf.best_estimator_)
expected_value = explainer.expected_value
shap_values = explainer.shap_values(X_test)

# multioutput_decision_plot expects a list with one (n_samples, n_features) array
# per class. Older shap releases already return that list; newer ones are reported
# to return a single (n_samples, n_features, n_classes) array instead, which is
# split back into per-class arrays here. A list result is left untouched.
if isinstance(shap_values, np.ndarray) and shap_values.ndim == 3:
    shap_values = [shap_values[..., k] for k in range(shap_values.shape[-1])]

# Stack to (n_classes, n_samples, n_features). Summing over features and adding the
# per-class base value gives one score per (sample, class); the predicted class is
# the one with the largest score.
sv = np.array(shap_values)
preds = np.argmax(sv.sum(2).T + expected_value, 1)
misclassified = (preds != y_test).astype(int)

# Inspect a single test row: report whether it is misclassified and plot the
# decision path of every class for that row.
idx = 9
print("Misclassified: ", misclassified[idx])
shap.multioutput_decision_plot(
    list(expected_value),
    shap_values,
    idx,
    # The lines in this plot are the 10 class outputs for row idx, so `highlight`
    # indexes classes, not test rows. Highlight the predicted class; passing the
    # per-row `misclassified` array would be read as class indices 0 and 1.
    highlight=[preds[idx]],
)