python scikit-learn classification confusion-matrix multiclass-classification

Why do predictions and scores return different results in classification using scikit-learn?

I wrote a very simple multiclass classifier based on the iris dataset. This is the code:

import numpy as np
from sklearn.datasets import load_iris
from sklearn.model_selection import train_test_split
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.svm import LinearSVC, SVC
from sklearn.preprocessing import label_binarize
from sklearn.multiclass import OneVsRestClassifier
from sklearn.linear_model import SGDClassifier
from sklearn.metrics import classification_report

# Load the data: X is the iris feature matrix, y the integer class labels
iris = load_iris()
X = iris.data
y = iris.target

# Binarize the labels into one indicator column per class, which gives a
# multilabel-like target matrix
Y = label_binarize(y, classes=[0, 1, 2])
n_classes = Y.shape[1]

# Add noisy features: append 200 * n_features columns of standard-normal
# noise to the right of the original features
random_state = np.random.RandomState(0)
n_samples, n_features = X.shape
X = np.concatenate([X, random_state.randn(n_samples, 200 * n_features)], axis=1)
# NOTE: redundant re-import; label_binarize is already imported at the top
from sklearn.preprocessing import label_binarize

# Split into training and test (half of the samples each); note that the
# binarized Y, not the integer y, is used as the target
X_train, X_test, y_train, y_test = train_test_split(
    X, Y, test_size=0.5, random_state=0 
)

# Create classifier: one-vs-rest around a scale-then-LinearSVC pipeline.
# The RandomState instance that generated the noise above is reused here,
# so reordering these statements may change the results.
classifier = OneVsRestClassifier(
    make_pipeline(StandardScaler(), LinearSVC(random_state=random_state))
)

# Train the model
classifier.fit(X_train, y_train)

My goal is to predict the values of the test set in 2 ways:

1. Using the classifier.predict() function to define y_pred.
2. Using classifier.decision_function() to get the scores, then picking the highest one for each instance to define y_pred_.

Here is how I did it:

# Get the raw decision scores for the test set (one column per class)
y_score = classifier.decision_function(X_test)

# Make predictions
# 1) directly from the fitted classifier
y_pred = classifier.predict(X_test)
# 2) from the scores: take the highest-scoring class for each sample and
#    one-hot encode it so that it has the same layout as y_test.
#    `classes` is passed by keyword because it is a keyword-only argument in
#    current scikit-learn releases; the positional form raises a TypeError.
y_pred_ = label_binarize(np.argmax(y_score, axis=1), classes=[0, 1, 2])

However, when I compute the classification reports I get slightly different results, whereas I would expect them to be identical, since the predictions are based on the scores obtained from the decision function, as can be seen in the documentation (line 789). Here are both reports:

# Print the report for predict() first, then the one for the argmax predictions
for predictions in (y_pred, y_pred_):
    print(classification_report(y_test, predictions))

              precision    recall  f1-score   support

           0       0.54      0.62      0.58        21
           1       0.44      0.40      0.42        30
           2       0.36      0.50      0.42        24

   micro avg       0.44      0.49      0.47        75
   macro avg       0.45      0.51      0.47        75
weighted avg       0.45      0.49      0.46        75
 samples avg       0.39      0.49      0.42        75

              precision    recall  f1-score   support

           0       0.42      0.38      0.40        21
           1       0.52      0.47      0.49        30
           2       0.38      0.46      0.42        24

   micro avg       0.44      0.44      0.44        75
   macro avg       0.44      0.44      0.44        75
weighted avg       0.45      0.44      0.44        75
 samples avg       0.44      0.44      0.44        75

What am I doing wrong? Would you be able to suggest a smart and elegant solution so that both reports are identical?

Solution

For multilabel classification you should use

# Mark a label as predicted (1) wherever its decision score is positive, else 0
y_pred_ = np.where(classifier.decision_function(X_test) > 0, 1, 0)

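to replicate the output of the predict() method, because in this case the different classes are not mutually exclusive, i.e. a given sample can belong to multiple classes (or to none). Fitting OneVsRestClassifier on a binarized label matrix, as in your code, puts it in this multilabel setting, so predict() thresholds each class score independently instead of picking the highest one.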

import numpy as np
from sklearn.datasets import load_iris
from sklearn.model_selection import train_test_split
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler, label_binarize
from sklearn.svm import LinearSVC
from sklearn.multiclass import OneVsRestClassifier
from sklearn.metrics import classification_report

# Load iris and one-hot encode the target so that each class becomes its own
# binary label column
iris = load_iris()
X, y = iris.data, label_binarize(iris.target, classes=[0, 1, 2])

# Keep half of the samples for training and half for testing
X_train, X_test, y_train, y_test = train_test_split(
    X, y, random_state=0, test_size=0.5
)

# Build the one-vs-rest classifier around a scale-then-LinearSVC pipeline
svm_pipeline = make_pipeline(StandardScaler(), LinearSVC(random_state=0))
classifier = OneVsRestClassifier(svm_pipeline)

# Fit on the training half
classifier.fit(X_train, y_train)

# Predict twice: with predict(), and by thresholding the decision scores at zero
y_pred = classifier.predict(X_test)
y_score = classifier.decision_function(X_test)
y_pred_ = np.where(y_score > 0, 1, 0)

# Report for the predict() output
print(classification_report(y_test, y_pred))
#               precision    recall  f1-score   support
#            0       1.00      1.00      1.00        21
#            1       0.58      0.37      0.45        30
#            2       0.95      0.83      0.89        24
#    micro avg       0.85      0.69      0.76        75
#    macro avg       0.84      0.73      0.78        75
# weighted avg       0.82      0.69      0.74        75
#  samples avg       0.66      0.69      0.67        75

# Report for the thresholded scores (same numbers as above)
print(classification_report(y_test, y_pred_))
#               precision    recall  f1-score   support
#            0       1.00      1.00      1.00        21
#            1       0.58      0.37      0.45        30
#            2       0.95      0.83      0.89        24
#    micro avg       0.85      0.69      0.76        75
#    macro avg       0.84      0.73      0.78        75
# weighted avg       0.82      0.69      0.74        75
#  samples avg       0.66      0.69      0.67        75

For multiclass classification you can instead use

# Pick, for each sample, the index of the class with the highest decision score
y_pred_ = np.argmax(classifier.decision_function(X_test), axis=1)

as in your code, because in this case the different classes are mutually exclusive, i.e. each sample is assigned to exactly one class.

import numpy as np
from sklearn.datasets import load_iris
from sklearn.model_selection import train_test_split
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.svm import LinearSVC
from sklearn.multiclass import OneVsRestClassifier
from sklearn.metrics import classification_report

# Load iris with its integer class labels (no binarization here)
iris = load_iris()
X, y = iris.data, iris.target

# Keep half of the samples for training and half for testing
X_train, X_test, y_train, y_test = train_test_split(
    X, y, random_state=0, test_size=0.5
)

# Build the one-vs-rest classifier around a scale-then-LinearSVC pipeline
svm_pipeline = make_pipeline(StandardScaler(), LinearSVC(random_state=0))
classifier = OneVsRestClassifier(svm_pipeline)

# Fit on the training half
classifier.fit(X_train, y_train)

# Predict twice: with predict(), and by taking the highest-scoring class
y_pred = classifier.predict(X_test)
y_score = classifier.decision_function(X_test)
y_pred_ = np.argmax(y_score, axis=1)

# Report for the predict() output
print(classification_report(y_test, y_pred))
#               precision    recall  f1-score   support
#            0       1.00      1.00      1.00        21
#            1       0.85      0.73      0.79        30
#            2       0.71      0.83      0.77        24
#     accuracy                           0.84        75
#    macro avg       0.85      0.86      0.85        75
# weighted avg       0.85      0.84      0.84        75

# Report for the argmax of the scores (same numbers as above)
print(classification_report(y_test, y_pred_))
#               precision    recall  f1-score   support
#            0       1.00      1.00      1.00        21
#            1       0.85      0.73      0.79        30
#            2       0.71      0.83      0.77        24
#     accuracy                           0.84        75
#    macro avg       0.85      0.86      0.85        75
# weighted avg       0.85      0.84      0.84        75