Search code examples
pythonscikit-learnshufflemulticlass-classification

Classification performance degrades when rows from test dataset are shuffled


Why does the classification performance degrade when I shuffle the test dataset?

For replication purposes: I created an imbalanced dataset:

n = 1
centers=[[0.0, -5, 2.5], [0, 0,2.5], [0, 5,2.5]]
cluster_std = [1.0, 1.0,1.0]
X, y = make_blobs(n_samples=[250,24500,250], centers=centers, cluster_std=cluster_std,n_features=len(cluster_std), random_state = n)
dataset_x = pd.DataFrame({'var1': X[:, 0], 'var2': X[:, 1],'var3': X[:, 2]})
dataset_y = pd.DataFrame({'target': y})
simulated_blob_dataset = pd.concat([dataset_x,dataset_y], axis=1)

I split the dataset into training and testing:

training_data, testing_data = data_split(raw_data=simulated_blob_dataset,target_variable_name="target",test_size=0.2)

I created the base models:

def base_models():
    """Return a name -> estimator mapping of the six individual classifiers
    plus the six stacking ensembles built from them."""
    return {
        'rf': RandomForestClassifier(n_jobs=-1),
        'gbm': GradientBoostingClassifier(),
        'dt': DecisionTreeClassifier(),
        'svc': SVC(),
        'knn': KNeighborsClassifier(n_jobs=-1),
        'nb': GaussianNB(),
        # Stacking ensembles, one per choice of meta-learner.
        'SE_rf': stack_ensemble_1(),
        'SE_gbm': stack_ensemble_2(),
        'SE_dt': stack_ensemble_3(),
        'SE_svc': stack_ensemble_4(),
        'SE_knn': stack_ensemble_5(),
        'SE_nb': stack_ensemble_6(),
    }
 
# evaluate a given model using cross-validation
# evaluate a given model using cross-validation
def evaluate_model(model, X, y):
    """Score *model* on (X, y) with balanced accuracy under 3x-repeated
    10-fold stratified cross-validation; returns the array of fold scores."""
    splitter = RepeatedStratifiedKFold(n_splits=10, n_repeats=3, random_state=1)
    return cross_val_score(model, X, y, scoring='balanced_accuracy',
                           cv=splitter, n_jobs=-1, error_score='raise')

def stack_ensemble_1():
    """Stacking ensemble over the six base classifiers with a
    RandomForest meta-learner."""
    base_learners = [
        ('rf', RandomForestClassifier(n_jobs=-1)),
        ('gbm', GradientBoostingClassifier()),
        ('dt', DecisionTreeClassifier()),
        ('svc', SVC()),
        ('knn', KNeighborsClassifier(n_jobs=-1)),
        ('nb', GaussianNB()),
    ]
    # Meta-learner is fit on the base learners' out-of-fold predictions.
    meta_learner = RandomForestClassifier(n_jobs=-1)
    # cv=10: folds used to build the level-1 training data (pass n_jobs=-1 to use all cores)
    return StackingClassifier(estimators=base_learners, final_estimator=meta_learner, cv=10)


def stack_ensemble_2():
    """Stacking ensemble over the six base classifiers with a
    GradientBoosting meta-learner."""
    base_learners = [
        ('rf', RandomForestClassifier(n_jobs=-1)),
        ('gbm', GradientBoostingClassifier()),
        ('dt', DecisionTreeClassifier()),
        ('svc', SVC()),
        ('knn', KNeighborsClassifier(n_jobs=-1)),
        ('nb', GaussianNB()),
    ]
    # Meta-learner is fit on the base learners' out-of-fold predictions.
    meta_learner = GradientBoostingClassifier()
    # cv=10: folds used to build the level-1 training data (pass n_jobs=-1 to use all cores)
    return StackingClassifier(estimators=base_learners, final_estimator=meta_learner, cv=10)

def stack_ensemble_3():
    """Stacking ensemble over the six base classifiers with a
    DecisionTree meta-learner."""
    base_learners = [
        ('rf', RandomForestClassifier(n_jobs=-1)),
        ('gbm', GradientBoostingClassifier()),
        ('dt', DecisionTreeClassifier()),
        ('svc', SVC()),
        ('knn', KNeighborsClassifier(n_jobs=-1)),
        ('nb', GaussianNB()),
    ]
    # Meta-learner is fit on the base learners' out-of-fold predictions.
    meta_learner = DecisionTreeClassifier()
    # cv=10: folds used to build the level-1 training data (pass n_jobs=-1 to use all cores)
    return StackingClassifier(estimators=base_learners, final_estimator=meta_learner, cv=10)

def stack_ensemble_4():
    """Stacking ensemble over the six base classifiers with an
    SVC meta-learner."""
    base_learners = [
        ('rf', RandomForestClassifier(n_jobs=-1)),
        ('gbm', GradientBoostingClassifier()),
        ('dt', DecisionTreeClassifier()),
        ('svc', SVC()),
        ('knn', KNeighborsClassifier(n_jobs=-1)),
        ('nb', GaussianNB()),
    ]
    # Meta-learner is fit on the base learners' out-of-fold predictions.
    meta_learner = SVC()
    # cv=10: folds used to build the level-1 training data (pass n_jobs=-1 to use all cores)
    return StackingClassifier(estimators=base_learners, final_estimator=meta_learner, cv=10)

def stack_ensemble_5():
    """Stacking ensemble over the six base classifiers with a
    KNeighbors meta-learner."""
    base_learners = [
        ('rf', RandomForestClassifier(n_jobs=-1)),
        ('gbm', GradientBoostingClassifier()),
        ('dt', DecisionTreeClassifier()),
        ('svc', SVC()),
        ('knn', KNeighborsClassifier(n_jobs=-1)),
        ('nb', GaussianNB()),
    ]
    # Meta-learner is fit on the base learners' out-of-fold predictions.
    meta_learner = KNeighborsClassifier(n_jobs=-1)
    # cv=10: folds used to build the level-1 training data (pass n_jobs=-1 to use all cores)
    return StackingClassifier(estimators=base_learners, final_estimator=meta_learner, cv=10)

def stack_ensemble_6():
    """Stacking ensemble over the six base classifiers with a
    GaussianNB meta-learner."""
    base_learners = [
        ('rf', RandomForestClassifier(n_jobs=-1)),
        ('gbm', GradientBoostingClassifier()),
        ('dt', DecisionTreeClassifier()),
        ('svc', SVC()),
        ('knn', KNeighborsClassifier(n_jobs=-1)),
        ('nb', GaussianNB()),
    ]
    # Meta-learner is fit on the base learners' out-of-fold predictions.
    meta_learner = GaussianNB()
    # cv=10: folds used to build the level-1 training data (pass n_jobs=-1 to use all cores)
    return StackingClassifier(estimators=base_learners, final_estimator=meta_learner, cv=10)

First I tried to run it the normal way (the rows of the test dataset are not reshuffled):

X, y = training_data[['var1', 'var2', 'var3']].values,training_data['target'].values
models = base_models()
results, names = list(), list()
for name, model in models.items():
    print(name)
    clf = model.fit(X,y.ravel())
    y_pred = clf.predict(testing_data[['var1', 'var2', 'var3']].values)
    cnf_matrix = confusion_matrix(testing_data['target'].values, y_pred)
    print(cnf_matrix)

the results are good:

(confusion matrices shown as an image in the original post)

however, when I reshuffled the rows of the test dataset to check the robustness of the models (by creating a different view of the same underlying data):

X, y = training_data[['var1', 'var2', 'var3']].values,training_data['target'].values
models = base_models()
results, names = list(), list()
for name, model in models.items():
    print(name)
    clf = model.fit(X,y.ravel())
    y_pred = clf.predict(testing_data[['var1', 'var2', 'var3']].sample(frac=1,replace= False,random_state=1).reset_index(drop = True).values)
    cnf_matrix = confusion_matrix(testing_data['target'].values, y_pred)
    print(cnf_matrix)

the result degraded:

(confusion matrices shown as an image in the original post)


Solution

  • Your shuffling procedure is wrong: you only shuffle the predicted values, leaving the true ones as-is; this breaks the 1-1 correspondence between the predicted and true values, and it is guaranteed to lead to such nonsensical results.

    You need to shuffle the true and predicted values in tandem using scikit-learn's shuffle utility method. Here is an example using your own data and an RF classifier:

    import pandas as pd
    import numpy as np
    from sklearn.datasets import make_blobs
    from sklearn.utils import shuffle
    from sklearn.model_selection import train_test_split
    from sklearn.ensemble import RandomForestClassifier
    from sklearn.metrics import confusion_matrix
    
    # your data as-is:
    n = 1
    centers=[[0.0, -5, 2.5], [0, 0,2.5], [0, 5,2.5]]
    cluster_std = [1.0, 1.0,1.0]
    X, y = make_blobs(n_samples=[250,24500,250], centers=centers, cluster_std=cluster_std,n_features=len(cluster_std), random_state = n)
    dataset_x = pd.DataFrame({'var1': X[:, 0], 'var2': X[:, 1],'var3': X[:, 2]})
    dataset_y = pd.DataFrame({'target': y})
    simulated_blob_dataset = pd.concat([dataset_x,dataset_y], axis=1)
    
    # train-test split using scikit-learn, as data_split is of unknown origin:
    training_data, testing_data = train_test_split(simulated_blob_dataset, test_size=0.2)
    
    # fit & predict
    rf = RandomForestClassifier(n_jobs=-1)
    X, y = training_data[['var1', 'var2', 'var3']].values,training_data['target'].values
    rf.fit(X,y.ravel())
    y_pred = rf.predict(testing_data[['var1', 'var2', 'var3']].values)
    cm = confusion_matrix(testing_data['target'].values, y_pred)
    print(cm)
    

    The resulting confusion matrix cm is:

    [[  42    2    0]
     [   3 4896    0]
     [   0    3   54]]
    

    Now, shuffle correctly using shuffle:

    y_true_shuffled, y_pred_shuffled = shuffle(testing_data['target'].values, y_pred)
    cm_shuffled = confusion_matrix(y_true_shuffled, y_pred_shuffled)
    print(cm_shuffled)
    

    and the resulting confusion matrix cm_shuffled is:

    [[  42    2    0]
     [   3 4896    0]
     [   0    3   54]]
    

    with

    np.all(cm==cm_shuffled)
    # True