Search code examples
python machine-learning scikit-learn knn

Why does KNeighborsClassifier always predict the same number?


Why does knn always predict the same number? How can I solve this? The dataset is here.

Code:

import numpy as np
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import os
import scipy.io   
from sklearn.neighbors import KNeighborsClassifier
from sklearn import metrics
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from torch.utils.data import Dataset, DataLoader
from sklearn import preprocessing
import torch
import numpy as np
from sklearn.model_selection import KFold
from sklearn.neighbors import KNeighborsClassifier
from sklearn import metrics

def load_mat_data(path):
    """Load a MATLAB ``.mat`` dataset and standardize its features.

    Parameters
    ----------
    path : str
        Location of the ``.mat`` file. The file must contain a ``'data'``
        entry (used as the feature matrix) and a ``'class'`` entry (used as
        the labels).

    Returns
    -------
    tuple
        ``(x, standardizer, y)``: the standardized feature matrix, the
        fitted ``StandardScaler`` that produced it, and the ``'class'``
        entry exactly as loaded.
    """
    # Read from the ``path`` argument instead of the module-level DATA_PATH,
    # so the function also works when called outside the driver loop.
    mat = scipy.io.loadmat(path)
    x, y = mat['data'], mat['class']
    x = x.astype('float32')
    # Standardize values: the scaler is fitted on the whole matrix passed in.
    standardizer = preprocessing.StandardScaler()
    x = standardizer.fit_transform(x)
    return x, standardizer, y

def numpyToTensor(x):
    """Return the NumPy array ``x`` converted with ``torch.from_numpy``."""
    return torch.from_numpy(x)

class DataBuilder(Dataset):
    """Torch ``Dataset`` holding the features and labels of one ``.mat`` file.

    After construction the instance exposes:

    * ``x`` -- feature tensor returned by ``load_mat_data`` (standardized),
    * ``y`` -- label tensor,
    * ``standardizer`` -- the fitted scaler returned by ``load_mat_data``,
    * ``len`` -- number of rows in ``x``.
    """

    def __init__(self, path):
        """Load the dataset stored at ``path`` and convert it to tensors."""
        # Forward the constructor argument; the global DATA_PATH is no longer
        # consulted here, so the class can be built for any file.
        self.x, self.standardizer, self.y = load_mat_data(path)
        self.x = numpyToTensor(self.x)
        self.len = self.x.shape[0]
        self.y = numpyToTensor(self.y)

    def __getitem__(self, index):
        """Return the ``(features, label)`` pair stored at ``index``."""
        return (self.x[index], self.y[index])

    def __len__(self):
        """Return the number of samples (rows of ``x``)."""
        return self.len

datasets = ['/home/katerina/Desktop/datasets/GSE75110.mat']

# Run 10-fold cross-validated KNN on every listed dataset.
for DATA_PATH in datasets:

    print(DATA_PATH)
    data_set = DataBuilder(DATA_PATH)

    # One out-of-fold prediction slot per sample, filled in fold by fold.
    pred_rpknn = [0] * len(data_set.y)
    kf = KFold(n_splits=10, shuffle=True, random_state=7)

    for train_index, test_index in kf.split(data_set.x):
        # Slice features and labels for this fold.
        fold_x_train = data_set.x[train_index]
        fold_x_test = data_set.x[test_index]
        fold_y_train = data_set.y[train_index]
        fold_y_test = data_set.y[test_index]

        # A fresh classifier per fold, trained on flattened labels.
        knn = KNeighborsClassifier(n_neighbors=5)
        knn.fit(fold_x_train, fold_y_train.ravel())

        # Predict the held-out samples and report this fold's accuracy.
        y_pred = knn.predict(fold_x_test)
        print("Accuracy:", metrics.accuracy_score(fold_y_test, y_pred))

        # Store each prediction at the position of the sample it belongs to.
        for sample_idx, predicted in zip(test_index, y_pred):
            pred_rpknn[sample_idx] = predicted

    # Accuracy over all out-of-fold predictions, then the raw values.
    print("Accuracy:", metrics.accuracy_score(data_set.y, pred_rpknn))
    print(pred_rpknn, data_set.y.reshape(1, -1))

Output:

/home/katerina/Desktop/datasets/GSE75110.mat
Accuracy: 0.2857142857142857
Accuracy: 0.38095238095238093
Accuracy: 0.14285714285714285
Accuracy: 0.4
Accuracy: 0.3
Accuracy: 0.25
Accuracy: 0.3
Accuracy: 0.6
Accuracy: 0.25
Accuracy: 0.45
Accuracy: 0.33497536945812806
[3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3]

I am trying to combine knn with k fold in order to test the whole dataset using 10 folds. The problem is that knn always predicts arrays of 3's for each fold. The classes I want to predict are these:

tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2,2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 3, 3, 3, 3, 3, 3, 3, 3, 3,3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3]]


Solution

  • TL;DR
    It has to do with the StandardScaler; change it to a simple normalisation,
    e.g.

    from sklearn import preprocessing
    
    ...
    
    x = preprocessing.normalize(x)
    

    Explanation:

    StandardScaler, as you use it, computes the following:

    The standard score of a sample `x` is calculated as:
    
        z = (x - u) / s
    
    where `u` is the mean of the training samples or zero if `with_mean=False`,
    and `s` is the standard deviation of the training samples or one if
    `with_std=False`.
    

    This rescales each feature using statistics computed over all the samples, whereas what you actually want is for these features to help KNN decide which vector is closer.

    With normalize, the normalization happens for each vector separately, so it doesn't affect (and can even help) KNN's ability to differentiate the vectors.

    With KNN, StandardScaler can actually harm your predictions. It is better suited to other kinds of data.

    import scipy.io
    from torch.utils.data import Dataset
    from sklearn import preprocessing
    import torch
    import numpy as np
    from sklearn.model_selection import KFold
    from sklearn.neighbors import KNeighborsClassifier
    from sklearn import metrics
    
    def load_mat_data(path):
        """Load a MATLAB ``.mat`` dataset and normalize its feature vectors.

        Parameters
        ----------
        path : str
            Location of the ``.mat`` file. The file must contain a ``'data'``
            entry (used as the feature matrix) and a ``'class'`` entry (used
            as the labels).

        Returns
        -------
        tuple
            ``(x, y)``: the normalized feature matrix and the ``'class'``
            entry exactly as loaded.
        """
        # Read from the ``path`` argument instead of the module-level
        # DATA_PATH, so the function also works outside the driver loop.
        mat = scipy.io.loadmat(path)
        x, y = mat['data'], mat['class']
        x = x.astype('float32')
        # Normalize values with sklearn's default settings, which scale each
        # sample vector on its own rather than each feature across samples.
        x = preprocessing.normalize(x)
        return x, y
    
    def numpyToTensor(x):
        """Return the NumPy array ``x`` converted with ``torch.from_numpy``."""
        return torch.from_numpy(x)
    
    class DataBuilder(Dataset):
        """Torch ``Dataset`` holding the features and labels of one ``.mat`` file.

        After construction the instance exposes:

        * ``x`` -- feature tensor returned by ``load_mat_data`` (normalized),
        * ``y`` -- label tensor,
        * ``len`` -- number of rows in ``x``.
        """

        def __init__(self, path):
            """Load the dataset stored at ``path`` and convert it to tensors."""
            # Forward the constructor argument; the global DATA_PATH is no
            # longer consulted here, so the class can be built for any file.
            self.x, self.y = load_mat_data(path)
            self.x = numpyToTensor(self.x)
            self.len = self.x.shape[0]
            self.y = numpyToTensor(self.y)

        def __getitem__(self, index):
            """Return the ``(features, label)`` pair stored at ``index``."""
            return (self.x[index], self.y[index])

        def __len__(self):
            """Return the number of samples (rows of ``x``)."""
            return self.len
    
    datasets = ['/home/katerina/Desktop/datasets/GSE75110.mat']

    # Run 10-fold cross-validated KNN on every listed dataset.
    for DATA_PATH in datasets:

        print(DATA_PATH)
        data_set = DataBuilder(DATA_PATH)

        # One out-of-fold prediction slot per sample, filled in fold by fold.
        pred_rpknn = [0] * len(data_set.y)
        kf = KFold(n_splits=10, shuffle=True, random_state=7)

        for train_index, test_index in kf.split(data_set.x):
            # Slice features and labels for this fold.
            fold_x_train = data_set.x[train_index]
            fold_x_test = data_set.x[test_index]
            fold_y_train = data_set.y[train_index]
            fold_y_test = data_set.y[test_index]

            # A fresh classifier per fold, trained on flattened labels.
            knn = KNeighborsClassifier(n_neighbors=5)
            knn.fit(fold_x_train, fold_y_train.view(-1))

            # Predict the held-out samples and report this fold's accuracy.
            y_pred = knn.predict(fold_x_test)
            print("Accuracy in loop:", metrics.accuracy_score(fold_y_test, y_pred))

            # Store each prediction at the position of the sample it belongs to.
            for sample_idx, predicted in zip(test_index, y_pred):
                pred_rpknn[sample_idx] = predicted

        # Accuracy over all out-of-fold predictions, then the raw values.
        print("Accuracy:", metrics.accuracy_score(data_set.y, pred_rpknn))
        print(pred_rpknn, data_set.y.reshape(1, -1))
    
    
    Accuracy in loop: 1.0
    Accuracy in loop: 0.8571428571428571
    Accuracy in loop: 0.8571428571428571
    Accuracy in loop: 1.0
    Accuracy in loop: 0.9
    Accuracy in loop: 0.9
    Accuracy in loop: 0.95
    Accuracy in loop: 1.0
    Accuracy in loop: 0.9
    Accuracy in loop: 1.0
    Accuracy: 0.9359605911330049