Search code examples
python machine-learning scikit-learn classification naivebayes

Python Bayes heart prediction, results are not accurate


I'm trying to make a heart disease prediction program using Naive Bayes. When I finished the classifier, the cross-validation showed a mean accuracy of 80%. However, when I try to make a prediction on a given sample, the prediction is all wrong! The dataset is the heart disease dataset from the UCI repository; it contains 303 samples. There are two classes, 0: healthy and 1: ill. When I try making a prediction on a sample from the dataset, it doesn't predict its true value, except for very few samples. Here is the code:

import pandas as pd
import numpy as np
from sklearn.naive_bayes import GaussianNB
from sklearn.model_selection import cross_val_score, train_test_split
from sklearn.preprocessing import Imputer, StandardScaler


class Predict:
    """Gaussian Naive Bayes heart-disease predictor with hand-chained steps.

    Loading/cleaning, splitting, scaling, scoring and predicting are separate
    methods that PredictionMain calls one after another.  The StandardScaler
    created in Scaler is local to that method, so PredictionMain passes
    ``sample`` to the classifier exactly as received (i.e. unscaled).
    """

    def Read_Clean(self,dataset):
        """Read the CSV at ``dataset`` and return an all-float DataFrame.

        The file is read without a header row; the 14 column names below are
        assigned to it.  Any '?' entry becomes NaN and is then filled with the
        mean of its column.
        """
        column_names = ['Age', 'Gender', 'Chest_Pain', 'Resting_Blood_Pressure', 'Serum_Cholestrol',
                        'Fasting_Blood_Sugar', 'Resting_ECG', 'Max_Heart_Rate',
                        'Exercise_Induced_Angina', 'OldPeak',
                        'Slope', 'CA', 'Thal', 'Num']
        frame = pd.read_csv(dataset, names=column_names)
        frame = frame.replace('[?]', np.nan, regex=True)
        mean_imputer = Imputer(missing_values='NaN', strategy='mean', axis=0)
        filled = mean_imputer.fit_transform(frame)
        frame = pd.DataFrame(filled, columns=column_names)
        return frame.astype(float)

    def Train_Test_Split_data(self,dataset):
        """Split ``dataset`` into train/test features and binary labels.

        The 'Num' column is the target: values above 0 map to 1, everything
        else to 0.  20% of the rows are held out, with a fixed seed of 42.

        Returns (X_train, X_test, Y_train, Y_test).
        """
        labels = dataset['Num'].apply(lambda x: 1 if x > 0 else 0)
        features = dataset.drop('Num', axis=1)
        X_train, X_test, Y_train, Y_test = train_test_split(
            features, labels, test_size=0.20, random_state=42)
        return X_train, X_test, Y_train, Y_test

    def Scaler(self, X_train, X_test):
        """Standardize both feature sets with a scaler fitted on X_train only.

        Returns the transformed (X_train, X_test); the scaler itself is not
        kept.
        """
        standardizer = StandardScaler()
        scaled_train = standardizer.fit_transform(X_train)
        scaled_test = standardizer.transform(X_test)
        return scaled_train, scaled_test

    def Cross_Validate(self, clf, X_train, Y_train, cv=5):
        """Cross-validate ``clf`` with F1 scoring and print the mean score.

        Returns (mean score, array of per-fold scores).
        """
        fold_scores = cross_val_score(clf, X_train, Y_train, cv=cv, scoring='f1')
        mean_score = fold_scores.mean()
        print("CV scores mean: %.4f " % (mean_score))
        return mean_score, fold_scores

    def Fit_Score(self, clf, X_train, Y_train, X_test, Y_test, label='x'):
        """Fit ``clf`` on the training data and print train and test scores.

        Returns the score obtained on (X_test, Y_test).
        """
        clf.fit(X_train, Y_train)
        train_score = clf.score(X_train, Y_train)
        test_score = clf.score(X_test, Y_test)
        print("%s: fit score %.5f, predict score %.5f" % (label, train_score, test_score))
        return test_score

    def ReturnPredictionValue(self, clf, sample):
        """Return the label ``clf`` predicts for the single row ``sample``."""
        # predict() works on a batch, so wrap the sample in a one-row list.
        return clf.predict([sample])[0]

    def PredictionMain(self, sample, dataset_path='dataset/processed.cleveland.data'):
        """Train and evaluate the classifier, then predict ``sample``.

        The fitted classifier is stored on ``self.NB``.  ``sample`` is passed
        to it unchanged.
        """
        frame = self.Read_Clean(dataset_path)
        X_train, X_test, Y_train, Y_test = self.Train_Test_Split_data(frame)
        X_train, X_test = self.Scaler(X_train, X_test)
        self.NB = GaussianNB()
        self.Fit_Score(self.NB, X_train, Y_train, X_test, Y_test, label='NB')
        self.Cross_Validate(self.NB, X_train, Y_train, 10)
        return self.ReturnPredictionValue(self.NB, sample)

When I run:

if __name__ == '__main__':
    # One row of the dataset (13 feature values, in the column order used by
    # Read_Clean, without the 'Num' target).
    sample = [41.0, 0.0, 2.0, 130.0, 204.0, 0.0, 2.0, 172.0, 0.0, 1.4, 1.0, 0.0, 3.0]
    p = Predict()
    # Function-call form of print: same output as the print statement, and
    # consistent with the print(...) calls inside the class.
    print("Prediction value: {}".format(p.PredictionMain(sample)))

The result is:

NB: fit score 0.84711, predict score 0.83607
CV scores mean: 0.8000

Prediction value: 1

I get 1 instead of 0 (this sample is already one of the dataset samples). I did this for more than one sample from the dataset and I get the wrong result most of the time; it's as if the accuracy is not 80%!

Any help would be appreciated. Thanks in advance.


Edit: Problem solved using Pipeline. The final code is:

import pandas as pd
import numpy as np
from sklearn.naive_bayes import GaussianNB
from sklearn.model_selection import cross_val_score, train_test_split
from sklearn.preprocessing import Imputer, StandardScaler, OneHotEncoder
from sklearn.pipeline import Pipeline

class Predict:
    """Gaussian Naive Bayes heart-disease predictor built on a Pipeline.

    Standardization and the classifier are chained in one scikit-learn
    Pipeline, so the scaling fitted on the training data is also applied to
    whatever is later passed to ``predict``.
    """

    def __init__(self):
        # Feature matrix and binary target; populated by Split_Dataset.
        self.X = []
        self.Y = []

    def Read_Clean(self,dataset):
        """Read the CSV at ``dataset`` and return an all-float DataFrame.

        The file is read without a header row; the 14 column names below are
        assigned to it.  Any '?' entry becomes NaN and is then filled with the
        mean of its column.
        """
        header_row = ['Age', 'Gender', 'Chest_Pain', 'Resting_Blood_Pressure', 'Serum_Cholestrol',
                      'Fasting_Blood_Sugar', 'Resting_ECG', 'Max_Heart_Rate',
                      'Exercise_Induced_Angina', 'OldPeak',
                      'Slope', 'CA', 'Thal', 'Num']
        df = pd.read_csv(dataset, names=header_row)
        df = df.replace('[?]', np.nan, regex=True)
        # NOTE(review): sklearn.preprocessing.Imputer was removed in
        # scikit-learn 0.22 (sklearn.impute.SimpleImputer replaces it), so this
        # call needs an older scikit-learn release.
        df = pd.DataFrame(Imputer(missing_values='NaN', strategy='mean', axis=0)
                          .fit_transform(df), columns=header_row)
        df = df.astype(float)
        return df

    def Split_Dataset(self, df):
        """Store the features in ``self.X`` and the binary target in ``self.Y``.

        The 'Num' column is the target: values above 0 map to 1, everything
        else to 0.  All remaining columns are the features.
        """
        self.Y = df['Num'].apply(lambda x: 1 if x > 0 else 0)
        self.X = df.drop('Num', axis=1)

    def Create_Pipeline(self):
        """Return a new, unfitted Pipeline: StandardScaler then GaussianNB."""
        estimators = [
            ('standardize', StandardScaler()),
            ('bayes', GaussianNB()),
        ]
        return Pipeline(estimators)

    def Cross_Validate(self, clf, cv=5):
        """Cross-validate ``clf`` on self.X / self.Y with F1 scoring.

        Prints the mean of the per-fold scores.
        """
        scores = cross_val_score(clf, self.X, self.Y, cv=cv, scoring='f1')
        score = scores.mean()
        print("CV scores mean: %.4f " % (score))

    def Fit_Score(self, clf, label='x'):
        """Fit ``clf`` on self.X / self.Y and print its score on that same data."""
        clf.fit(self.X, self.Y)
        fit_score = clf.score(self.X, self.Y)
        print("%s: fit score %.5f" % (label, fit_score))

    def ReturnPredictionValue(self, clf, sample):
        """Return the label ``clf`` predicts for the single row ``sample``."""
        # predict() works on a batch, so wrap the sample in a one-row list.
        y = clf.predict([sample])
        return y[0]

    def PredictionMain(self, sample, dataset_path='dataset/processed.cleveland.data'):
        """Load the data, fit and evaluate the pipeline, then predict ``sample``.

        The fitted pipeline is stored on ``self.model``.  Returns the predicted
        label for ``sample``.
        """
        # Function-call form of print, matching the other prints in this class;
        # the print statement used before is a syntax error on Python 3.
        print("dataset: " + dataset_path)
        data = self.Read_Clean(dataset_path)
        self.Split_Dataset(data)
        self.model = self.Create_Pipeline()
        self.Fit_Score(self.model, label='NB')
        self.Cross_Validate(self.model, 10)
        return self.ReturnPredictionValue(self.model, sample)

Now making a prediction on the same sample in the question returns [0] which is the true value. Actually by running the following method:

def CheckTrue(self):
    """Print how many samples get their true label under cross-validation.

    A fresh pipeline from Create_Pipeline is run through cross_val_predict on
    self.X / self.Y, and each out-of-fold prediction is compared with the
    corresponding entry of self.Y.
    """
    # Imported locally: the file's import block does not bring in
    # cross_val_predict.
    from sklearn.model_selection import cross_val_predict

    clf = self.Create_Pipeline()
    out = cross_val_predict(clf, self.X, self.Y)
    # Count matches over however many samples there are, rather than assuming
    # the dataset has exactly 303 rows.
    c = sum(1 for predicted, actual in zip(out, self.Y) if predicted == actual)
    print("Samples with true values: {}".format(c))

I get 249 true samples using the pipeline code, whereas I got only 150 before.


Solution

  • You're not applying the StandardScaler to the sample. The classifier expects scaled data, because it was trained on the StandardScaler's output, but the sample is not scaled the same way as the training data.

    It is easy to make such mistakes when combining multiple steps (scaling, preprocessing, classification) manually. To avoid such issues it is a good idea to use scikit-learn Pipeline.