Search code examples
python classification naivebayes

Naive Bayes Classifier from scratch


I recently found the code below for a Gaussian Naive Bayes classifier.

 import numpy as np
 class GaussianNaiveBayes:
     # Gaussian Naive Bayes classifier: fit() stores, for every class, the
     # per-feature mean and variance plus the class prior.
     def fit(self, X, y):
         # X must expose .shape and accept a boolean mask; presumably y holds
         # one label per row of X -- confirm against the caller.
         n_samples, n_features = X.shape
         self._classes = np.unique(y)
         n_classes = len(self._classes)
         # One row per class, one column per feature.
         self._mean = np.zeros((n_classes, n_features), dtype=np.float64)
         self._var = np.zeros((n_classes, n_features), dtype=np.float64)
         self._priors =  np.zeros(n_classes, dtype=np.float64)

         # Compute the mean, variance and prior P(H) for each class.
         for i, c in enumerate(self._classes):
             X_for_class_c = X[y==c]
             self._mean[i, :] = X_for_class_c.mean(axis=0)
             self._var[i, :] = X_for_class_c.var(axis=0)
             # Prior = number of rows selected for class c / total rows.
             self._priors[i] = X_for_class_c.shape[0] / float(n_samples)
# Likelihood P(E|H): the Gaussian density of x under the stored mean and
# variance of class `class_idx`, evaluated separately for every feature.
# NOTE(review): this `def` starts at the left margin, i.e. it is not indented
# under the class statement above, so it is not part of the class body.
def _calculate_likelihood(self, class_idx, x):
         mean = self._mean[class_idx]
         var = self._var[class_idx]
         num = np.exp(- (x-mean)**2 / (2 * var))
         denom = np.sqrt(2 * np.pi * var)
         return num / denom 

# Classify every element obtained by iterating over X and return the
# predicted labels as a NumPy array.
# NOTE(review): this `def` also starts at the left margin, outside the class
# body.
def predict(self, X):
         y_pred = [self._classify_sample(x) for x in X]
         return np.array(y_pred)

     def _classify_sample(self, x):
         posteriors = []
         # Log-posterior per class: log prior + sum of per-feature log-likelihoods.
         for i, c in enumerate(self._classes):
             prior = np.log(self._priors[i])
             posterior = np.sum(np.log(self._calculate_likelihood(i, x)))
             posterior = prior + posterior
             posteriors.append(posterior)
         # Return the class with the highest posterior probability.
         return self._classes[np.argmax(posteriors)] 

I tried the above code on the Iris dataset with the following code, but I receive the error "AttributeError: 'GaussianNaiveBayes' object has no attribute 'predict'".

# Load Iris into pandas objects and hold out 20% of the rows for testing.
iris = datasets.load_iris()
X = pd.DataFrame(data=iris.data, columns=iris.feature_names)
y = pd.DataFrame(data=iris.target, columns=["Target"])
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Train the classifier, then predict labels for the held-out rows.
nb = GaussianNaiveBayes()
nb.fit(X_train, y_train)
predictions = nb.predict(X_test)

I would appreciate any guidance on where my mistake is.


Solution

  • You need to indent the code properly so that every method is defined inside the class. In addition, this line, which subsets the X array, will not work when y is a DataFrame:

    X_for_class_c = X[y==c]
    

    Likewise, this line will not work for a DataFrame, because iterating over a DataFrame yields its column labels rather than its rows:

    y_pred = [self._classify_sample(x) for x in X]
    

    So let's indent it properly:

    import pandas as pd
    import numpy as np
    from sklearn.model_selection import train_test_split
    from sklearn import datasets
    
    class GaussianNaiveBayes:
        """Gaussian Naive Bayes classifier built on NumPy.

        Each feature is modelled as an independent normal distribution per
        class. ``fit`` estimates the per-class means, variances and priors;
        ``predict`` returns, for every sample, the class with the largest
        log-posterior.

        Pass NumPy arrays (``X`` two-dimensional, ``y`` one-dimensional): the
        row selection ``X[y == c]`` and the row-wise iteration in ``predict``
        rely on NumPy indexing semantics.

        Args:
            var_smoothing: Fraction of the largest feature variance that is
                added to every per-class variance. It keeps the variances
                strictly positive so a feature that is constant within a
                class cannot cause a division by zero.
        """

        def __init__(self, var_smoothing=1e-9):
            self.var_smoothing = var_smoothing

        def fit(self, X, y):
            """Estimate per-class means, variances and priors from ``X``/``y``.

            Args:
                X: Training samples of shape ``(n_samples, n_features)``.
                y: Class label of every row of ``X``.
            """
            n_samples, n_features = X.shape
            self._classes = np.unique(y)
            n_classes = len(self._classes)
            self._mean = np.zeros((n_classes, n_features), dtype=np.float64)
            self._var = np.zeros((n_classes, n_features), dtype=np.float64)
            self._priors = np.zeros(n_classes, dtype=np.float64)

            # Without smoothing, a feature that is constant inside one class
            # has variance 0, the density becomes 0/0 = nan, and argmax over
            # nan posteriors always returns the first class.
            epsilon = self.var_smoothing * X.var(axis=0).max()
            if epsilon == 0:
                # Every feature is constant over the whole training set.
                epsilon = self.var_smoothing

            for i, c in enumerate(self._classes):
                X_for_class_c = X[y == c]
                self._mean[i, :] = X_for_class_c.mean(axis=0)
                self._var[i, :] = X_for_class_c.var(axis=0) + epsilon
                self._priors[i] = X_for_class_c.shape[0] / float(n_samples)

        def _calculate_log_likelihood(self, class_idx, x):
            """Return the per-feature log of the Gaussian density of ``x``."""
            mean = self._mean[class_idx]
            var = self._var[class_idx]
            return -0.5 * np.log(2 * np.pi * var) - (x - mean) ** 2 / (2 * var)

        def _calculate_likelihood(self, class_idx, x):
            """Return the per-feature Gaussian density P(E|H) of ``x``.

            Args:
                class_idx: Row index of the class in ``self._mean``/``self._var``.
                x: Feature vector of a single sample.
            """
            return np.exp(self._calculate_log_likelihood(class_idx, x))

        def predict(self, X):
            """Return the predicted class label for every row of ``X``."""
            y_pred = [self._classify_sample(x) for x in X]
            return np.array(y_pred)

        def _classify_sample(self, x):
            """Return the class with the highest log-posterior for sample ``x``."""
            # The log-density is computed directly: np.log(np.exp(...))
            # underflows to -inf for samples far from every class mean, which
            # makes all posteriors tie and the choice of class arbitrary.
            posteriors = [
                np.log(self._priors[i])
                + np.sum(self._calculate_log_likelihood(i, x))
                for i in range(len(self._classes))
            ]
            return self._classes[np.argmax(posteriors)]
    

    Run the fit first with your example; you can see that the fitted means are nan for every class and feature:

    # Fit on pandas objects, then inspect the per-class means that were stored.
    iris = datasets.load_iris()
    X = pd.DataFrame(data=iris.data, columns=iris.feature_names)
    y = pd.DataFrame(data=iris.target, columns=["Target"])
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.2, random_state=42
    )

    nb = GaussianNaiveBayes()
    nb.fit(X_train, y_train)

    nb._mean
    
    array([[nan, nan, nan, nan],
           [nan, nan, nan, nan],
           [nan, nan, nan, nan]])
    

    Change the input to plain NumPy arrays:

    # Same experiment with the raw NumPy arrays instead of DataFrames.
    iris = datasets.load_iris()
    X, y = iris.data, iris.target
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.2, random_state=42
    )

    nb = GaussianNaiveBayes()
    nb.fit(X_train, y_train)
    nb.predict(X_test)
    
    array([1, 0, 2, 1, 1, 0, 1, 2, 1, 1, 2, 0, 0, 0, 0, 1, 2, 1, 1, 2, 0, 2,
           0, 2, 2, 2, 2, 2, 0, 0])