I am trying to create a simple Naive Bayes classifier
for classifying data into two classes, as shown in the code below. But I am stuck with the following error. Can anybody tell me what I am doing wrong?
Traceback (most recent call last):
  File "NBC.py", line 33, in <module>
    test(['Apple', 'Banana'])
  File "NBC.py", line 16, in test
    prob_dist = classifier.prob_classify(lst)
  File "/home/***/.local/lib/python3.6/site-packages/nltk/classify/naivebayes.py", line 95, in prob_classify
    for fname in list(featureset.keys()):
AttributeError: 'list' object has no attribute 'keys'
"NBC.py"
from nltk.classify import NaiveBayesClassifier

dataFruits = ['Apple', 'Banana', 'Cherry', 'Grape', 'Guava',
              'Lemon', 'Mangos', 'Orange', 'Strawberry', 'Watermelon']
dataVeggies = ['Potato', 'Spinach', 'Carrot', 'Onion', 'Cabbage',
               'Barccoli', 'Tomatoe', 'Pea', 'Cucumber', 'Eggplant']

def create_features(word):
    my_dict = dict([(word, True)])
    return my_dict

def test(words):
    lst = [create_features(wd) for wd in words]
    prob_dist = classifier.prob_classify(lst)
    print(prob_dist.prob('fruit'))

class1 = [(create_features(item), 'fruit') for item in dataFruits]
#print(class1)
class2 = [(create_features(item), 'veggie') for item in dataVeggies]
#print(class2)

train_set = class1[:] + class2
print(train_set)

# Train
classifier = NaiveBayesClassifier.train(train_set)

# Predict
test(['Apple', 'Banana'])
What your code is trying to do is build a very simple classifier based on name features: based on its name, an item is classified either as a 'fruit' or as a 'veggie'. The training set contains a few names with their respective classes.
The error you're getting is due to the wrong format of your training set and test set: prob_classify expects a single featureset (a dictionary), but your test function hands it a list of dictionaries, hence the AttributeError. The training set is a list of featuresets (one featureset for each training example) and should have a structure of the form:

training_set = [featureset1, featureset2, ...]

Each featureset is a pair (features, class), where features is a dictionary {'f1': value1, 'f2': value2, ...} and class is the label. For instance, in the corrected code below, the featureset for 'Apple' is:
({'Apple': True,
'Banana': False,
'Broccoli': False,
'Cabbage': False,
'Carrot': False,
'Cherry': False,
'Cucumber': False,
'Eggplant': False,
'Grape': False,
'Guava': False,
'Lemon': False,
'Mangos': False,
'Onion': False,
'Orange': False,
'Pea': False,
'Potato': False,
'Spinach': False,
'Strawberry': False,
'Tomato': False,
'Watermelon': False},
'fruit')
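As a side note, the quickest way to make your original test function run is simply to classify one featureset at a time instead of passing the whole list. A minimal sketch, still using your original one-key create_features (so the featuresets are an assumption of the quick fix, and the probability estimates will not be very informative):

def test(words):
    for wd in words:
        # prob_classify takes a single featureset dict, not a list of dicts
        prob_dist = classifier.prob_classify(create_features(wd))
        print(wd, prob_dist.prob('fruit'))

For better behaviour, though, every featureset should list all known features, as in the dictionary above.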
Here is the corrected code:
from nltk.classify import NaiveBayesClassifier, accuracy

dataFruits = ['Apple', 'Banana', 'Cherry', 'Grape', 'Guava',
              'Lemon', 'Mangos', 'Orange', 'Strawberry', 'Watermelon']
dataVeggies = ['Potato', 'Spinach', 'Carrot', 'Onion', 'Cabbage',
               'Broccoli', 'Tomato', 'Pea', 'Cucumber', 'Eggplant']

def create_features(word, featureNames):
    # Start with every known feature set to False, then mark the given word as present
    my_dict = dict([(w, False) for w in featureNames])
    my_dict[word] = True
    return my_dict

def test(word):
    # A single featureset dictionary, which is what prob_classify expects
    lst = create_features(word, allFeatures)
    prob_dist = classifier.prob_classify(lst)
    print('{}'.format(word))
    print('Fruit probability: {:.2f}\tVeggie probability: {:.2f}'.format(
        prob_dist.prob('fruit'), prob_dist.prob('veggie')))
    return prob_dist

allFeatures = dataFruits + dataVeggies

class1 = [(create_features(item, allFeatures), 'fruit') for item in dataFruits]
class2 = [(create_features(item, allFeatures), 'veggie') for item in dataVeggies]
train_set = class1[:] + class2
test_set = [(create_features(item, allFeatures), 'fruit') for item in ['Apple', 'Banana']]

# Train
classifier = NaiveBayesClassifier.train(train_set)

# Predict
test('Strawberry')
test('Strawby')

# Accuracy on test set
print('Accuracy on test set: {:.2f}'.format(accuracy(classifier, test_set)))
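If you still want to classify several words in one call, as in your original test(['Apple', 'Banana']), NLTK classifiers also offer classify_many, which takes a list of featuresets. A small sketch, assuming the variables from the corrected code above are in scope:

words = ['Apple', 'Banana', 'Carrot']
featuresets = [create_features(w, allFeatures) for w in words]
# One label per featureset, e.g. ['fruit', 'fruit', 'veggie']
print(classifier.classify_many(featuresets))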
A slightly better classifier, and maybe what you were actually thinking of, follows the Document Classification example in http://www.nltk.org/book/ch06.html. Here the classifier simply predicts whether a basket contains more fruits or veggies. Based on this you can construct more complex classifiers (with better features and more training data).
from nltk.classify import NaiveBayesClassifier, accuracy

dataFruits = ['Apple', 'Banana', 'Cherry', 'Grape', 'Guava',
              'Lemon', 'Mangos', 'Orange', 'Strawberry', 'Watermelon']
dataVeggies = ['Potato', 'Spinach', 'Carrot', 'Onion', 'Cabbage',
               'Broccoli', 'Tomato', 'Pea', 'Cucumber', 'Eggplant']

def basket_features(basket):
    # One boolean feature per known item: does the basket contain it?
    basket_items = set(basket)
    features = {}
    for item in allFeatures:
        features['contains({})'.format(item)] = (item in basket_items)
    return features

def test(basket):
    lst = basket_features(basket)
    prob_dist = classifier.prob_classify(lst)
    print('Basket: {}'.format(basket))
    print('Fruit probability: {:.2f}\tVeggie probability: {:.2f}'.format(
        prob_dist.prob('fruit'), prob_dist.prob('veggie')))
    return prob_dist

allFeatures = dataFruits + dataVeggies

class1 = [(basket_features([item]), 'fruit') for item in dataFruits]
class2 = [(basket_features([item]), 'veggie') for item in dataVeggies]
train_set = class1[:] + class2

# Train
classifier = NaiveBayesClassifier.train(train_set)

# Predict
test(['Apple', 'Banana', 'Cherry', 'Carrot', 'Eggplant', 'Cabbage', 'Pea'])
test(['Apple', 'Banana', 'Mangos', 'Carrot', 'Eggplant', 'Cabbage', 'Pea', 'Cucumber'])
test(['Apple', 'Banana'])
test(['Apple', 'Banana', 'Grape'])

classifier.show_most_informative_features(5)
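If you want to evaluate this basket classifier on held-out data as well, you can hand-label a few baskets and reuse the accuracy helper imported above. A small sketch (the baskets and their labels below are made up purely for illustration):

# A tiny hand-labelled test set of baskets (illustrative only)
test_set = [(basket_features(['Apple', 'Grape', 'Lemon']), 'fruit'),
            (basket_features(['Onion', 'Pea', 'Spinach', 'Banana']), 'veggie')]
print('Accuracy on test set: {:.2f}'.format(accuracy(classifier, test_set)))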