I am trying to implement a Naive Bayes classifier with NLTK.
When I print out the most informative features, some of them are assigned the value None. Why is that?
I am using the bag-of-words model: when I output the features, every feature is assigned True.
Where does the None come from?
I read that "The feature value 'None' is reserved for unseen feature values" here: http://www.nltk.org/_modules/nltk/classify/naivebayes.html
What does that mean?
from nltk.corpus.reader import CategorizedPlaintextCorpusReader
import nltk.classify.util
from nltk.classify import NaiveBayesClassifier
from nltk.corpus import movie_reviews
import nltk.data
from nltk.corpus import stopwords
import collections
from nltk.classify.util import accuracy
import itertools
from nltk.collocations import BigramCollocationFinder
from nltk.metrics import BigramAssocMeasures
import nltk.metrics
def bag_of_words(words):
    return dict([(word, True) for word in words])
def bag_of_words_not_in_set(words, badwords):
    return bag_of_words(set(words) - set(badwords))
def bag_of_words_without_stopwords(words):
    badwords = stopwords.words("german")
    return bag_of_words_not_in_set(words, badwords)
def label_feats_from_corpus(corp, feature_detector=bag_of_words_without_stopwords):
    label_feats = collections.defaultdict(list)
    for label in corp.categories():
        for fileid in corp.fileids(categories=[label]):
            feats = feature_detector(corp.words(fileids=[fileid]))
            label_feats[label].append(feats)
    return label_feats
def split_label_feats(lfeats, split=0.75):
    train_feats = []
    test_feats = []
    for label, feats in lfeats.items():
        cutoff = int(len(feats) * split)
        train_feats.extend([(feat, label) for feat in feats[:cutoff]])
        test_feats.extend([(feat, label) for feat in feats[cutoff:]])
    return train_feats, test_feats
reader = CategorizedPlaintextCorpusReader('D:/corpus/', r'.*\.txt', cat_pattern=r'(\w+)/*')
all_words = nltk.FreqDist(w.lower() for w in reader.words())
def bigram_word_feats(words, score_fn=BigramAssocMeasures.chi_sq, n=200):
    bigram_finder = BigramCollocationFinder.from_words(words)
    bigrams = bigram_finder.nbest(score_fn, n)
    return dict([(ngram, True) for ngram in itertools.chain(words, bigrams)])
bigrams = bigram_word_feats(reader.words())
lfeats = label_feats_from_corpus(reader)
train_feats, test_feats = split_label_feats(lfeats, split=0.75)
print(len(train_feats))
nb_classifier = NaiveBayesClassifier.train(train_feats)
print("------------------------")
acc = accuracy(nb_classifier, test_feats)
print(acc)
print("------------------------")
feats = nb_classifier.most_informative_features(n=25)
for feat in feats:
    print(feat)  # some are None
print("------------------------")
nb_classifier.show_most_informative_features(n=25)  # some are None
I think the full docstring for the NaiveBayesClassifier class explains:
If the classifier encounters an input with a feature that has never been seen with any label, then rather than assigning a probability of 0 to all labels, it will ignore that feature.
The feature value 'None' is reserved for unseen feature values; you generally should not use 'None' as a feature value for one of your own features.
If your data contain a feature that was never associated with a label, the value of that feature will be None. Suppose you train a classifier with features W and X, and then classify something with features W, X, and Z. The value None will be used for feature Z because that feature was never seen in training.
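Here is a minimal, self-contained sketch of that behaviour. The features w, x, z and the labels are made up purely for illustration (they are not from your corpus); only the NLTK calls themselves come from the library:

from nltk.classify import NaiveBayesClassifier

# Toy data: the feature 'x' only ever occurs in 'pos' instances, so for the
# 'neg' instances NLTK records the value None for 'x' during training.
train = [
    ({'w': True, 'x': True}, 'pos'),
    ({'w': True, 'x': True}, 'pos'),
    ({'w': True}, 'neg'),
    ({'w': True}, 'neg'),
]

nb = NaiveBayesClassifier.train(train)

# ('x', None) appears here: the value None (i.e. the absence of 'x') is
# informative, because it was only ever recorded for 'neg' instances.
print(nb.most_informative_features())
nb.show_most_informative_features()

# A feature never seen in training ('z') is simply ignored at classification time.
print(nb.classify({'w': True, 'z': True}))

This is the same situation as in your bag-of-words features: a word that occurs in some training documents but not in others gets the value None for the documents where it is missing, and that (word, None) pair can rank among the most informative features.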
Further Explanation:
Seeing None does not surprise me, because language data are sparse. In a corpus of movie reviews, there will be words that appear in only 1 or 2 documents. For example, an actor's name or a word from the title might appear in only one review.
Removing frequent (stop) words and infrequent words from a corpus prior to analysis is common. For their topic model of the journal Science, Blei and Lafferty (2007) write: "The total vocabulary size in this collection is 375,144 terms. We trim the 356,195 terms that occurred fewer than 70 times as well as 296 stop words."
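If you want to do the same kind of trimming before building your bag-of-words features, one possible sketch looks like this. The min_count threshold, the helper names, and the use of German stopwords (to match your code) are illustrative choices, not a prescribed recipe:

from nltk.probability import FreqDist
from nltk.corpus import stopwords

def build_vocabulary(all_words, min_count=5):
    # Keep only words that occur at least min_count times in the whole corpus
    # and are not stopwords; the threshold is an arbitrary example value.
    fd = FreqDist(w.lower() for w in all_words)
    stop = set(stopwords.words("german"))
    return {w for w, c in fd.items() if c >= min_count and w not in stop}

def bag_of_words_in_vocab(words, vocab):
    # Same bag-of-words idea as in the question, restricted to the trimmed vocabulary.
    return {w.lower(): True for w in words if w.lower() in vocab}

# Possible usage with the reader from the question:
# vocab = build_vocabulary(reader.words(), min_count=5)
# lfeats = label_feats_from_corpus(
#     reader, feature_detector=lambda words: bag_of_words_in_vocab(words, vocab))

With a trimmed vocabulary, far fewer features exist that are present for one label and absent for another, so you will see fewer (word, None) entries among the most informative features.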