I am trying to implement the Maxent Classifier, but I am facing a problem while using the iis algorithm. The following code works fine for the gis algorithm.
import nltk
from nltk.classify import MaxentClassifier, accuracy
from featx import split_label_feats, label_feats_from_corpus
from nltk.corpus import movie_reviews
from nltk.classify import megam
from openpyxl import load_workbook
from featx import bag_of_non_words
from nltk.tokenize import word_tokenize
# The result of this call is not stored or printed; it only shows something
# when evaluated at an interactive prompt.
movie_reviews.categories()
# Build the labelled features with the helper imported from `featx`.
lfeats = label_feats_from_corpus(movie_reviews)
# Same as above: the keys are discarded unless run interactively.
lfeats.keys()
# Split the labelled features into training and test portions.
train_feats, test_feats = split_label_feats(lfeats)
# Train with algorithm='iis', trace=0 and max_iter=3 -- this is the call
# that emits the RuntimeWarnings quoted below.
me_classifier = nltk.MaxentClassifier.train(train_feats, algorithm='iis', trace=0, max_iter=3)
# Report accuracy on the held-out test portion.
print accuracy(me_classifier, test_feats)
I am working on a WIN32 machine and the above code is from NLTK book by Jacob Perkins. The warning thrown by it is
C:\Python27\lib\site-packages\nltk\classify\maxent.py:1308: RuntimeWarning: invalid value encountered in multiply
sum1 = numpy.sum(exp_nf_delta * A, axis=0)
C:\Python27\lib\site-packages\nltk\classify\maxent.py:1309: RuntimeWarning: invalid value encountered in multiply
sum2 = numpy.sum(nf_exp_nf_delta * A, axis=0)
C:\Python27\lib\site-packages\nltk\classify\maxent.py:1315: RuntimeWarning: invalid value encountered in divide
deltas -= (ffreq_empirical - sum1) / -sum2
And then the computer hangs, so I have to stop the execution.
.
Firstly, your imports are unsorted, which makes them confusing to read, and a lot of them are unused. So let's cut down the imports and stick with these:
from collections import defaultdict
import nltk
from nltk.classify import MaxentClassifier, accuracy
from nltk.corpus import movie_reviews
After some googling, I found that `featx` is an example module that Jacob Perkins was using for his book; this is a better source (https://github.com/sophist114/Python/blob/master/EmotionAnalysis.py). So here's a documented version with some explanation of what the functions are doing:
def bag_of_words(words):
    """
    Change a document into a BOW feature vector represented by a dict object.

    :param words: iterable of the word tokens that make up one document.
    :return: dict mapping every distinct word to ``True``; repeated words
        collapse into a single key.
    """
    # A dict comprehension avoids building a throwaway list of pairs first.
    return {word: True for word in words}
def label_feats_from_corpus(corp, feature_detector=bag_of_words):
    """
    Turn a categorized corpus into per-label lists of feature dicts.

    This step is sometimes called vectorization. By default every document
    is converted with bag-of-words features.

    :param corp: corpus object providing ``categories()``,
        ``fileids(categories=...)`` and ``words(fileids=...)``.
    :param feature_detector: callable applied to the words of one document.
    :return: ``defaultdict(list)`` mapping each label to the list of
        feature-detector results for that label's documents.
    """
    features_by_label = defaultdict(list)
    for category in corp.categories():
        for doc_id in corp.fileids(categories=[category]):
            tokens = corp.words(fileids=[doc_id])
            features_by_label[category].append(feature_detector(tokens))
    return features_by_label
def split_label_feats(lfeats, split=0.75):
    """
    Splits corpus into train and test portion.
    This function is used after using `label_feats_from_corpus`.

    :param lfeats: mapping of label -> list of feature sets for that label.
    :param split: fraction of each label's feature sets that goes into the
        training portion; the remainder goes into the test portion.
    :return: ``(train_feats, test_feats)``, two lists of
        ``(features, label)`` tuples.
    """
    train_feats = []
    test_feats = []
    # `.items()` works on both Python 2 and 3; `.iteritems()` is Python 2 only.
    for label, feats in lfeats.items():
        # Split per label, so every label keeps the same train/test ratio.
        cutoff = int(len(feats) * split)
        train_feats.extend((feat, label) for feat in feats[:cutoff])
        test_feats.extend((feat, label) for feat in feats[cutoff:])
    return train_feats, test_feats
Now let's go through the process of training the model and testing it, first, the feature extraction:
# Extract features from every document in the corpus, grouped by label
# (label -> list of per-document feature dicts).
label_feats = label_feats_from_corpus(movie_reviews)
Let's see what we get after calling `label_feats_from_corpus`:
# Show only the first document of the first label: both loops stop after
# a single iteration.
for label in label_feats:
    for document in label_feats[label]:
        # Single-argument print form behaves the same on Python 2 and 3.
        print("%s %s" % (label, document))
        break
    break
[out]:
neg {u'all': True, u'concept': True, u'skip': True, u'go': True, u'seemed': True, u'suits': True, u'presents': True, u'to': True, u'sitting': True, u'very': True, u'horror': True, u'continues': True, u'every': True, u'exact': True, u'cool': True, u'entire': True, u'did': True, u'dig': True, u'flick': True, u'neighborhood': True, u'crow': True, u'street': True, u'video': True, u'further': True, u'even': True, u'what': True, u'hide': True, u'giving': True, u'new': True, u'ever': True, u'here': True, u'understanding': True, u'entertain': True, u'studio': True, u'others': True, u'kudos': True, u'weird': True, u'makes': True, u'explained': True, u'rarely': True, u'plot': True, u'fed': True, u'disappearances': True, u'from': True, u'would': True, u'&': True, u'two': True, u'music': True, u'films': True, u'themselves': True, u'until': True, u'more': True, u'teen': True, u'clue': True, u'stick': True, u'given': True, u'me': True, u'this': True, u'package': True, u'movies': True, u'making': True, u'my': True, u'give': True, u'fuck': True, u'want': True, u'sense': True, u'!': True, u'holds': True, u'write': True, u'how': True, u'hot': True, u'stir': True, u'okay': True, u'beauty': True, u'mess': True, u'overall': True, u'after': True, u'coming': True, u'such': True, u'guys': True, u'types': True, u'a': True, u'downshifts': True, u'chasing': True, u'redundant': True, u'so': True, u'enter': True, u'playing': True, u'executed': True, u'over': True, u'insight': True, u'years': True, u'still': True, u'its': True, u'before': True, u'thrilling': True, u'somewhere': True, u',': True, u'actually': True, u'meantime': True, u'production': True, u'main': True, u'might': True, u'then': True, u'good': True, u'break': True, u'they': True, u'half': True, u'not': True, u'now': True, u'always': True, u'didn': True, u'arrow': True, u'mean': True, u'bentley': True, u'generation': True, u'idea': True, u'engaging': True, u'happen': True, u'out': True, u"'": True, u'since': True, u'7': True, 
u'got': True, u'highway': True, u'shows': True, u'blair': True, u'turning': True, u'little': True, u'completely': True, u'shelves': True, u'starts': True, u'terribly': True, u'american': True, u'jumbled': True, u'chopped': True, u'one': True, u'fantasy': True, u'visions': True, u'guess': True, u'"': True, u'2': True, u'too': True, u'wrapped': True, u'final': True, u'slasher': True, u'that': True, u'explanation': True, u'took': True, u'part': True, u'attempt': True, u'10': True, u'kind': True, u'scenes': True, u'feeling': True, u'and': True, u'mind': True, u'sad': True, u'have': True, u'need': True, u'seem': True, u'apparently': True, u'-': True, u'also': True, u'which': True, u'sure': True, u'normal': True, u'who': True, u'most': True, u'don': True, u'drive': True, u'ways': True, u'entertaining': True, u'review': True, u'came': True, u'ending': True, u'find': True, u'touches': True, u'craziness': True, u'(': True, u'should': True, u'only': True, u'going': True, u'pretty': True, u'joblo': True, u'folks': True, u'8': True, u'do': True, u'his': True, u'get': True, u'watch': True, u'feels': True, u'despite': True, u'him': True, u'bad': True, u'where': True, u'lazy': True, u'see': True, u'decided': True, u'are': True, u'sorta': True, u'movie': True, u'nightmare': True, u'3': True, u'unravel': True, u'melissa': True, u'correctly': True, u'flicks': True, u'we': True, u'packaged': True, u'nightmares': True, u'genre': True, u'20': True, u'memento': True, u'both': True, u'accident': True, u's': True, u'witch': True, u'point': True, u'character': True, u'whatever': True, u'tons': True, u'simply': True, u'church': True, u'throughout': True, u'decent': True, u'been': True, u'.': True, u'secret': True, u'life': True, u'kids': True, u'personally': True, u'look': True, u'these': True, u'plain': True, u'harder': True, u'apparitions': True, u'while': True, u'neat': True, u've': True, u'is': True, u'it': True, u'couples': True, u'someone': True, u'in': True, u'chase': True, 
u'different': True, u')': True, u'things': True, u'make': True, u'same': True, u'member': True, u'strange': True, u'9': True, u'party': True, u'applaud': True, u'drink': True, u'director': True, u'running': True, u'characters': True, u'off': True, u'i': True, u'salvation': True, u'well': True, u'obviously': True, u'edge': True, u'echoes': True, u'the': True, u'away': True, u'just': True, u'generally': True, u'elm': True, u'excites': True, u'seems': True, u'snag': True, u'wes': True, u'4': True, u'has': True, u'big': True, u'showing': True, u'five': True, u'know': True, u'world': True, u'bit': True, u'password': True, u'dreams': True, u'like': True, u'lost': True, u'audience': True, u't': True, u'looooot': True, u'because': True, u'deal': True, u'people': True, u'back': True, u'dead': True, u'unraveling': True, u'critique': True, u'confusing': True, u'for': True, u'bottom': True, u'/': True, u'does': True, u'assuming': True, u'?': True, u'be': True, u'although': True, u'by': True, u'on': True, u'about': True, u'oh': True, u'of': True, u'runtime': True, u'or': True, u'own': True, u'strangeness': True, u'into': True, u'down': True, u'your': True, u'her': True, u'there': True, u'start': True, u'way': True, u'biggest': True, u':': True, u'head': True, u'offering': True, u'but': True, u'taken': True, u'line': True, u'trying': True, u'with': True, u'he': True, u'up': True, u'us': True, u'problem': True, u'minutes': True, u'figured': True, u'doesn': True, u'an': True, u'as': True, u'girlfriend': True, u'mold': True, u'sagemiller': True, u'film': True, u'again': True, u'no': True, u'when': True, u'actors': True, u'you': True, u'really': True, u'dies': True, u'problems': True, u'ago': True}
So we get a document with the `neg` label, and for each word in our document we see that ALL words are True. For now, each document only contains the features (i.e. the words) that it actually has.
Let's move on:
# Let's split the data up into train and test.
# The names must match what the following steps use: the feature dict is
# `label_feats`, and the results are read back as `train_documents` and
# `test_documents`.
train_documents, test_documents = split_label_feats(label_feats)
Now we see that `split_label_feats` changes the key-value structure so that each item of the training list is a tuple of (features, label) for one document:
# Peek at the first (features, label) pair only; the feature dict is long.
for features, label in train_documents:
    print("%s %s" % (label, features))
    break
print(len(train_documents))
print(len(test_documents))
# Get the number of documents in movie_review corpus by summing the
# per-category file counts.
num_docs_in_corpus = sum(len(movie_reviews.fileids(categories=[cat])) for cat in movie_reviews.categories())
# Sanity check: the split must neither lose nor duplicate documents.
print(len(train_documents) + len(test_documents) == num_docs_in_corpus)
[out]:
1500
500
True
So it seems like the error can only be caused by your last two lines of code, when you run the line:
# To train the classifier (IIS algorithm, trace=0, max_iter=3).
me_classifier = nltk.MaxentClassifier.train(train_documents, algorithm='iis', trace=0, max_iter=3)
You get these warnings, but do note that the code is still building the model! They are just warnings caused by numeric overflow/underflow; see "What are arithmetic underflow and overflow in C?"
It takes a while to build the classifier, but fear not: just wait until it has finished and don't press Ctrl + C to end the Python process. If you kill the process, you will see this:
Training stopped: keyboard interrupt
So let's understand why the warning occurs, there are 4 warnings given:
/usr/local/lib/python2.7/dist-packages/nltk/classify/maxent.py:1306: RuntimeWarning: overflow encountered in power
exp_nf_delta = 2 ** nf_delta
/usr/local/lib/python2.7/dist-packages/nltk/classify/maxent.py:1308: RuntimeWarning: invalid value encountered in multiply
sum1 = numpy.sum(exp_nf_delta * A, axis=0)
/usr/local/lib/python2.7/dist-packages/nltk/classify/maxent.py:1309: RuntimeWarning: invalid value encountered in multiply
sum2 = numpy.sum(nf_exp_nf_delta * A, axis=0)
/usr/local/lib/python2.7/dist-packages/nltk/classify/maxent.py:1315: RuntimeWarning: invalid value encountered in divide
deltas -= (ffreq_empirical - sum1) / -sum2
All of them point to the same function used to calculate delta in NLTK's maxent implementation, i.e. https://github.com/nltk/nltk/blob/develop/nltk/classify/maxent.py#L1208 . You will find that this delta calculation is specific to the IIS (Improved Iterative Scaling) algorithm.
At this point, you need to learn about machine learning and supervised learning, https://en.wikipedia.org/wiki/Supervised_learning
To answer your question, the warning is merely an indication that delta is hard to calculate at some point but it's still reasonable to deal with, possibly because of some super small values when calculating delta. The algorithm IS working. It's not hanging, it's training.
In order to appreciate the neat implementation of MaxEnt in NLTK, I suggest you go through this course https://www.youtube.com/playlist?list=PL6397E4B26D00A269 or, for a more hardcore machine learning course, go to https://www.coursera.org/course/ml
Training a classifier takes time and computing juice and after you wait long enough, you should see that it does:
# Evaluate on the held-out documents. The test split is bound to
# `test_documents` in this walkthrough; `test_feats` is never defined.
print(accuracy(me_classifier, test_documents))
[out]:
0.5
You can see that the accuracy is bad, as expected since delta calculation is going too far, 0.5 is your baseline. Go through the courses as listed above and you should be able to produce better classifiers after knowing how they come about and how to tune them.
BTW, remember to pickle your classifier so that you don't have to retrain it the next time, see Save Naive Bayes Trained Classifier in NLTK and Pickling a trained classifier yields different results from the results obtained directly from a newly but identically trained classifier
Here's the full code:
from itertools import chain
from collections import defaultdict
import nltk
from nltk.classify import MaxentClassifier, accuracy
from nltk.corpus import movie_reviews
def bag_of_words(words):
    """
    Change a document into a BOW feature vector represented by a dict object.

    :param words: iterable of the word tokens that make up one document.
    :return: dict mapping every distinct word to ``True``; repeated words
        collapse into a single key.
    """
    # A dict comprehension avoids building a throwaway list of pairs first.
    return {word: True for word in words}
def label_feats_from_corpus(corp, feature_detector=bag_of_words):
    """
    Turn a categorized corpus into per-label lists of feature dicts.

    This step is sometimes called vectorization. By default every document
    is converted with bag-of-words features.

    :param corp: corpus object providing ``categories()``,
        ``fileids(categories=...)`` and ``words(fileids=...)``.
    :param feature_detector: callable applied to the words of one document.
    :return: ``defaultdict(list)`` mapping each label to the list of
        feature-detector results for that label's documents.
    """
    features_by_label = defaultdict(list)
    for category in corp.categories():
        for doc_id in corp.fileids(categories=[category]):
            tokens = corp.words(fileids=[doc_id])
            features_by_label[category].append(feature_detector(tokens))
    return features_by_label
def split_label_feats(lfeats, split=0.75):
    """
    Splits corpus into train and test portion.
    This function is used after using `label_feats_from_corpus`.

    :param lfeats: mapping of label -> list of feature sets for that label.
    :param split: fraction of each label's feature sets that goes into the
        training portion; the remainder goes into the test portion.
    :return: ``(train_feats, test_feats)``, two lists of
        ``(features, label)`` tuples.
    """
    train_feats = []
    test_feats = []
    # `.items()` works on both Python 2 and 3; `.iteritems()` is Python 2 only.
    for label, feats in lfeats.items():
        # Split per label, so every label keeps the same train/test ratio.
        cutoff = int(len(feats) * split)
        train_feats.extend((feat, label) for feat in feats[:cutoff])
        test_feats.extend((feat, label) for feat in feats[cutoff:])
    return train_feats, test_feats
# Extract features from corpus and for each document label it with the appropriate labels.
label_feats = label_feats_from_corpus(movie_reviews)

# Disabled debugging snippet (a bare string literal, never executed):
# shows the first document of the first label.
'''
for label in label_feats:
    for document in label_feats[label]:
        print label, document
        break
    break
'''

# Let's split the data up into train and test.
train_documents, test_documents = split_label_feats(label_feats)

# Disabled debugging snippet (a bare string literal, never executed):
# inspects the split and checks that no document was lost.
'''
# Now we see that the `split_label_feats` change the key value structure such that each iteration of train_feats gives us a document with a tuple of the (features, label)
for features, label in train_documents:
    print label, features
    break
print len(train_documents)
print len(test_documents)
# Get the number of documents in movie_review corpus
num_docs_in_corpus = len(list(chain(*[movie_reviews.fileids(categories=[cat]) for cat in movie_reviews.categories()])))
print len(train_documents) + len(test_documents) == num_docs_in_corpus
'''

# To train the classifier.
me_classifier = nltk.MaxentClassifier.train(train_documents, algorithm='iis', trace=0, max_iter=3)
# Evaluate on the held-out split. It is bound to `test_documents` above;
# `test_feats` does not exist at module level and would raise NameError
# after the long training run.
print(accuracy(me_classifier, test_documents))