Search code examples
python-2.7nltkmaxent

My Maxent Classifier works fine with gis algorithm but does not work with iis algorithm. It is not throwing any error, just some warnings


I am trying to use the Maxent classifier, but I am facing a problem when using the iis algorithm. The following code works fine with the gis algorithm, but not with iis as shown:

import nltk
from nltk.classify import MaxentClassifier, accuracy
from featx import split_label_feats, label_feats_from_corpus
from nltk.corpus import movie_reviews
from nltk.classify import megam
from openpyxl import load_workbook
from featx import bag_of_non_words  
from nltk.tokenize import word_tokenize
# Informational only: the returned categories are neither stored nor printed.
movie_reviews.categories()
# Build the labelled feature sets from the corpus (helper imported from `featx`).
lfeats = label_feats_from_corpus(movie_reviews)

# Informational only as well: the result of keys() is discarded.
lfeats.keys()
# Divide the labelled feature sets into a training and a test portion.
train_feats, test_feats = split_label_feats(lfeats)
# Train a MaxEnt classifier with algorithm='iis', tracing off, max_iter=3.
me_classifier = nltk.MaxentClassifier.train(train_feats, algorithm='iis', trace=0, max_iter=3)
# Report accuracy on the test portion (Python 2 print statement).
print accuracy(me_classifier, test_feats)

I am working on a WIN32 machine and the above code is from the NLTK book by Jacob Perkins. The warnings it produces are:

C:\Python27\lib\site-packages\nltk\classify\maxent.py:1308: RuntimeWarning: invalid value encountered in multiply
  sum1 = numpy.sum(exp_nf_delta * A, axis=0)
C:\Python27\lib\site-packages\nltk\classify\maxent.py:1309: RuntimeWarning: invalid value encountered in multiply
  sum2 = numpy.sum(nf_exp_nf_delta * A, axis=0)
C:\Python27\lib\site-packages\nltk\classify\maxent.py:1315: RuntimeWarning: invalid value encountered in divide
  deltas -= (ffreq_empirical - sum1) / -sum2

And then the computer hangs, so I have to stop the execution.

.


Solution

  • Firstly, the unsorted way you're importing your libraries is confusing, and there are a lot of unused imports. So, after some googling, let's cut down the imports and stick with this:

    from collections import defaultdict
    
    import nltk
    from nltk.classify import MaxentClassifier, accuracy
    from nltk.corpus import movie_reviews
    

    Then I found that featx is an example module that Jacob Perkins was using for his book; this is a better source (https://github.com/sophist114/Python/blob/master/EmotionAnalysis.py). So here's a documented version with some explanation of what the functions are doing:

    def bag_of_words(words):
        """
        Build a bag-of-words feature dict for a single document.

        Every distinct item in `words` becomes a key mapped to True.
        """
        return {token: True for token in words}
    
    
    def label_feats_from_corpus(corp, feature_detector=bag_of_words):
        """
        Collect one feature set per document, grouped by category label.

        For every category returned by `corp.categories()`, each file id in
        that category is read with `corp.words(...)`, passed through
        `feature_detector` (bag-of-words by default) and appended to the list
        stored under that label. This step is sometimes called vectorization.

        Returns a `defaultdict(list)` mapping label -> list of feature sets.
        """
        feats_by_label = defaultdict(list)
        for category in corp.categories():
            for doc_id in corp.fileids(categories=[category]):
                doc_words = corp.words(fileids=[doc_id])
                doc_feats = feature_detector(doc_words)
                feats_by_label[category].append(doc_feats)
        return feats_by_label
    
    
    def split_label_feats(lfeats, split=0.75):
        """
        Split labelled feature sets into a training and a test portion.

        Meant to be called on the mapping returned by `label_feats_from_corpus`.

        :param lfeats: mapping of label -> list of feature sets.
        :param split: fraction of each label's list that goes to training
            (the cutoff index is truncated to an int); the rest is test data.
        :return: `(train_feats, test_feats)`, two lists of
            `(feature_set, label)` tuples.
        """
        train_feats = []
        test_feats = []
        # `items()` exists on both Python 2 and Python 3 dicts, whereas
        # `iteritems()` is Python 2 only.
        for label, feats in lfeats.items():
            cutoff = int(len(feats) * split)
            train_feats.extend((feat, label) for feat in feats[:cutoff])
            test_feats.extend((feat, label) for feat in feats[cutoff:])
        return train_feats, test_feats
    

    Now let's go through the process of training the model and testing it, first, the feature extraction:

    # Extract a feature set for every document in the corpus, grouped under the document's label.
    label_feats = label_feats_from_corpus(movie_reviews)
    

    let's see what we get after calling label_feats_from_corpus:

    # Print only the first document of the first label; the two breaks stop both loops after one item.
    for label in label_feats:
        for document in label_feats[label]: 
            print label, document
            break
        break
    

    [out]:

    neg {u'all': True, u'concept': True, u'skip': True, u'go': True, u'seemed': True, u'suits': True, u'presents': True, u'to': True, u'sitting': True, u'very': True, u'horror': True, u'continues': True, u'every': True, u'exact': True, u'cool': True, u'entire': True, u'did': True, u'dig': True, u'flick': True, u'neighborhood': True, u'crow': True, u'street': True, u'video': True, u'further': True, u'even': True, u'what': True, u'hide': True, u'giving': True, u'new': True, u'ever': True, u'here': True, u'understanding': True, u'entertain': True, u'studio': True, u'others': True, u'kudos': True, u'weird': True, u'makes': True, u'explained': True, u'rarely': True, u'plot': True, u'fed': True, u'disappearances': True, u'from': True, u'would': True, u'&': True, u'two': True, u'music': True, u'films': True, u'themselves': True, u'until': True, u'more': True, u'teen': True, u'clue': True, u'stick': True, u'given': True, u'me': True, u'this': True, u'package': True, u'movies': True, u'making': True, u'my': True, u'give': True, u'fuck': True, u'want': True, u'sense': True, u'!': True, u'holds': True, u'write': True, u'how': True, u'hot': True, u'stir': True, u'okay': True, u'beauty': True, u'mess': True, u'overall': True, u'after': True, u'coming': True, u'such': True, u'guys': True, u'types': True, u'a': True, u'downshifts': True, u'chasing': True, u'redundant': True, u'so': True, u'enter': True, u'playing': True, u'executed': True, u'over': True, u'insight': True, u'years': True, u'still': True, u'its': True, u'before': True, u'thrilling': True, u'somewhere': True, u',': True, u'actually': True, u'meantime': True, u'production': True, u'main': True, u'might': True, u'then': True, u'good': True, u'break': True, u'they': True, u'half': True, u'not': True, u'now': True, u'always': True, u'didn': True, u'arrow': True, u'mean': True, u'bentley': True, u'generation': True, u'idea': True, u'engaging': True, u'happen': True, u'out': True, u"'": True, u'since': True, u'7': True, 
u'got': True, u'highway': True, u'shows': True, u'blair': True, u'turning': True, u'little': True, u'completely': True, u'shelves': True, u'starts': True, u'terribly': True, u'american': True, u'jumbled': True, u'chopped': True, u'one': True, u'fantasy': True, u'visions': True, u'guess': True, u'"': True, u'2': True, u'too': True, u'wrapped': True, u'final': True, u'slasher': True, u'that': True, u'explanation': True, u'took': True, u'part': True, u'attempt': True, u'10': True, u'kind': True, u'scenes': True, u'feeling': True, u'and': True, u'mind': True, u'sad': True, u'have': True, u'need': True, u'seem': True, u'apparently': True, u'-': True, u'also': True, u'which': True, u'sure': True, u'normal': True, u'who': True, u'most': True, u'don': True, u'drive': True, u'ways': True, u'entertaining': True, u'review': True, u'came': True, u'ending': True, u'find': True, u'touches': True, u'craziness': True, u'(': True, u'should': True, u'only': True, u'going': True, u'pretty': True, u'joblo': True, u'folks': True, u'8': True, u'do': True, u'his': True, u'get': True, u'watch': True, u'feels': True, u'despite': True, u'him': True, u'bad': True, u'where': True, u'lazy': True, u'see': True, u'decided': True, u'are': True, u'sorta': True, u'movie': True, u'nightmare': True, u'3': True, u'unravel': True, u'melissa': True, u'correctly': True, u'flicks': True, u'we': True, u'packaged': True, u'nightmares': True, u'genre': True, u'20': True, u'memento': True, u'both': True, u'accident': True, u's': True, u'witch': True, u'point': True, u'character': True, u'whatever': True, u'tons': True, u'simply': True, u'church': True, u'throughout': True, u'decent': True, u'been': True, u'.': True, u'secret': True, u'life': True, u'kids': True, u'personally': True, u'look': True, u'these': True, u'plain': True, u'harder': True, u'apparitions': True, u'while': True, u'neat': True, u've': True, u'is': True, u'it': True, u'couples': True, u'someone': True, u'in': True, u'chase': True, 
u'different': True, u')': True, u'things': True, u'make': True, u'same': True, u'member': True, u'strange': True, u'9': True, u'party': True, u'applaud': True, u'drink': True, u'director': True, u'running': True, u'characters': True, u'off': True, u'i': True, u'salvation': True, u'well': True, u'obviously': True, u'edge': True, u'echoes': True, u'the': True, u'away': True, u'just': True, u'generally': True, u'elm': True, u'excites': True, u'seems': True, u'snag': True, u'wes': True, u'4': True, u'has': True, u'big': True, u'showing': True, u'five': True, u'know': True, u'world': True, u'bit': True, u'password': True, u'dreams': True, u'like': True, u'lost': True, u'audience': True, u't': True, u'looooot': True, u'because': True, u'deal': True, u'people': True, u'back': True, u'dead': True, u'unraveling': True, u'critique': True, u'confusing': True, u'for': True, u'bottom': True, u'/': True, u'does': True, u'assuming': True, u'?': True, u'be': True, u'although': True, u'by': True, u'on': True, u'about': True, u'oh': True, u'of': True, u'runtime': True, u'or': True, u'own': True, u'strangeness': True, u'into': True, u'down': True, u'your': True, u'her': True, u'there': True, u'start': True, u'way': True, u'biggest': True, u':': True, u'head': True, u'offering': True, u'but': True, u'taken': True, u'line': True, u'trying': True, u'with': True, u'he': True, u'up': True, u'us': True, u'problem': True, u'minutes': True, u'figured': True, u'doesn': True, u'an': True, u'as': True, u'girlfriend': True, u'mold': True, u'sagemiller': True, u'film': True, u'again': True, u'no': True, u'when': True, u'actors': True, u'you': True, u'really': True, u'dies': True, u'problems': True, u'ago': True}
    

    So we get a document with the neg label, and for each word in our document we see that ALL words are True. For now, each document only contains the features (i.e. the words) that it has.

    Let's move on:

    # Split the data into train and test portions; each one is a list of (features, label) tuples.
    train_documents, test_documents = split_label_feats(label_feats)
    

    Now we see that split_label_feats changes the key-value structure so that each item of the training list is a tuple of (features, label):

    for features, label in train_documents:
        label, features
        break
    
    print len(train_documents)
    print len(test_documents)
    # Get the number of documents in movie_review corpus
    num_docs_in_corpus = len(list(chain(*[movie_reviews.fileids(categories=[cat]) for cat in movie_reviews.categories()])))
    print len(train_documents) + len(test_documents) == num_docs_in_corpus
    

    [out]:

    1500
    500
    True
    

    So it seems like the error can only be caused by your last two lines of code, when you run the line:

    # Train the MaxEnt classifier (algorithm='iis', tracing off, max_iter=3).
    me_classifier = nltk.MaxentClassifier.train(train_documents, algorithm='iis', trace=0, max_iter=3)
    

    You get these warnings, but do note that the code is still building the model! So these are just warnings due to arithmetic overflow/underflow in the computation, see What are arithmetic underflow and overflow in C?

    It takes a while to build the classifier, but fear not: just wait until it finishes, and don't press Ctrl + C to end the Python process. If you kill the process, you will see this:

    Training stopped: keyboard interrupt
    

    So let's understand why the warnings occur. There are 4 warnings given:

    /usr/local/lib/python2.7/dist-packages/nltk/classify/maxent.py:1306: RuntimeWarning: overflow encountered in power
      exp_nf_delta = 2 ** nf_delta
    /usr/local/lib/python2.7/dist-packages/nltk/classify/maxent.py:1308: RuntimeWarning: invalid value encountered in multiply
      sum1 = numpy.sum(exp_nf_delta * A, axis=0)
    /usr/local/lib/python2.7/dist-packages/nltk/classify/maxent.py:1309: RuntimeWarning: invalid value encountered in multiply
      sum2 = numpy.sum(nf_exp_nf_delta * A, axis=0)
    /usr/local/lib/python2.7/dist-packages/nltk/classify/maxent.py:1315: RuntimeWarning: invalid value encountered in divide
      deltas -= (ffreq_empirical - sum1) / -sum2
    

    All of them point to the same function, which is used to calculate delta in NLTK's maxent implementation, i.e. https://github.com/nltk/nltk/blob/develop/nltk/classify/maxent.py#L1208 . And you will find that this delta calculation is specific to the IIS (Improved Iterative Scaling) algorithm.

    At this point, you need to learn about machine learning and supervised learning, https://en.wikipedia.org/wiki/Supervised_learning

    To answer your question, the warning is merely an indication that delta is hard to calculate at some point, but it's still reasonable to deal with, possibly because of some super small values when calculating delta. The algorithm IS working. It's not hanging, it's training.

    In order to appreciate the neat implementation of MaxEnt in NLTK, I suggest you go through this course https://www.youtube.com/playlist?list=PL6397E4B26D00A269 or, for a more hardcore Machine Learning course, go to https://www.coursera.org/course/ml

    Training a classifier takes time and computing juice and after you wait long enough, you should see that it does:

    print accuracy(me_classifier, test_feats)
    

    [out]:

    0.5
    

    You can see that the accuracy is bad, as expected since the delta calculation is going too far; 0.5 is your baseline. Go through the courses listed above and you should be able to produce better classifiers once you know how they come about and how to tune them.

    BTW, remember to pickle your classifier so that you don't have to retrain it the next time, see Save Naive Bayes Trained Classifier in NLTK and Pickling a trained classifier yields different results from the results obtained directly from a newly but identically trained classifier

    Here's the full code:

    from itertools import chain
    from collections import defaultdict
    
    import nltk
    from nltk.classify import MaxentClassifier, accuracy
    from nltk.corpus import movie_reviews
    
    def bag_of_words(words):
        """
        Build a bag-of-words feature dict for a single document.

        Every distinct item in `words` becomes a key mapped to True.
        """
        return {token: True for token in words}
    
    
    def label_feats_from_corpus(corp, feature_detector=bag_of_words):
        """
        Collect one feature set per document, grouped by category label.

        For every category returned by `corp.categories()`, each file id in
        that category is read with `corp.words(...)`, passed through
        `feature_detector` (bag-of-words by default) and appended to the list
        stored under that label. This step is sometimes called vectorization.

        Returns a `defaultdict(list)` mapping label -> list of feature sets.
        """
        feats_by_label = defaultdict(list)
        for category in corp.categories():
            for doc_id in corp.fileids(categories=[category]):
                doc_words = corp.words(fileids=[doc_id])
                doc_feats = feature_detector(doc_words)
                feats_by_label[category].append(doc_feats)
        return feats_by_label
    
    
    def split_label_feats(lfeats, split=0.75):
        """
        Split labelled feature sets into a training and a test portion.

        Meant to be called on the mapping returned by `label_feats_from_corpus`.

        :param lfeats: mapping of label -> list of feature sets.
        :param split: fraction of each label's list that goes to training
            (the cutoff index is truncated to an int); the rest is test data.
        :return: `(train_feats, test_feats)`, two lists of
            `(feature_set, label)` tuples.
        """
        train_feats = []
        test_feats = []
        # `items()` exists on both Python 2 and Python 3 dicts, whereas
        # `iteritems()` is Python 2 only.
        for label, feats in lfeats.items():
            cutoff = int(len(feats) * split)
            train_feats.extend((feat, label) for feat in feats[:cutoff])
            test_feats.extend((feat, label) for feat in feats[cutoff:])
        return train_feats, test_feats
    
    
    # Extract a feature set for every document in the corpus, grouped under the document's label.
    label_feats = label_feats_from_corpus(movie_reviews)
    '''
    for label in label_feats:
        for document in label_feats[label]: 
            print label, document
            break
        break
    '''
    
    # Split the data into train and test portions; each one is a list of (features, label) tuples.
    train_documents, test_documents = split_label_feats(label_feats) 
    '''
    # Now we see that the `split_label_feats` change the key value structure such that each iteration of train_feats gives us a document with a tuple of the (features, label)
    for features, label in train_documents:
        print label, features
        break
    
    print len(train_documents)
    print len(test_documents)
    # Get the number of documents in movie_review corpus
    num_docs_in_corpus = len(list(chain(*[movie_reviews.fileids(categories=[cat]) for cat in movie_reviews.categories()])))
    print len(train_documents) + len(test_documents) == num_docs_in_corpus
    '''
    
    # To train the tagger.
    me_classifier = nltk.MaxentClassifier.train(train_documents, algorithm='iis', trace=0, max_iter=3)
    print accuracy(me_classifier, test_feats)