Search code examples
python nlp classification nltk

How to train Naive Bayes Classifier for n-gram (movie_reviews)


Below is the code for training a Naive Bayes classifier on the movie_reviews dataset using a unigram model. I want to train it and analyze its performance using bigram and trigram models as well. How can I do that?

import nltk.classify.util
from nltk.classify import NaiveBayesClassifier
from nltk.corpus import movie_reviews
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

def create_word_features(words):
    """Build a bag-of-words feature dict from a sequence of tokens.

    Tokens that appear in NLTK's English stopword list are dropped; every
    remaining token becomes a key mapped to ``True``.
    """
    # Build the stopword set once per call: evaluating stopwords.words()
    # inside the comprehension would re-read the list for every token, and
    # membership tests against a list are linear scans.
    stop_words = set(stopwords.words("english"))
    return {word: True for word in words if word not in stop_words}

# Pair each review's feature dict with its sentiment label.
pos_data = [
    (create_word_features(movie_reviews.words(review_id)), "positive")
    for review_id in movie_reviews.fileids('pos')
]
neg_data = [
    (create_word_features(movie_reviews.words(review_id)), "negative")
    for review_id in movie_reviews.fileids('neg')
]

# The first 800 reviews of each class train the model; the rest are held out.
train_set = pos_data[:800] + neg_data[:800]
test_set = pos_data[800:] + neg_data[800:]

classifier = NaiveBayesClassifier.train(train_set)

accuracy = nltk.classify.util.accuracy(classifier, test_set)

Solution

  • Simply change your featurizer

    from nltk import ngrams
    
    def create_ngram_features(words, n=2):
        """Return a dict mapping each distinct n-gram of ``words`` to ``True``."""
        return dict.fromkeys(ngrams(words, n), True)
    

    BTW, your code will be a lot faster if you change your featurizer to use a set for the stopword list and initialize it only once.

    # Stopword set built a single time and shared by every featurizer call.
    stoplist = set(stopwords.words("english"))

    def create_word_features(words):
        """Return a dict mapping each non-stopword token in ``words`` to ``True``."""
        features = {}
        for token in words:
            if token in stoplist:
                continue
            features[token] = True
        return features
    

    Someone should really tell the NLTK people to convert the stopwords list into a set type since it's "technically" a unique list (i.e. a set).

    >>> from nltk.corpus import stopwords
    >>> type(stopwords.words('english'))
    <class 'list'>
    >>> type(set(stopwords.words('english')))
    <class 'set'>
    

    For the fun of benchmarking

    import nltk.classify.util
    from nltk.classify import NaiveBayesClassifier
    from nltk.corpus import movie_reviews
    from nltk.corpus import stopwords
    from nltk.tokenize import word_tokenize
    from nltk import ngrams
    
    def create_ngram_features(words, n=2):
        """Return a feature dict with a ``True`` entry for every n-gram found in ``words``."""
        return {gram: True for gram in ngrams(words, n)}
    
    for n in (1, 2, 3, 4, 5):
        # Featurize both sentiment classes with n-grams of the current order.
        pos_data = [
            (create_ngram_features(movie_reviews.words(doc_id), n), "positive")
            for doc_id in movie_reviews.fileids('pos')
        ]
        neg_data = [
            (create_ngram_features(movie_reviews.words(doc_id), n), "negative")
            for doc_id in movie_reviews.fileids('neg')
        ]

        # The first 800 documents of each class train; the remainder is the test split.
        train_set = pos_data[:800] + neg_data[:800]
        test_set = pos_data[800:] + neg_data[800:]

        classifier = NaiveBayesClassifier.train(train_set)

        accuracy = nltk.classify.util.accuracy(classifier, test_set)
        print(str(n)+'-gram accuracy:', accuracy)
    

    [out]:

    1-gram accuracy: 0.735
    2-gram accuracy: 0.7625
    3-gram accuracy: 0.8275
    4-gram accuracy: 0.8125
    5-gram accuracy: 0.74
    

    Your original code returns an accuracy of 0.725.

    Combine several n-gram orders (everything from 1-grams up to n-grams) using everygrams

    import nltk.classify.util
    from nltk.classify import NaiveBayesClassifier
    from nltk.corpus import movie_reviews
    from nltk.corpus import stopwords
    from nltk.tokenize import word_tokenize
    from nltk import everygrams
    
    def create_ngram_features(words, n=2):
        """Return a feature dict keyed by every 1-gram through n-gram of ``words``."""
        all_orders = everygrams(words, 1, n)
        return dict.fromkeys(all_orders, True)
    
    for n in range(1, 6):
        # Featurize each class with all n-gram orders from 1 up to n.
        pos_ids = movie_reviews.fileids('pos')
        pos_data = [
            (create_ngram_features(movie_reviews.words(doc_id), n), "positive")
            for doc_id in pos_ids
        ]

        neg_ids = movie_reviews.fileids('neg')
        neg_data = [
            (create_ngram_features(movie_reviews.words(doc_id), n), "negative")
            for doc_id in neg_ids
        ]

        # Same split as before: first 800 per class for training, the rest for testing.
        train_set = pos_data[:800] + neg_data[:800]
        test_set = pos_data[800:] + neg_data[800:]
        classifier = NaiveBayesClassifier.train(train_set)

        accuracy = nltk.classify.util.accuracy(classifier, test_set)
        print('1-gram to', str(n)+'-gram accuracy:', accuracy)
    

    [out]:

    1-gram to 1-gram accuracy: 0.735
    1-gram to 2-gram accuracy: 0.7625
    1-gram to 3-gram accuracy: 0.7875
    1-gram to 4-gram accuracy: 0.8
    1-gram to 5-gram accuracy: 0.82