Search code examples
pythonsentiment-analysisnaivebayes

Sentiment Analysis, Naive Bayes Accuracy


I'm trying to form a Naive Bayes Classifier script for sentiment classification of tweets. I'm pasting my whole code here, because I know I will get hell if I don't. So I basically I use NLTK's corpuses as training data, and then some tweets I scraped as test data. I pre-process them and do a bag of words extraction. The classifier is trained with no problem and when I do the following

print(classifier.classify(bag_of_words('This is magnificent')))  

it correctly outputs 'pos'.

Now my problem is how to calculate accuracy using ntlk.util accuracy. I do

print(nltk.classify.accuracy(classifier, proc_set))

and I get the following error:

  File "/Library/Frameworks/Python.framework/Versions/3.7/lib/python3.7/site-   packages/nltk/classify/util.py", line 87, in accuracy
  results = classifier.classify_many([fs for (fs, l) in gold])
  AttributeError: 'NaiveBayesClassifier' object has no attribute 'classify_many'

I also tried this

test_set_final=[]
for tweet in proc_test:
test_set_final.append((bag_of_words(tweet),   classifier.classify(bag_of_words(tweet))))

print(nltk.classify.accuracy(classifier, test_set_final))

and I get the same kind of error

print(nltk.classify.accuracy(classifier, test_set_final))
File "/Library/Frameworks/Python.framework/Versions/3.7/lib/python3.7/site-packages/nltk/classify/util.py", line 87, in accuracy
results = classifier.classify_many([fs for (fs, l) in gold])
AttributeError: 'NaiveBayesClassifier' object has no attribute 'classify_many'

I am 100% I am missing something extremely obvious for Machine Learners.I think t But it's been 3 days and I'm slowly losing my mind, so any help will be appreciated.

Code ->

import nltk
import ast
import string
import re
import csv
import textblob
import pandas as pd
import numpy as np
import itertools
from textblob import TextBlob
from textblob import Word
from textblob.classifiers import NaiveBayesClassifier
from nltk.corpus import twitter_samples
from nltk.corpus import stopwords
from nltk.corpus import wordnet as wd
from nltk.tokenize import word_tokenize
from sklearn.feature_extraction.text import CountVectorizer
from random import shuffle
from nltk.classify.util import accuracy
from autocorrect import spell

stopwords = stopwords.words('english')
lemmatizer = nltk.WordNetLemmatizer().lemmatize
punct=['"','$','%','&','\',''','(',')','+',',','-     ','.','/',':',';','<','=','>','@','[','\',','^','_','`','{','|','}','~']

emoticons_happy = set([
':)', ';)', ':o)', ':]', ':3', ':c)', ':>', '=]', '8)', '=)', ':}',
':^)', ':-D', ':D', ': D','8-D', '8D', 'x-D', 'xD', 'X-D', 'XD', '=-D', '=D',
'=-3', '=3', ':-))', ':-)', ":'-)", ":')", ':*', ':^*', '>:P', ':-P', ':P', 'X-P',
'x-p', 'xp', 'XP', ':-p', ':p', '=p', ':-b', ':b', '>:)', '>;)', '>:-)',
'<3',':*', ':p'
])

emoticons_sad = set([
':L', ':-/', '>:/', ':S', '>:[', ':@', ':-(', ':[', ':-||', '=L', ':<',
':-[', ':-<', '=\\', '=/', '>:(', ':-(', '>.<', ":'-(", ":'(", ':\\', ':-c',
':c', ':{', '>:\\', ';('
])
emoticons = emoticons_happy.union(emoticons_sad)


def pre_process(tweet):

    tweet = re.sub(r'https?:\/\/.*[\r\n]*', '', tweet)

    tweet = re.sub(r'#', '', tweet)

    tweet=''.join([i for i in tweet if not i.isdigit()])

    tweet=re.sub(r'([.,/#!$%^&*;:{}=_`~-])([.,/#!$%^&*;:{}=_`~-]+)\1+', r'\1',tweet)

    tweet = re.sub(r'@[A-Za-z0-9]+', '', tweet)

    tweet=''.join([i for i in tweet if i not in emoticons])

    tweet=''.join([i for i in tweet if i not in punct])

    tweet=' '.join([i for i in tweet.split() if i not in stopwords])

    tweet=tweet.lower()

    tweet=lemmatize(tweet)

    return tweet

def get_wordnet_pos(treebank_tag):
    if treebank_tag.startswith('J'):
        return wd.ADJ
    elif treebank_tag.startswith('V'):
        return wd.VERB
    elif treebank_tag.startswith('N'):
        return wd.NOUN
    elif treebank_tag.startswith('R'):
        return wd.ADV
    else:
        return wd.NOUN

def lemmatize(tt):
    pos = nltk.pos_tag(nltk.word_tokenize(tt))
    lemm = [lemmatizer(sw[0], get_wordnet_pos(sw[1])) for sw in pos]
    sentence= ' '.join([i for i in lemm])

    return sentence


test_tweets=[]
file=open('scraped_tweets.csv', 'r')
reader = csv.reader(file)
for line in reader:
    line=line[1]
    test_tweets.append(line)

pos_tweets = twitter_samples.strings('positive_tweets.json')
neg_tweets = twitter_samples.strings('negative_tweets.json')



proc_train_pos=[]
for tweet in pos_tweets:
    proc_train_pos.append(pre_process(tweet))
proc_train_neg=[]
for tweet in neg_tweets:
    proc_train_neg.append(pre_process(tweet))
proc_test=[]
for tweet in test_tweets:
    proc_test.append(pre_process(tweet))


def bag_of_words(tweet):
    words_dictionary = dict([word, True] for word in tweet.split())    
    return words_dictionary

pos_tweets_set = []
for tweet in proc_train_pos:
    pos_tweets_set.append((bag_of_words(tweet), 'pos'))    

neg_tweets_set = []
for tweet in proc_train_neg:
    neg_tweets_set.append((bag_of_words(tweet), 'neg'))

shuffle(pos_tweets_set)
shuffle(neg_tweets_set)
train_set = pos_tweets_set+neg_tweets_set

classifier = NaiveBayesClassifier(train_set)
print('Training is done')

#print(classifier.classify(bag_of_words('This is magnificent'))) #output 'pos'

print(nltk.classify.accuracy(classifier, proc_set))

Solution

  • Well, as the error message says, the classifier you are trying to use (NaiveBayesClassifier) doesn't have the method classify_many that the nltk.classify.util.accuracy function requires.

    (Reference: https://www.nltk.org/_modules/nltk/classify/naivebayes.html)

    Now, that looks like an NLTK bug, but you can get your answer easily on your own:

    from sklearn.metrics import accuracy_score
    
    y_predicted = [classifier.classify(x) for x in proc_set]
    
    accuracy = accuracy_score(y_true, y_predicted)
    

    Where y_true are the sentiment values corresponding to proc_set inputs (which I don't see you actually creating in your code shown above, though).

    Hope that helps.

    EDIT:

    Or, without using the sklearn accuracy function, but pure Python:

    hits = [yp == yt for yp, yt in zip(y_predicted, y_true)]
    
    accuracy = sum(hits)/len(hits)