I'm trying to put together a Naive Bayes classifier script for sentiment classification of tweets. I'm pasting my whole code here, because I know I will get hell if I don't. So basically I use NLTK's corpora as training data, and some tweets I scraped as test data. I pre-process them and do a bag-of-words extraction. The classifier trains with no problem, and when I do the following
print(classifier.classify(bag_of_words('This is magnificent')))
it correctly outputs 'pos'.
Now my problem is how to calculate accuracy with nltk.classify.util's accuracy function. I do
print(nltk.classify.accuracy(classifier, proc_set))
and I get the following error:
File "/Library/Frameworks/Python.framework/Versions/3.7/lib/python3.7/site- packages/nltk/classify/util.py", line 87, in accuracy
results = classifier.classify_many([fs for (fs, l) in gold])
AttributeError: 'NaiveBayesClassifier' object has no attribute 'classify_many'
I also tried this
test_set_final = []
for tweet in proc_test:
    test_set_final.append((bag_of_words(tweet), classifier.classify(bag_of_words(tweet))))
print(nltk.classify.accuracy(classifier, test_set_final))
and I get the same kind of error
print(nltk.classify.accuracy(classifier, test_set_final))
File "/Library/Frameworks/Python.framework/Versions/3.7/lib/python3.7/site-packages/nltk/classify/util.py", line 87, in accuracy
results = classifier.classify_many([fs for (fs, l) in gold])
AttributeError: 'NaiveBayesClassifier' object has no attribute 'classify_many'
I am 100% sure I am missing something extremely obvious to machine learning people, but it's been 3 days and I'm slowly losing my mind, so any help will be appreciated.
Code ->
import nltk
import ast
import string
import re
import csv
import textblob
import pandas as pd
import numpy as np
import itertools
from textblob import TextBlob
from textblob import Word
from textblob.classifiers import NaiveBayesClassifier
from nltk.corpus import twitter_samples
from nltk.corpus import stopwords
from nltk.corpus import wordnet as wd
from nltk.tokenize import word_tokenize
from sklearn.feature_extraction.text import CountVectorizer
from random import shuffle
from nltk.classify.util import accuracy
from autocorrect import spell
stopwords = stopwords.words('english')
lemmatizer = nltk.WordNetLemmatizer().lemmatize
punct = ['"', '$', '%', '&', '\\', "'", '(', ')', '+', ',', '-', '.', '/', ':',
         ';', '<', '=', '>', '@', '[', ']', '^', '_', '`', '{', '|', '}', '~']
emoticons_happy = set([
':)', ';)', ':o)', ':]', ':3', ':c)', ':>', '=]', '8)', '=)', ':}',
':^)', ':-D', ':D', ': D','8-D', '8D', 'x-D', 'xD', 'X-D', 'XD', '=-D', '=D',
'=-3', '=3', ':-))', ':-)', ":'-)", ":')", ':*', ':^*', '>:P', ':-P', ':P', 'X-P',
'x-p', 'xp', 'XP', ':-p', ':p', '=p', ':-b', ':b', '>:)', '>;)', '>:-)',
'<3',':*', ':p'
])
emoticons_sad = set([
':L', ':-/', '>:/', ':S', '>:[', ':@', ':-(', ':[', ':-||', '=L', ':<',
':-[', ':-<', '=\\', '=/', '>:(', ':-(', '>.<', ":'-(", ":'(", ':\\', ':-c',
':c', ':{', '>:\\', ';('
])
emoticons = emoticons_happy.union(emoticons_sad)
def pre_process(tweet):
    tweet = re.sub(r'https?:\/\/.*[\r\n]*', '', tweet)        # strip URLs
    tweet = re.sub(r'#', '', tweet)                           # strip hash signs
    tweet = ''.join([i for i in tweet if not i.isdigit()])    # strip digits
    # collapse runs of repeated punctuation down to a single character
    tweet = re.sub(r'([.,/#!$%^&*;:{}=_`~-])([.,/#!$%^&*;:{}=_`~-]+)\1+', r'\1', tweet)
    tweet = re.sub(r'@[A-Za-z0-9]+', '', tweet)               # strip @mentions
    # filter emoticons token by token; a character-wise join can never
    # match multi-character emoticons like ':)'
    tweet = ' '.join([w for w in tweet.split() if w not in emoticons])
    tweet = ''.join([i for i in tweet if i not in punct])     # strip punctuation
    # note: stopwords are removed before lowercasing, so capitalized
    # stopwords like 'This' survive
    tweet = ' '.join([i for i in tweet.split() if i not in stopwords])
    tweet = tweet.lower()
    tweet = lemmatize(tweet)
    return tweet
def get_wordnet_pos(treebank_tag):
    if treebank_tag.startswith('J'):
        return wd.ADJ
    elif treebank_tag.startswith('V'):
        return wd.VERB
    elif treebank_tag.startswith('N'):
        return wd.NOUN
    elif treebank_tag.startswith('R'):
        return wd.ADV
    else:
        return wd.NOUN
def lemmatize(tt):
    pos = nltk.pos_tag(nltk.word_tokenize(tt))
    lemm = [lemmatizer(sw[0], get_wordnet_pos(sw[1])) for sw in pos]
    return ' '.join(lemm)
test_tweets = []
with open('scraped_tweets.csv', 'r') as f:     # close the file when done
    reader = csv.reader(f)
    for line in reader:
        test_tweets.append(line[1])            # tweet text is in the second column
pos_tweets = twitter_samples.strings('positive_tweets.json')
neg_tweets = twitter_samples.strings('negative_tweets.json')
proc_train_pos = [pre_process(tweet) for tweet in pos_tweets]
proc_train_neg = [pre_process(tweet) for tweet in neg_tweets]
proc_test = [pre_process(tweet) for tweet in test_tweets]
def bag_of_words(tweet):
    # NLTK-style featureset: a dict mapping each word to True
    return dict((word, True) for word in tweet.split())
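# For example (the sanity check from the question):
# bag_of_words('This is magnificent')  ->  {'This': True, 'is': True, 'magnificent': True}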
pos_tweets_set = [(bag_of_words(tweet), 'pos') for tweet in proc_train_pos]
neg_tweets_set = [(bag_of_words(tweet), 'neg') for tweet in proc_train_neg]
shuffle(pos_tweets_set)
shuffle(neg_tweets_set)
train_set = pos_tweets_set+neg_tweets_set
classifier = NaiveBayesClassifier(train_set)
print('Training is done')
#print(classifier.classify(bag_of_words('This is magnificent'))) #output 'pos'
print(nltk.classify.accuracy(classifier, proc_set))
Well, as the error message says, the classifier you are trying to use doesn't have the classify_many method that the nltk.classify.util.accuracy function requires (reference: https://www.nltk.org/_modules/nltk/classify/naivebayes.html).
At first glance that looks like an NLTK bug, but look at your imports: your NaiveBayesClassifier comes from textblob.classifiers, not from NLTK, and TextBlob's wrapper only exposes classify. (Also note that your second attempt labels the test set with the classifier's own predictions, so even if accuracy ran there, it would trivially report 1.0.) In any case, you can get your answer easily on your own:
from sklearn.metrics import accuracy_score
y_predicted = [classifier.classify(x) for x in proc_set]
accuracy = accuracy_score(y_true, y_predicted)
where y_true are the gold sentiment labels corresponding to the proc_set inputs (which I don't see you actually creating anywhere in the code shown above, though).
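For instance, if your scraped CSV had a gold label column (hypothetical here; adjust the column indices to however your file is actually laid out), you could build both lists in one pass:

# Hypothetical layout: tweet text in column 1, a 'pos'/'neg' label in column 2
proc_set, y_true = [], []
with open('scraped_tweets.csv', 'r') as f:
    for row in csv.reader(f):
        proc_set.append(pre_process(row[1]))
        y_true.append(row[2])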
Hope that helps.
EDIT:
Or, without using the sklearn accuracy function, in pure Python:
hits = [yp == yt for yp, yt in zip(y_predicted, y_true)]
accuracy = sum(hits)/len(hits)
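Alternatively, if you want nltk.classify.accuracy itself to work, train NLTK's own NaiveBayesClassifier instead of TextBlob's; it does implement classify_many. A minimal sketch, assuming a labeled test set in the same (featureset, label) format as your train_set:

import nltk
from nltk.classify import NaiveBayesClassifier  # NLTK's classifier, not TextBlob's

# train_set and test_set are lists of (bag_of_words(tweet), 'pos'/'neg') pairs,
# built the same way as pos_tweets_set/neg_tweets_set in the question
classifier = NaiveBayesClassifier.train(train_set)
print(nltk.classify.accuracy(classifier, test_set))

Note that NaiveBayesClassifier.train is a class method in NLTK, which is why the construction differs from TextBlob's NaiveBayesClassifier(train_set).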