python, nlp, nltk, tokenize, sentiment-analysis

Issue with tokenizing words with NLTK in Python. Returning lists of single letters instead of words


I'm having some trouble with my NLP Python program. I am trying to create a dataset of positive and negative tweets, but when I run the code it only returns what appear to be tokenized individual letters. I am new to Python and NLP, so I apologise if this is basic or if I'm explaining myself poorly. I have added my code below:

import csv
import random
import re
import string
import mysql.connector
from nltk import FreqDist, classify, NaiveBayesClassifier
from nltk.corpus import stopwords
from nltk.stem.wordnet import WordNetLemmatizer
from nltk.tag import pos_tag
from nltk.tokenize import word_tokenize


def remove_noise(tweet_tokens, stop_words=()):
    cleaned_tokens = []
    for token, tag in pos_tag(tweet_tokens):
        token = re.sub('http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+#]|[!*\(\),]|' \
                  '(?:%[0-9a-fA-F][0-9a-fA-F]))+', '', token)
        token = re.sub("(@[A-Za-z0-9_]+)", "", token)

        if tag.startswith("NN"):
            pos = 'n'
        elif tag.startswith('VB'):
            pos = 'v'
        else:
            pos = 'a'

        lemmatizer = WordNetLemmatizer()
        token = lemmatizer.lemmatize(token, pos)

        if len(token) > 0 and token not in string.punctuation and token.lower() not in stop_words:
            cleaned_tokens.append(token.lower())
    print(token)
    return cleaned_tokens


def get_all_words(cleaned_tokens_list):
    for tokens in cleaned_tokens_list:
        for token in tokens:
            yield token


def get_tweets_for_model(cleaned_tokens_list):
    for tweet_tokens in cleaned_tokens_list:
        yield dict([token, True] for token in tweet_tokens)


if __name__ == "__main__":


with open('positive_tweets.csv') as csv_file:
    positive_tweets = csv.reader(csv_file, delimiter=',')
with open('negative_tweets.csv') as csv_file:
    negative_tweets = csv.reader(csv_file, delimiter=',')

stop_words = stopwords.words('english')

positive_tweet_tokens = word_tokenize(positive_tweets)
negative_tweet_tokens = word_tokenize(negative_tweets)

positive_cleaned_tokens_list = []
negative_cleaned_tokens_list = []

for tokens in positive_tweet_tokens:
    positive_cleaned_tokens_list.append(remove_noise(tokens, stop_words))

for tokens in negative_tweet_tokens:
    negative_cleaned_tokens_list.append(remove_noise(tokens, stop_words))

all_pos_words = get_all_words(positive_cleaned_tokens_list)
all_neg_words = get_all_words(negative_cleaned_tokens_list)

freq_dist_pos = FreqDist(all_pos_words)
freq_dist_neg = FreqDist(all_neg_words)
print(freq_dist_pos.most_common(10))
print(freq_dist_neg.most_common(10))

positive_tokens_for_model = get_tweets_for_model(positive_cleaned_tokens_list)
negative_tokens_for_model = get_tweets_for_model(negative_cleaned_tokens_list)

positive_dataset = [(tweet_dict, 'positive')
                    for tweet_dict in positive_tokens_for_model]

negative_dataset = [(tweet_dict, 'negative')
                    for tweet_dict in negative_tokens_for_model]

dataset = positive_dataset + negative_dataset

random.shuffle(dataset)

train_data = dataset[:7000]
test_data = dataset[7000:]

classifier = NaiveBayesClassifier.train(train_data)

print("Accuracy is:", classify.accuracy(classifier, test_data))

A snippet from the CSV file, for reference:

    "tweetid","username","created_at","tweet","location","place","classification"
"1285666943073161216","MeFixerr","2020-07-21 20:04:20+00:00","Overwhelmed by all the calls, msgs and tweets. I apologize for getting lost without prior notice. Did not expect to be missed with such fervor. 
I am good & taking a break. Lots of love and dua's for everyone of you in #PTIFamily ❤","Pakistan, Quetta",,"positive"

Solution

  • Your tokens come from the file name ('positive_tweets.csv'), not from the data inside the file. Add a print statement like the one below and you will see the issue.

    positive_tweet_tokens = word_tokenize(positive_tweets)
    negative_tweet_tokens = word_tokenize(negative_tweets)
    print("tokens=", positive_tweet_tokens)  # add this line
    

    Output from the full script:

    tokens= ['positive_tweets.csv']
    v
    v
    [('e', 3), ('v', 2), ('p', 1), ('w', 1), ('c', 1)]
    [('e', 4), ('v', 2), ('n', 1), ('g', 1), ('w', 1), ('c', 1)]
    Accuracy is: 0
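
    To make the failure mode concrete, here is a minimal, self-contained reproduction (not part of the original script) of what happens when the tokenizer is handed the file name instead of the tweet text:

    from nltk import pos_tag
    from nltk.tokenize import word_tokenize

    text = 'positive_tweets.csv'     # the file name, standing in for what actually reaches word_tokenize
    tokens = word_tokenize(text)
    print(tokens)                    # ['positive_tweets.csv'] - the file name is the only "token"

    # Each element of that list is a bare string, so pos_tag() walks it character
    # by character, which is where the single-letter tokens come from.
    for tweet in tokens:
        print(pos_tag(tweet)[:5])    # one (character, tag) pair per letter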
    

    Concerning the second error: the CSV files are opened, but their contents are never actually read into positive_tweets and negative_tweets. Replace this

    with open('positive_tweets.csv') as csv_file:
        positive_tweets = csv.reader(csv_file, delimiter=',')
    with open('negative_tweets.csv') as csv_file:
        negative_tweets = csv.reader(csv_file, delimiter=',')
    

    with this

    positive_tweets = negative_tweets = ""

    with open('positive_tweets.csv') as csv_file:
        positive_tweets_rdr = csv.reader(csv_file, delimiter=',')
        rows = list(positive_tweets_rdr)        # read all rows while the file is still open
        for row in rows[1:]:                    # skip the header row
            positive_tweets += ' ' + row[3]     # the "tweet" column

    with open('negative_tweets.csv') as csv_file:
        negative_tweets_rdr = csv.reader(csv_file, delimiter=',')
        rows = list(negative_tweets_rdr)
        for row in rows[1:]:
            negative_tweets += ' ' + row[3]     # the "tweet" column
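
    As a design note, this builds one big string per class. If you would rather keep one token list per tweet, so that the later loop for tokens in positive_tweet_tokens: hands each tweet's tokens to remove_noise() separately, a sketch along the following lines should also work; it assumes the same file names and the same "tweet" column index, and makes the later word_tokenize(positive_tweets) / word_tokenize(negative_tweets) calls unnecessary:

    positive_tweet_tokens = []
    with open('positive_tweets.csv') as csv_file:
        reader = csv.reader(csv_file, delimiter=',')
        next(reader)                                             # skip the header row
        for row in reader:
            positive_tweet_tokens.append(word_tokenize(row[3]))  # tokenize the "tweet" column per tweet

    negative_tweet_tokens = []
    with open('negative_tweets.csv') as csv_file:
        reader = csv.reader(csv_file, delimiter=',')
        next(reader)                                             # skip the header row
        for row in reader:
            negative_tweet_tokens.append(word_tokenize(row[3]))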