I took some code from an online tutorial to do sentiment analysis on a Twitter dataset. When I first ran it I got errors on the print statements, which I traced to the fact that newer versions of Python changed the way print works. Now I am getting an error that suggests my data never actually makes it into the training arrays. If anyone who has worked with Python has an eagle eye for where I am going wrong, please help.
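(For reference, that first fix was just the switch from the Python 2 print statement to the Python 3 print() function, for example:

print "something"     # Python 2 syntax, a SyntaxError in Python 3
print("something")    # works in Python 3

The full script I am running is below.)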
import numpy as np
from copy import deepcopy
from string import punctuation
from random import shuffle
import chardet
from sklearn.manifold import TSNE
from sklearn.preprocessing import scale
import bokeh.plotting as bp
from bokeh.models import HoverTool, BoxSelectTool
from bokeh.plotting import figure, show, output_notebook
import gensim
from gensim.models.word2vec import Word2Vec
LabeledSentence = gensim.models.doc2vec.LabeledSentence
import pandas as pd
pd.options.mode.chained_assignment = None
from tqdm import tqdm
tqdm.pandas(desc="progress-bar")
from nltk.tokenize import TweetTokenizer
tokenizer = TweetTokenizer()
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
# needed for the Keras model further down (assuming the standalone keras package)
from keras.models import Sequential
from keras.layers import Dense
def ingest(filename):
    with open(filename, 'rb') as f:
        result = chardet.detect(f.read())
    data = pd.read_csv(filename, encoding=result['encoding'])
    data.drop(['ItemID', 'Date', 'Blank', 'SentimentSource'], axis=1, inplace=True)
    data = data[data.Sentiment.isnull() == False]
    data['Sentiment'] = data['Sentiment'].map({4:1, 0:0})
    data = data[data['SentimentText'].isnull() == False]
    data.reset_index(inplace=True)
    data.drop('index', axis=1, inplace=True)
    print('dataset loaded with shape {}', format(data.shape))
    return data
def tokenize(tweet):
    try:
        tweet = unicode(tweet.decode('utf-8').lower())
        tokens = tokenizer.tokenize(tweet)
        tokens = filter(lambda t: not t.startswith('@'), tokens)
        tokens = filter(lambda t: not t.startswith('#'), tokens)
        tokens = filter(lambda t: not t.startswith('http'), tokens)
        return tokens
    except:
        return 'NC'
def postprocess(data, n=100):
    data = data.head(n)
    data['tokens'] = data['SentimentText'].progress_map(tokenize)
    data = data[data.tokens != 'NC']
    data.reset_index(inplace=True)
    data.drop('index', inplace=True, axis=1)
    return data
def labelizeTweets(tweets, label_type):
    labelized = []
    for i,v in enumerate(tweets):
        label = '%s_%s'%(label_type,i)
        labelized.append(LabeledSentence(v, [label]))
        print(":::::::::::::::::::::::::")
    return labelized

# note: this second definition overrides the one above
def labelizeTweets(tweets, label_type):
    labelized = []
    for i,v in tqdm(enumerate(tweets)):
        label = '%s_%s'%(label_type,i)
        labelized.append(LabeledSentence(v, [label]))
    return labelized
def buildWordVector(tokens, size):
    # average the tf-idf-weighted word2vec vectors of a tweet's tokens
    vec = np.zeros(size).reshape((1, size))
    count = 0.
    for word in tokens:
        try:
            vec += tweet_w2v[word].reshape((1, size)) * tfidf[word]
            count += 1.
        except KeyError:
            # word not in the word2vec / tf-idf vocabulary
            continue
    if count != 0:
        vec /= count
    return vec
if __name__ == '__main__':
    filename = './training.csv'
    #n = 1000000
    n = 100
    n_dim = 200
    data = ingest(filename)
    #data = data.head(5)
    data = postprocess(data, n)
    x_train, x_test, y_train, y_test = train_test_split(np.array(data.head(n).tokens), np.array(data.head(n).Sentiment), test_size=0.2)
    print("training length X", len(x_train))
    print("training length Y", len(y_train))
    x_train = labelizeTweets(x_train, 'TRAIN')
    x_test = labelizeTweets(x_test, 'TEST')
    print("jljkjkjlkjlj", len(x_train))
    # train a word2vec model on the tokenized tweets
    tweet_w2v = Word2Vec(size=n_dim, min_count=10)
    #tweet_w2v.build_vocab([x.words for x in tqdm(x_train)])
    tweet_w2v.build_vocab([x.words for x in x_train])
    #tweet_w2v.train([x.words for x in tqdm(x_train)],total_examples=tweet_w2v.corpus_count, epochs=tweet_w2v.iter)
    tweet_w2v.train([x.words for x in x_train], total_examples=tweet_w2v.corpus_count, epochs=tweet_w2v.iter)
    print(tweet_w2v.most_similar('good'))
    if True:
        print('building tf-idf matrix ...')
        vectorizer = TfidfVectorizer(analyzer=lambda x: x, min_df=10)
        matrix = vectorizer.fit_transform([x.words for x in x_train])
        tfidf = dict(zip(vectorizer.get_feature_names(), vectorizer.idf_))
        print('vocab size :', len(tfidf))
    # build one averaged, tf-idf-weighted vector per tweet
    train_vecs_w2v = np.concatenate([buildWordVector(z, n_dim) for z in tqdm(map(lambda x: x.words, x_train))])
    train_vecs_w2v = scale(train_vecs_w2v)
    test_vecs_w2v = np.concatenate([buildWordVector(z, n_dim) for z in tqdm(map(lambda x: x.words, x_test))])
    test_vecs_w2v = scale(test_vecs_w2v)
    # simple feed-forward classifier on top of the tweet vectors
    model = Sequential()
    model.add(Dense(32, activation='relu', input_dim=200))
    model.add(Dense(1, activation='sigmoid'))
    model.compile(optimizer='rmsprop',
                  loss='binary_crossentropy',
                  metrics=['accuracy'])
    model.fit(train_vecs_w2v, y_train, epochs=20, batch_size=32, verbose=2)
    score = model.evaluate(test_vecs_w2v, y_test, batch_size=128, verbose=2)
    print(score[1])
    # bokeh scatter plot of a t-SNE projection of the word vectors
    output_notebook()
    plot_tfidf = bp.figure(plot_width=700, plot_height=600, title="A map of 10000 word vectors",
                           tools="pan,wheel_zoom,box_zoom,reset,hover,previewsave",
                           x_axis_type=None, y_axis_type=None, min_border=1)
    word_vectors = [tweet_w2v[w] for w in tweet_w2v.wv.vocab.keys()[:5000]]
    tsne_model = TSNE(n_components=2, verbose=1, random_state=0)
    tsne_w2v = tsne_model.fit_transform(word_vectors)
    tsne_df = pd.DataFrame(tsne_w2v, columns=['x', 'y'])
    tsne_df['words'] = tweet_w2v.wv.vocab.keys()[:5000]
    plot_tfidf.scatter(x='x', y='y', source=tsne_df)
    hover = plot_tfidf.select(dict(type=HoverTool))
    hover.tooltips = {"word": "@words"}
    show(plot_tfidf)
This is the error I am getting:
C:\Users\lenovo\AppData\Local\Programs\Python\Python35\lib\site-packages\gensim\utils.py:860: UserWarning: detected Windows; aliasing chunkize to chunkize_serial
warnings.warn("detected Windows; aliasing chunkize to chunkize_serial")
dataset loaded with shape {} (505, 2)
progress-bar: 100%|##########################################################################| 505/505 [00:00<?, ?it/s]
training length X 0
training length Y 0
0it [00:00, ?it/s]
0it [00:00, ?it/s]
jljkjkjlkjlj 0
Traceback (most recent call last):
  File "Sentiment_Analysis.py", line 127, in <module>
    tweet_w2v.train([x.words for x in x_train],total_examples=tweet_w2v.corpus_count, epochs=tweet_w2v.iter)
  File "C:\Users\lenovo\AppData\Local\Programs\Python\Python35\lib\site-packages\gensim\models\word2vec.py", line 951, in train
    raise RuntimeError("you must first build vocabulary before training the model")
RuntimeError: you must first build vocabulary before training the model
I had the same issue with the same code. There is nothing wrong with the code on the website itself, but under Python 3 it ends up with an empty vocabulary no matter how you rearrange things.
My workaround was simply to run the exact same code under Python 2.7 instead of 3.x, where it runs smoothly. However, if you do manage to port it to Python 3.x, you get faster data/memory access, which is quite desirable.
Edit: I found the problem, and it now works with Python 3 too. In Python 3, filter() returns a lazy iterator rather than a list, and str objects are already Unicode, so tokenize() needs two small adjustments. Change it to the following and the vocabulary should build without any issue:
def tokenize(tweet):
    try:
        tweet = tweet.lower()  # Python 3 strings are already Unicode; no decode()/unicode() needed
        tokens = tokenizer.tokenize(tweet)
        # wrap filter() in list() so the tokens are materialized, not a one-shot iterator
        tokens = list(filter(lambda t: not t.startswith('@'), tokens))
        tokens = list(filter(lambda t: not t.startswith('#'), tokens))
        tokens = list(filter(lambda t: not t.startswith('http'), tokens))
        return tokens
    except:
        # any failure (e.g. a non-string/NaN tweet) marks the row as unusable
        return 'NC'
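If you want to see why the list() wrapping matters, here is a quick illustration in a Python 3 interpreter (not part of the tutorial code, just a demonstration):

>>> tokens = filter(lambda t: not t.startswith('@'), ['@user', 'hello', 'world'])
>>> tokens
<filter object at 0x...>
>>> list(tokens)
['hello', 'world']
>>> list(tokens)          # the iterator is already exhausted on the second pass
[]

Without list(), the filtered tokens are a one-shot iterator: the first pass over them exhausts them, and every later pass sees nothing, which is how you end up training on an empty vocabulary.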