Tags: python, twitter, nltk, sentiment-analysis, word2vec

Sentiment analysis code (word2vec) not working properly in my Python version (vocabulary not built)


I took some code I found online to do sentiment analysis on a Twitter dataset. When I first ran it I got errors on the print statements, which I fixed once I realised that newer versions of Python turned print into a function. Now I am getting an error that suggests my data never makes it into the training arrays. If anyone experienced with Python has an eagle eye for where I am going wrong, please help.

    import numpy as np 
    from copy import deepcopy
    from string import punctuation
    from random import shuffle
    import chardet
    from sklearn.manifold import TSNE
    from sklearn.preprocessing import scale


    import bokeh.plotting as bp
    from bokeh.models import HoverTool, BoxSelectTool
    from bokeh.plotting import figure, show, output_notebook

    import gensim
    from gensim.models.word2vec import Word2Vec 
    LabeledSentence = gensim.models.doc2vec.LabeledSentence 

    import pandas as pd 
    pd.options.mode.chained_assignment = None

    from tqdm import tqdm
    tqdm.pandas(desc="progress-bar")

    from nltk.tokenize import TweetTokenizer 
    tokenizer = TweetTokenizer()

    from sklearn.model_selection import train_test_split
    from sklearn.feature_extraction.text import TfidfVectorizer

    # Keras supplies the small classifier used further down
    # (these imports were missing from the snippet as posted)
    from keras.models import Sequential
    from keras.layers import Dense

    def ingest(filename):
        with open(filename, 'rb') as f:
            result = chardet.detect(f.read())
        data = pd.read_csv(filename, encoding=result['encoding'])
        data.drop(['ItemID', 'Date', 'Blank', 'SentimentSource'], axis=1, inplace=True)
        data = data[data.Sentiment.isnull() == False]
        data['Sentiment'] = data['Sentiment'].map({4:1, 0:0})
        data = data[data['SentimentText'].isnull() == False]
        data.reset_index(inplace=True)
        data.drop('index', axis=1, inplace=True)
        print('dataset loaded with shape {}', format(data.shape)) 

        return data

    def tokenize(tweet):
        try:
            tweet = unicode(tweet.decode('utf-8').lower())
            tokens = tokenizer.tokenize(tweet)
            tokens = filter(lambda t: not t.startswith('@'), tokens)
            tokens = filter(lambda t: not t.startswith('#'), tokens)
            tokens = filter(lambda t: not t.startswith('http'), tokens)
            return tokens
        except:
            return 'NC'

    def postprocess(data, n=100):
        data = data.head(n)
        data['tokens'] = data['SentimentText'].progress_map(tokenize)  
        data = data[data.tokens != 'NC']
        data.reset_index(inplace=True)
        data.drop('index', inplace=True, axis=1)
        return data


    def labelizeTweets(tweets, label_type):
        labelized = []
        for i,v in  enumerate(tweets):
            label = '%s_%s'%(label_type,i)
            labelized.append(LabeledSentence(v, [label]))
            print(":::::::::::::::::::::::::")
        return labelized


    def labelizeTweets(tweets, label_type):
        labelized = []
        for i,v in tqdm(enumerate(tweets)):
            label = '%s_%s'%(label_type,i)
            labelized.append(LabeledSentence(v, [label]))
        return labelized


    def buildWordVector(tokens, size):
        vec = np.zeros(size).reshape((1, size))
        count = 0.
        for word in tokens:
            try:
                vec += tweet_w2v[word].reshape((1, size)) * tfidf[word]
                count += 1.
            except KeyError: 

                continue
        if count != 0:
            vec /= count
        return vec



    if __name__ == '__main__':

        filename = './training.csv'

        #n = 1000000
        n = 100
        n_dim = 200

        data = ingest(filename)
        #data = data.head(5)
        data = postprocess(data, n)

        x_train, x_test, y_train, y_test = train_test_split(np.array(data.head(n).tokens), np.array(data.head(n).Sentiment), test_size=0.2)


        print("training length X", len(x_train))

        print("training length Y", len(y_train))


        x_train = labelizeTweets(x_train, 'TRAIN')
        x_test = labelizeTweets(x_test, 'TEST')

        print("jljkjkjlkjlj", len(x_train))

        tweet_w2v = Word2Vec(size=n_dim, min_count=10)
        #tweet_w2v.build_vocab([x.words for x in tqdm(x_train)])
        tweet_w2v.build_vocab([x.words for x in x_train])

        #tweet_w2v.train([x.words for x in tqdm(x_train)],total_examples=tweet_w2v.corpus_count, epochs=tweet_w2v.iter)
        tweet_w2v.train([x.words for x in x_train],total_examples=tweet_w2v.corpus_count, epochs=tweet_w2v.iter)




        print(tweet_w2v.most_similar('good'))

        if True:
            print('building tf-idf matrix ...')
            vectorizer = TfidfVectorizer(analyzer=lambda x: x, min_df=10)
            matrix = vectorizer.fit_transform([x.words for x in x_train])
            tfidf = dict(zip(vectorizer.get_feature_names(), vectorizer.idf_))
            print('vocab size :', len(tfidf))

            train_vecs_w2v = np.concatenate([buildWordVector(z, n_dim) for z in tqdm(map(lambda x: x.words, x_train))])
            train_vecs_w2v = scale(train_vecs_w2v)

            test_vecs_w2v = np.concatenate([buildWordVector(z, n_dim) for z in tqdm(map(lambda x: x.words, x_test))])
            test_vecs_w2v = scale(test_vecs_w2v)

            model = Sequential()
            model.add(Dense(32, activation='relu', input_dim=200))
            model.add(Dense(1, activation='sigmoid'))
            model.compile(optimizer='rmsprop',
                                        loss='binary_crossentropy',
                                        metrics=['accuracy'])

            model.fit(train_vecs_w2v, y_train, epochs=20, batch_size=32, verbose=2)

            score = model.evaluate(test_vecs_w2v, y_test, batch_size=128, verbose=2)
            print (score[1])

    output_notebook()
    plot_tfidf = bp.figure(plot_width=700, plot_height=600, title="A map of 10000 word vectors",
        tools="pan,wheel_zoom,box_zoom,reset,hover,previewsave",
        x_axis_type=None, y_axis_type=None, min_border=1)

    word_vectors = [tweet_w2v[w] for w in tweet_w2v.wv.vocab.keys()[:5000]]

    tsne_model = TSNE(n_components=2, verbose=1, random_state=0)
    tsne_w2v = tsne_model.fit_transform(word_vectors)

    tsne_df = pd.DataFrame(tsne_w2v, columns=['x', 'y'])
    tsne_df['words'] = tweet_w2v.wv.vocab.keys()[:5000]

    plot_tfidf.scatter(x='x', y='y', source=tsne_df)
    hover = plot_tfidf.select(dict(type=HoverTool))
    hover.tooltips={"word": "@words"}
    show(plot_tfidf)

This is the output and error I am getting; note that the training set lengths already print as 0, so the arrays are never filled:

    C:\Users\lenovo\AppData\Local\Programs\Python\Python35\lib\site-packages\gensim\utils.py:860: UserWarning: detected Windows; aliasing chunkize to chunkize_serial
      warnings.warn("detected Windows; aliasing chunkize to chunkize_serial")
    dataset loaded with shape {} (505, 2)
    progress-bar: 100%|##########################################################################| 505/505 [00:00<?, ?it/s]
    training length X 0
    training length Y 0
    0it [00:00, ?it/s]
    0it [00:00, ?it/s]
    jljkjkjlkjlj 0
    Traceback (most recent call last):
      File "Sentiment_Analysis.py", line 127, in <module>
        tweet_w2v.train([x.words for x in x_train],total_examples=tweet_w2v.corpus_count, epochs=tweet_w2v.iter)
      File "C:\Users\lenovo\AppData\Local\Programs\Python\Python35\lib\site-packages\gensim\models\word2vec.py", line 951, in train
        raise RuntimeError("you must first build vocabulary before training the model")
    RuntimeError: you must first build vocabulary before training the model

Solution

  • I had the same issue with the same code. There is nothing wrong with the code as published, but under Python 3 it returns an empty vocabulary no matter what you try.

    My workaround at first was simply to run the exact same code under Python 2.7 instead of 3.x, where it runs smoothly. That said, porting it to Python 3.x is worthwhile if you manage it, since you get faster data/memory access rates.

    Edit: I found the problem, and it now works with Python 3 too. Two things in tokenize() rely on Python 2 behaviour: strings are already Unicode in Python 3, so the unicode(tweet.decode('utf-8')) call raises and sends every tweet to the except branch as 'NC' (which is why the training arrays end up empty), and filter() returns a lazy iterator rather than a list. Edit the function to the following and the vocabulary builds without any issue:

        def tokenize(tweet):
            try:
                # Strings are already Unicode in Python 3, so the old
                # unicode(tweet.decode('utf-8')) call is not needed (and would
                # raise, sending every tweet to the 'NC' branch below).
                tweet = tweet.lower()
                tokens = tokenizer.tokenize(tweet)
                # filter() returns a lazy iterator in Python 3, so materialise
                # each result as a list before handing it on.
                tokens = list(filter(lambda t: not t.startswith('@'), tokens))
                tokens = list(filter(lambda t: not t.startswith('#'), tokens))
                tokens = list(filter(lambda t: not t.startswith('http'), tokens))
                return tokens
            except:
                return 'NC'
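
    For reference, here is a minimal standalone sketch (not from the original tutorial; the token list is made up) of why the list() wrappers matter: in Python 3, filter() returns a lazy, single-use iterator rather than a list.

        # Illustration of the Python 2 vs Python 3 behaviour of filter().
        tokens = ['@user', '#hashtag', 'http://t.co/abc', 'great', 'movie']

        filtered = filter(lambda t: not t.startswith(('@', '#', 'http')), tokens)
        print(filtered)        # Python 3: <filter object ...>; Python 2: ['great', 'movie']

        # A filter object is consumed the first time it is iterated, so any code
        # that walks the tokens more than once (build_vocab() and then train())
        # sees nothing on the second pass.
        print(list(filtered))  # ['great', 'movie']
        print(list(filtered))  # [] -- the iterator is already exhausted

        # Wrapping each filter() call in list(), as in the edited tokenize()
        # above, restores the Python 2 behaviour.

    Not part of this fix, but the visualisation block at the end of the question has the same Python 2 assumption: dict.keys() cannot be sliced in Python 3, so tweet_w2v.wv.vocab.keys()[:5000] would need to become list(tweet_w2v.wv.vocab.keys())[:5000].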