tensorflow, deep-learning, embedding, word2vec, lstm

Sentiment analysis on IMDB data using tflearn (LSTM, TensorFlow)


I am working with TensorFlow and some of the high-level APIs built on top of it, such as tflearn.

What I am trying to do here is to use an LSTM on the IMDB data for sentiment analysis. There is sample code at the following link: https://github.com/tflearn/tflearn/blob/master/examples/nlp/lstm.py

However, it uses preprocessed data, whereas I want to use the raw IMDB data (downloaded from http://ai.stanford.edu/~amaas/data/sentiment/).

Here is the code that I adapted for the sentiment analysis. All the intermediate steps seem correct, but the accuracy is not stable (as you can see in the results below). When I print the predictions at the end, I see that the probabilities for the two classes are very close to each other (like [[0.4999946355819702, 0.5000053644180298], [0.5000001192092896, 0.49999988079071045], [0.49999362230300903, 0.5000064373016357], [0.49999985098838806, 0.5000001192092896]]).

I don't think the problem is overfitting, since when I predict on the training data itself, the results look the same as above. I think I am missing something or doing something wrong.

Any help is appreciated. Thanks.

# -*- coding: utf-8 -*-
from __future__ import division, print_function, absolute_import

import tflearn
from tflearn.data_utils import to_categorical, pad_sequences
import string
import numpy as nm
import codecs
import re
import collections
import math
import tensorflow as tf
import random
import glob

allWords = []
allDocuments = []
allLabels = []

# Collect every lowercased, punctuation-stripped word from a file into allWords.
def readFile(fileName, allWords):
    file = codecs.open(fileName, encoding='utf-8')

    for line in file:
        line = line.lower().encode('utf-8')
        words = line.split()
        for word in words:
            word = word.translate(None, string.punctuation)
            if word != '':
                allWords.append(word)

    file.close()


# Convert one review file into a list of word indices and record its sentiment label.
def readFileToConvertWordsToIntegers(dictionary, fileName, allDocuments, allLabels, label):

    file = codecs.open(fileName, encoding='utf-8')
    document = []
    for line in file:
        line = line.lower().encode('utf-8')
        words = line.split()
        for word in words:
            word = word.translate(None, string.punctuation)
            if word in dictionary:
                index = dictionary[word]
            else:
                index = 0  # dictionary['UNK'] 
            document.append(index)
        allDocuments.append(document)
        allLabels.append(label)

    file.close()


vocabulary_size = 10000

# Build a word -> index dictionary for the vocabulary_size most frequent words.
def build_dataset(words):
  count = [['UNK', -1]]
  count.extend(collections.Counter(words).most_common(vocabulary_size - 1))
  dictionary = dict()
  for word, _ in count:
    dictionary[word] = len(dictionary)
  data = list()
  unk_count = 0
  for word in words:
    if word in dictionary:
      index = dictionary[word]
    else:
      index = 0  # dictionary['UNK']
      unk_count = unk_count + 1
    data.append(index)
  count[0][1] = unk_count
  reverse_dictionary = dict(zip(dictionary.values(), dictionary.keys())) 
  return dictionary, reverse_dictionary



fileList = glob.glob("/Users/inanc/Desktop/aclImdb/train/neg/*.txt")
for file in fileList:
    readFile(file, allWords)

fileList = glob.glob("/Users/inanc/Desktop/aclImdb/test/train/*.txt")
for file in fileList:
    readFile(file, allWords)

print(len(allWords))

dictionary, reverse_dictionary = build_dataset(allWords)
del allWords  # Hint to reduce memory.

print(len(dictionary))

fileList = glob.glob("/Users/inanc/Desktop/aclImdb/train/neg/*.txt")
for file in fileList:
    readFileToConvertWordsToIntegers(dictionary, file, allDocuments, allLabels, 0)

fileList = glob.glob("/Users/inanc/Desktop/aclImdb/train/pos/*.txt")
for file in fileList:
    readFileToConvertWordsToIntegers(dictionary, file, allDocuments, allLabels, 1)

print(len(allDocuments))
print(len(allLabels))

c = list(zip(allDocuments, allLabels))  # shuffle documents and labels together before partitioning

random.shuffle(c)

allDocuments, allLabels = zip(*c)

trainX = allDocuments[:22500]
testX = allDocuments[22500:]

trainY = allLabels[:22500]
testY = allLabels[22500:]


#counter=collections.Counter(trainY)
#print(counter)


trainY = to_categorical(trainY, nb_classes=2)
testY = to_categorical(testY, nb_classes=2)

trainX = pad_sequences(trainX, maxlen=100, value=0.)
testX = pad_sequences(testX, maxlen=100, value=0.)
# Converting labels to binary vectors
trainY = to_categorical(trainY, nb_classes=2)
testY = to_categorical(testY, nb_classes=2)

# Network building
net = tflearn.input_data([None, 100])
net = tflearn.embedding(net, input_dim=vocabulary_size, output_dim=128)
net = tflearn.lstm(net, 128, dropout=0.8)
net = tflearn.fully_connected(net, 2, activation='softmax')
net = tflearn.regression(net, optimizer='adam', learning_rate=0.001,
                         loss='categorical_crossentropy')

# Training
model = tflearn.DNN(net, tensorboard_verbose=0)
model.fit(trainX, trainY, validation_set=(testX, testY), show_metric=True,
          batch_size=32)
predictions = model.predict(trainX)
print(predictions)

Results:

--
Training Step: 704  | Adam | epoch: 001 | loss: 1.38629 - acc: 0.4698 | val_loss: 1.38629 - val_acc: 0.4925 -- iter: 22500/22500
--
Training Step: 1408 | Adam | epoch: 002 | loss: 1.38629 - acc: 0.8110 | val_loss: 1.38629 - val_acc: 0.9984 -- iter: 22500/22500
--
Training Step: 2112 | Adam | epoch: 003 | loss: 1.38629 - acc: 0.6303 | val_loss: 1.38629 - val_acc: 0.7382 -- iter: 22500/22500
--
Training Step: 2816 | Adam | epoch: 004 | loss: 1.38629 - acc: 0.5489 | val_loss: 1.38629 - val_acc: 0.2904 -- iter: 22500/22500
--
Training Step: 3520 | Adam | epoch: 005 | loss: 1.38629 - acc: 0.4848 | val_loss: 1.38629 - val_acc: 0.7828 -- iter: 22500/22500
--
Training Step: 4224 | Adam | epoch: 006 | loss: 1.38629 - acc: 0.5233 | val_loss: 1.38629 - val_acc: 0.9654 -- iter: 22500/22500
--
Training Step: 4928 | Adam | epoch: 007 | loss: 1.38629 - acc: 0.4400 | val_loss: 1.38629 - val_acc: 0.6725 -- iter: 22500/22500
--
Training Step: 5632 | Adam | epoch: 008 | loss: 1.38629 - acc: 0.4319 | val_loss: 1.38629 - val_acc: 0.5808 -- iter: 22500/22500
--
Training Step: 6336 | Adam | epoch: 009 | loss: 1.38629 - acc: 0.4765 | val_loss: 1.38629 - val_acc: 0.4833 -- iter: 22500/22500
--
Training Step: 7040 | Adam | epoch: 010 | loss: 1.38629 - acc: 0.5203 | val_loss: 1.38629 - val_acc: 0.2373 -- iter: 22500/22500

Solution

  • Ah, it's my fault. I typed the

    trainY = to_categorical(trainY, nb_classes=2) 
    testY = to_categorical(testY, nb_classes=2) 
    

    lines twice, so only one category existed afterwards. After I removed the repeated lines, the problem was solved.
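
    For reference, here is a minimal sketch of the corrected preprocessing order, using the same tflearn calls and variable names as in the script above, with each label array one-hot encoded exactly once:

    from tflearn.data_utils import to_categorical, pad_sequences

    # Pad/truncate every review to a fixed length of 100 word indices.
    trainX = pad_sequences(trainX, maxlen=100, value=0.)
    testX = pad_sequences(testX, maxlen=100, value=0.)

    # Convert the 0/1 labels to one-hot vectors exactly once; calling
    # to_categorical a second time on already one-hot labels corrupts the targets.
    trainY = to_categorical(trainY, nb_classes=2)
    testY = to_categorical(testY, nb_classes=2)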