I am working with TensorFlow and some of the high-level APIs built on top of it, such as TFLearn.
What I am trying to do is use an LSTM on the IMDB data for sentiment analysis. There is sample code at the following link: https://github.com/tflearn/tflearn/blob/master/examples/nlp/lstm.py
However, that example uses preprocessed data, whereas I want to use the raw IMDB data (downloaded from http://ai.stanford.edu/~amaas/data/sentiment/).
Here is the code that I adapted for the sentiment analysis. All the intermediate steps seem correct, but the accuracy is not stable (see the results below). When I print the predictions at the end, the probabilities for the two classes are extremely close to each other (e.g. [[0.4999946355819702, 0.5000053644180298], [0.5000001192092896, 0.49999988079071045], [0.49999362230300903, 0.5000064373016357], [0.49999985098838806, 0.5000001192092896]]).
I don't think the problem is overfitting, because I get the same kind of result when I predict on the training data itself. I think I am missing something or doing something wrong.
Any help is appreciated. Thanks.
# -*- coding: utf-8 -*-
from __future__ import division, print_function, absolute_import
import tflearn
from tflearn.data_utils import to_categorical, pad_sequences
import string
import numpy as nm
import codecs
import re
import collections
import math
import tensorflow as tf
import random
import glob
allWords = []
allDocuments = []
allLabels = []
def readFile(fileName, allWords):
    # Collect every lower-cased, punctuation-stripped word of the file into allWords.
    file = codecs.open(fileName, encoding='utf-8')
    for line in file:
        line = line.lower().encode('utf-8')
        words = line.split()
        for word in words:
            word = word.translate(None, string.punctuation)
            if word != '':
                allWords.append(word)
    file.close()
def readFileToConvertWordsToIntegers(dictionary, fileName, allDocuments, allLabels, label):
    # Map every word of the file to its vocabulary index (0 = 'UNK'),
    # then append the whole file as one document together with its label.
    file = codecs.open(fileName, encoding='utf-8')
    document = []
    for line in file:
        line = line.lower().encode('utf-8')
        words = line.split()
        for word in words:
            word = word.translate(None, string.punctuation)
            if word in dictionary:
                index = dictionary[word]
            else:
                index = 0  # dictionary['UNK']
            document.append(index)
    allDocuments.append(document)
    allLabels.append(label)
    file.close()
vocabulary_size = 10000
def build_dataset(words):
    # Keep the (vocabulary_size - 1) most common words; everything else maps to 'UNK' (index 0).
    count = [['UNK', -1]]
    count.extend(collections.Counter(words).most_common(vocabulary_size - 1))
    dictionary = dict()
    for word, _ in count:
        dictionary[word] = len(dictionary)
    data = list()
    unk_count = 0
    for word in words:
        if word in dictionary:
            index = dictionary[word]
        else:
            index = 0  # dictionary['UNK']
            unk_count = unk_count + 1
        data.append(index)
    count[0][1] = unk_count
    reverse_dictionary = dict(zip(dictionary.values(), dictionary.keys()))
    return dictionary, reverse_dictionary
fileList = glob.glob("/Users/inanc/Desktop/aclImdb/train/neg/*.txt")
for file in fileList:
    readFile(file, allWords)
fileList = glob.glob("/Users/inanc/Desktop/aclImdb/test/train/*.txt")
for file in fileList:
    readFile(file, allWords)
print(len(allWords))
dictionary, reverse_dictionary = build_dataset(allWords)
del allWords # Hint to reduce memory.
print(len(dictionary))
fileList = glob.glob("/Users/inanc/Desktop/aclImdb/train/neg/*.txt")
for file in fileList:
    readFileToConvertWordsToIntegers(dictionary, file, allDocuments, allLabels, 0)
fileList = glob.glob("/Users/inanc/Desktop/aclImdb/train/pos/*.txt")
for file in fileList:
    readFileToConvertWordsToIntegers(dictionary, file, allDocuments, allLabels, 1)
print(len(allDocuments))
print(len(allLabels))
c = list(zip(allDocuments, allLabels))  # shuffle documents and labels together before splitting
random.shuffle(c)
allDocuments, allLabels = zip(*c)
trainX = allDocuments[:22500]
testX = allDocuments[22500:]
trainY = allLabels[:22500]
testY = allLabels[22500:]
#counter=collections.Counter(trainY)
#print(counter)
trainY = to_categorical(trainY, nb_classes=2)
testY = to_categorical(testY, nb_classes=2)
trainX = pad_sequences(trainX, maxlen=100, value=0.)
testX = pad_sequences(testX, maxlen=100, value=0.)
# Converting labels to binary vectors
trainY = to_categorical(trainY, nb_classes=2)
testY = to_categorical(testY, nb_classes=2)
# Network building
net = tflearn.input_data([None, 100])
net = tflearn.embedding(net, input_dim=vocabulary_size, output_dim=128)
net = tflearn.lstm(net, 128, dropout=0.8)
net = tflearn.fully_connected(net, 2, activation='softmax')
net = tflearn.regression(net, optimizer='adam', learning_rate=0.001,
loss='categorical_crossentropy')
# Training
model = tflearn.DNN(net, tensorboard_verbose=0)
model.fit(trainX, trainY, validation_set=(testX, testY), show_metric=True,
batch_size=32)
predictions = model.predict(trainX)
print(predictions)
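A quick sanity check that could be dropped in right before model.fit, to confirm what actually goes into training (a minimal sketch; it reuses the numpy import, nm, and the trainX/trainY/testX/testY variables built above):
print(nm.asarray(trainX).shape)  # expected (22500, 100): padded index sequences
print(nm.asarray(trainY).shape)  # expected (22500, 2): one-hot labels
print(nm.asarray(trainY)[:3])    # each row should be [1. 0.] or [0. 1.]
print(nm.asarray(testX).shape)   # expected (2500, 100)
print(nm.asarray(testY).shape)   # expected (2500, 2)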
Results (one line per epoch, with the duplicated progress-bar output removed):
--
Training Step: 704  | total loss: 1.38629 | Adam | epoch: 001 | loss: 1.38629 - acc: 0.4698 | val_loss: 1.38629 - val_acc: 0.4925 -- iter: 22500/22500
--
Training Step: 1408 | total loss: 1.38629 | Adam | epoch: 002 | loss: 1.38629 - acc: 0.8110 | val_loss: 1.38629 - val_acc: 0.9984 -- iter: 22500/22500
--
Training Step: 2112 | total loss: 1.38629 | Adam | epoch: 003 | loss: 1.38629 - acc: 0.6303 | val_loss: 1.38629 - val_acc: 0.7382 -- iter: 22500/22500
--
Training Step: 2816 | total loss: 1.38629 | Adam | epoch: 004 | loss: 1.38629 - acc: 0.5489 | val_loss: 1.38629 - val_acc: 0.2904 -- iter: 22500/22500
--
Training Step: 3520 | total loss: 1.38629 | Adam | epoch: 005 | loss: 1.38629 - acc: 0.4848 | val_loss: 1.38629 - val_acc: 0.7828 -- iter: 22500/22500
--
Training Step: 4224 | total loss: 1.38629 | Adam | epoch: 006 | loss: 1.38629 - acc: 0.5233 | val_loss: 1.38629 - val_acc: 0.9654 -- iter: 22500/22500
--
Training Step: 4928 | total loss: 1.38629 | Adam | epoch: 007 | loss: 1.38629 - acc: 0.4400 | val_loss: 1.38629 - val_acc: 0.6725 -- iter: 22500/22500
--
Training Step: 5632 | total loss: 1.38629 | Adam | epoch: 008 | loss: 1.38629 - acc: 0.4319 | val_loss: 1.38629 - val_acc: 0.5808 -- iter: 22500/22500
--
Training Step: 6336 | total loss: 1.38629 | Adam | epoch: 009 | loss: 1.38629 - acc: 0.4765 | val_loss: 1.38629 - val_acc: 0.4833 -- iter: 22500/22500
--
Training Step: 7040 | total loss: 1.38629 | Adam | epoch: 010 | loss: 1.38629 - acc: 0.5203 | val_loss: 1.38629 - val_acc: 0.2373 -- iter: 22500/22500
Ohh, it's my bad. I typed the lines
trainY = to_categorical(trainY, nb_classes=2)
testY = to_categorical(testY, nb_classes=2)
twice, so the labels effectively ended up with only one category afterwards. After I removed the duplicated lines, the problem was solved.
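For completeness, this is how the label preparation looks after the fix, with the one-hot conversion done exactly once (a minimal sketch of the changed lines only; everything else in the script stays the same). Incidentally, the flat training loss of 1.38629 ≈ 2*ln(2) is exactly the categorical cross-entropy of a uniform [0.5, 0.5] prediction against a label row whose two entries are both 1, which is consistent with the duplicated encoding corrupting the label rows (the exact effect depends on the tflearn version).
trainX = pad_sequences(trainX, maxlen=100, value=0.)
testX = pad_sequences(testX, maxlen=100, value=0.)
# Converting labels to binary vectors -- done a single time
trainY = to_categorical(trainY, nb_classes=2)
testY = to_categorical(testY, nb_classes=2)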