I am having a problem on an implementation of LSTM. I am not sure if I have the right implementation or this is just an overfitting problem. I am doing essay grading using a LSTM, scoring text with score from 0 - 10 (or other range of score). I am using the ASAP kaggle competition data as one of the training data.
However, the main goal is to achieve good performance on a private dataset, with around 500 samples. The 500 samples includes validation and training set. I have previously done some experiment and got the model to work, but after fiddling with something, the model doesn't fit anymore. The model does not improve at all. I have also re-implemented the code in a cleaner manner with much more obejct oriented code and still can't reproduce my previous result.
However, I am getting the model to fit to my data, just there is tremendous overfitting. I am not sure if this is an implementation problem of some sort or just overfitting, but I cannot get the model to work. The maximum I can get it to is 0.35 kappa using LSTM on the ASAP data essay set 1. For some bizarre reason, I can get a single layer fully connected model to have 0.75 kappa. I think this is an implementation problem but I am not sure.
Here is my old code:
import gensim
import numpy as np
import pandas as pd
import torch
from sklearn.metrics import cohen_kappa_score
from torch import nn
import torch.utils.data as data_utils
from torch.optim import Adam
from dataset import AESDataset
from network import Network
from optimizer import Ranger
from qwk import quadratic_weighted_kappa, kappa
batch_size = 32
device = "cuda:0"
torch.manual_seed(1000)
# Load data from csv
file_name = "data/data_new.csv"
data = pd.read_csv(file_name)
arr = data.to_numpy()
text = arr[:, :2]
text = [str(line[0]) + str(line[1]) for line in text]
text = [gensim.utils.simple_preprocess(line) for line in text]
score = arr[:,2]
score = [sco*6 for sco in score]
score = np.asarray(score, dtype=int)
train_dataset = AESDataset(text_arr=text[:400], scores=score[:400])
test_dataset = AESDataset(text_arr=text[400:], scores=score[400:])
score = torch.tensor(score).view(-1,1).long().to(device)
train_loader = data_utils.DataLoader(train_dataset,shuffle=True, batch_size=batch_size, drop_last=True)
test_loader = data_utils.DataLoader(test_dataset,shuffle=True,batch_size=batch_size, drop_last=True)
out_class = 61
epochs = 1000
model = Network(out_class).to(device)
model.load_state_dict(torch.load("model/best_model"))
y_onehot = torch.FloatTensor(batch_size, out_class).to(device)
optimizer = Adam(model.parameters())
criti = torch.nn.CrossEntropyLoss()
# model, optimizer = amp.initialize(model, optimizer, opt_level="O2")
step = 0
for i in range(epochs):
#Testing
if i % 1 == 0:
total_loss = 0
total_kappa = 0
total_batches = 0
model.eval()
for (text, score) in test_loader:
out = model(text)
out_score = torch.argmax(out, 1)
y_onehot.zero_()
y_onehot.scatter_(1, score, 1)
kappa_l = cohen_kappa_score(score.view(batch_size).tolist(), out_score.view(batch_size).tolist())
score = score.view(-1)
loss = criti(out, score.view(-1))
total_loss += loss
total_kappa += kappa_l
total_batches += 1
print(f"Epoch {i} Testing kappa {total_kappa/total_batches} loss {total_loss/total_batches}")
with open(f"model/epoch_{i}", "wb") as f:
torch.save(model.state_dict(),f)
model.train()
#Training
for (text, score) in train_loader:
optimizer.zero_grad()
step += 1
out = model(text)
out_score = torch.argmax(out,1)
y_onehot.zero_()
y_onehot.scatter_(1, score, 1)
kappa_l = cohen_kappa_score(score.view(batch_size).tolist(),out_score.view(batch_size).tolist())
loss = criti(out, score.view(-1))
print(f"Epoch {i} step {step} kappa {kappa_l} loss {loss}")
loss.backward()
optimizer.step()
import gensim
import torch
import numpy as np
class AESDataset(torch.utils.data.Dataset):
def __init__(self, text_arr, scores):
self.data = text_arr
self.scores = scores
self.w2v_model = ("w2vec_model_all")
self.max_len = 500
def __getitem__(self, item):
vector = []
essay = self.data[item]
pad_vec = [1 for i in range(300)]
for i in range(self.max_len - len(essay)):
vector.append(pad_vec)
for word in essay:
word_vec = pad_vec
try:
word_vec = self.w2v_model[word]
except:
#print(f"Skipping word as word {word} not in dictionary")
word_vec = pad_vec
vector.append(word_vec)
#print(len(vector))
vector = np.stack(vector)
tensor = torch.tensor(vector[:self.max_len]).float().to("cuda")
score = self.scores[item]
score = torch.tensor(score).long().to("cuda").view(1)
return tensor, score
def __len__(self):
return len(self.scores)
import torch.nn as nn
import torch
import torch.nn.functional as F
class Network(nn.Module):
def __init__(self, output_size):
super(Network, self).__init__()
self.lstm = nn.LSTM(300,500,1, batch_first=True)
self.dropout = nn.Dropout(p=0.5)
#self.l2 = nn.L2
self.linear = nn.Linear(500,output_size)
def forward(self,x):
x, _ = self.lstm(x)
x = x[:,-1,:]
x = self.dropout(x)
x = self.linear(x)
return x
My new code: https://github.com/Clement-Hui/EssayGrading
I think the problem is in the training code since you are using LSTM you are supposed to flush down the hidden and cell state after every epoch and detach it from the computation graph after each batch.
network.py
import torch.nn as nn
import torch
import torch.nn.functional as F
class Network(nn.Module):
def __init__(self, output_size):
super(Network, self).__init__()
self.lstm = nn.LSTM(300,500,1, batch_first=True)
self.dropout = nn.Dropout(p=0.5)
#self.l2 = nn.L2
self.linear = nn.Linear(500,output_size)
def forward(self,x,hidden):
x, hidden = self.lstm(x,hidden)
x = x.contiguous().view(-1, 500)
x = self.dropout(x)
x = self.linear(x)
return x , hidden
def init_hidden(self,batch_size):
weights = next(self.parameters()).data
hidden = (weights.new(1 , batch_size,500).zero_().cuda(),
weights.new(1 , batch_size,500).zero_().cuda())
return hidden
train.py
# your code for intializing the model and data and all other stuff
for i in range(epochs):
#Testing
if i % 1 == 0:
total_loss = 0
total_kappa = 0
total_batches = 0
model.eval()
val_h = model.init_hidden(batch_size) # intialize the hidden state
for (text, score) in test_loader:
# Creating new variables for the hidden state, otherwise
# we'd backprop through the entire training history
val_h = tuple([each.data for each in val_h])
out , val_h = model(text,val_h)
out_score = torch.argmax(out, 1)
y_onehot.zero_()
y_onehot.scatter_(1, score, 1)
kappa_l = cohen_kappa_score(score.view(batch_size).tolist(), out_score.view(batch_size).tolist())
score = score.view(-1)
loss = criti(out, score.view(-1))
total_loss += loss
total_kappa += kappa_l
total_batches += 1
print(f"Epoch {i} Testing kappa {total_kappa/total_batches} loss {total_loss/total_batches}")
with open(f"model/epoch_{i}", "wb") as f:
torch.save(model.state_dict(),f)
model.train()
#Training
h = model.init_hidden(batch_size) # intialize the hidden state
for (text, score) in train_loader:
optimizer.zero_grad()
step += 1
# Creating new variables for the hidden state, otherwise
# we'd backprop through the entire training history
h = tuple([each.data for each in h])
out , h = model(text,h)
out_score = torch.argmax(out,1)
y_onehot.zero_()
y_onehot.scatter_(1, score, 1)
kappa_l = cohen_kappa_score(score.view(batch_size).tolist(),out_score.view(batch_size).tolist())
loss = criti(out, score.view(-1))
print(f"Epoch {i} step {step} kappa {kappa_l} loss {loss}")
loss.backward()
optimizer.step()
Do let me know if the changes mentioned works or not.