python artificial-intelligence pytorch lstm data-science

LSTM implementation / overfitting

I am having a problem on an implementation of LSTM. I am not sure if I have the right implementation or this is just an overfitting problem. I am doing essay grading using a LSTM, scoring text with score from 0 - 10 (or other range of score). I am using the ASAP kaggle competition data as one of the training data.

However, the main goal is to achieve good performance on a private dataset, with around 500 samples. The 500 samples includes validation and training set. I have previously done some experiment and got the model to work, but after fiddling with something, the model doesn't fit anymore. The model does not improve at all. I have also re-implemented the code in a cleaner manner with much more obejct oriented code and still can't reproduce my previous result.

However, I am getting the model to fit to my data, just there is tremendous overfitting. I am not sure if this is an implementation problem of some sort or just overfitting, but I cannot get the model to work. The maximum I can get it to is 0.35 kappa using LSTM on the ASAP data essay set 1. For some bizarre reason, I can get a single layer fully connected model to have 0.75 kappa. I think this is an implementation problem but I am not sure.

Here is my old code:

train.py

import gensim
import numpy as np
import pandas as pd
import torch
from sklearn.metrics import cohen_kappa_score
from torch import nn
import torch.utils.data as data_utils
from torch.optim import Adam

from dataset import AESDataset
from network import Network

from optimizer import Ranger
from qwk import quadratic_weighted_kappa, kappa

batch_size = 32

device = "cuda:0"
torch.manual_seed(1000)
# Load data from csv
file_name = "data/data_new.csv"
data = pd.read_csv(file_name)
arr = data.to_numpy()
text = arr[:, :2]
text = [str(line[0]) + str(line[1]) for line in text]
text = [gensim.utils.simple_preprocess(line) for line in text]

score = arr[:,2]

score = [sco*6 for sco in score]
score = np.asarray(score, dtype=int)


train_dataset = AESDataset(text_arr=text[:400], scores=score[:400])
test_dataset = AESDataset(text_arr=text[400:], scores=score[400:])

score = torch.tensor(score).view(-1,1).long().to(device)


train_loader = data_utils.DataLoader(train_dataset,shuffle=True, batch_size=batch_size, drop_last=True)
test_loader = data_utils.DataLoader(test_dataset,shuffle=True,batch_size=batch_size, drop_last=True)

out_class = 61

epochs = 1000

model = Network(out_class).to(device)
model.load_state_dict(torch.load("model/best_model"))
y_onehot = torch.FloatTensor(batch_size, out_class).to(device)
optimizer = Adam(model.parameters())
criti = torch.nn.CrossEntropyLoss()
# model, optimizer = amp.initialize(model, optimizer, opt_level="O2")


step = 0

for i in range(epochs):
    #Testing
    if i % 1 == 0:
        total_loss = 0
        total_kappa = 0
        total_batches = 0
        model.eval()
        for (text, score) in test_loader:

            out = model(text)
            out_score = torch.argmax(out, 1)
            y_onehot.zero_()
            y_onehot.scatter_(1, score, 1)
            kappa_l = cohen_kappa_score(score.view(batch_size).tolist(), out_score.view(batch_size).tolist())
            score = score.view(-1)
            loss = criti(out, score.view(-1))
            total_loss += loss
            total_kappa += kappa_l
            total_batches += 1
        print(f"Epoch {i} Testing kappa {total_kappa/total_batches} loss {total_loss/total_batches}")
        with open(f"model/epoch_{i}", "wb") as f:
            torch.save(model.state_dict(),f)
        model.train()
    #Training

    for (text, score) in train_loader:

        optimizer.zero_grad()
        step += 1
        out = model(text)
        out_score = torch.argmax(out,1)
        y_onehot.zero_()
        y_onehot.scatter_(1, score, 1)
        kappa_l = cohen_kappa_score(score.view(batch_size).tolist(),out_score.view(batch_size).tolist())
        loss = criti(out, score.view(-1))
        print(f"Epoch {i} step {step} kappa {kappa_l} loss {loss}")
        loss.backward()
        optimizer.step()

dataset.py

import gensim
import torch
import numpy as np

class AESDataset(torch.utils.data.Dataset):
    def __init__(self, text_arr, scores):
        self.data = text_arr
        self.scores = scores
        self.w2v_model = ("w2vec_model_all")
        self.max_len = 500
    def __getitem__(self, item):
        vector = []
        essay = self.data[item]

        pad_vec = [1 for i in range(300)]
        for i in range(self.max_len - len(essay)):
            vector.append(pad_vec)
        for word in essay:
            word_vec = pad_vec
            try:
                word_vec = self.w2v_model[word]
            except:
                #print(f"Skipping word as word {word} not in dictionary")
                word_vec = pad_vec


            vector.append(word_vec)
        #print(len(vector))
        vector = np.stack(vector)
        tensor = torch.tensor(vector[:self.max_len]).float().to("cuda")
        score = self.scores[item]
        score = torch.tensor(score).long().to("cuda").view(1)

        return tensor, score

    def __len__(self):
        return len(self.scores)

network.py

import torch.nn as nn
import torch

import torch.nn.functional as F

class Network(nn.Module):
    def __init__(self, output_size):
        super(Network, self).__init__()
        self.lstm = nn.LSTM(300,500,1, batch_first=True)
        self.dropout = nn.Dropout(p=0.5)
        #self.l2 = nn.L2
        self.linear = nn.Linear(500,output_size)





    def forward(self,x):
        x, _ = self.lstm(x)
        x = x[:,-1,:]
        x = self.dropout(x)
        x = self.linear(x)


        return x

My new code: https://github.com/Clement-Hui/EssayGrading

Solution

I think the problem is in the training code since you are using LSTM you are supposed to flush down the hidden and cell state after every epoch and detach it from the computation graph after each batch.

network.py

import torch.nn as nn
import torch

import torch.nn.functional as F

class Network(nn.Module):
    def __init__(self, output_size):
        super(Network, self).__init__()
        self.lstm = nn.LSTM(300,500,1, batch_first=True)
        self.dropout = nn.Dropout(p=0.5)
        #self.l2 = nn.L2
        self.linear = nn.Linear(500,output_size)

    def forward(self,x,hidden):
        x, hidden = self.lstm(x,hidden)
        x = x.contiguous().view(-1, 500)
        x = self.dropout(x)
        x = self.linear(x)
        return x , hidden

    def init_hidden(self,batch_size):
        weights = next(self.parameters()).data
        hidden = (weights.new(1 , batch_size,500).zero_().cuda(),
                  weights.new(1 , batch_size,500).zero_().cuda())
        return hidden

train.py

# your code for intializing the model and data and all other stuff
for i in range(epochs):

    #Testing
    if i % 1 == 0:
        total_loss = 0
        total_kappa = 0
        total_batches = 0
        model.eval()
        val_h  = model.init_hidden(batch_size) # intialize the hidden state
        for (text, score) in test_loader:
           # Creating new variables for the hidden state, otherwise
           # we'd backprop through the entire training history
            val_h = tuple([each.data for each in val_h]) 
            out ,  val_h  = model(text,val_h)
            out_score = torch.argmax(out, 1)
            y_onehot.zero_()
            y_onehot.scatter_(1, score, 1)
            kappa_l = cohen_kappa_score(score.view(batch_size).tolist(), out_score.view(batch_size).tolist())
            score = score.view(-1)
            loss = criti(out, score.view(-1))
            total_loss += loss
            total_kappa += kappa_l
            total_batches += 1
        print(f"Epoch {i} Testing kappa {total_kappa/total_batches} loss {total_loss/total_batches}")
        with open(f"model/epoch_{i}", "wb") as f:
            torch.save(model.state_dict(),f)
    model.train()

    #Training
    h =  model.init_hidden(batch_size) # intialize the hidden state
    for (text, score) in train_loader:
        optimizer.zero_grad()
        step += 1
        # Creating new variables for the hidden state, otherwise
        # we'd backprop through the entire training history
        h = tuple([each.data for each in h])
        out , h  = model(text,h)
        out_score = torch.argmax(out,1)
        y_onehot.zero_()
        y_onehot.scatter_(1, score, 1)
        kappa_l = cohen_kappa_score(score.view(batch_size).tolist(),out_score.view(batch_size).tolist())
        loss = criti(out, score.view(-1))
        print(f"Epoch {i} step {step} kappa {kappa_l} loss {loss}")
        loss.backward()
        optimizer.step()

Do let me know if the changes mentioned works or not.