Tags: python, pytorch, lstm, recurrent-neural-network

Understanding LSTM with a simple dataset


I wanted to make sure I understand LSTMs, so I implemented a dummy example using the PyTorch framework. As input, I use sequences of 10 consecutive numbers, and the value to predict is always the last number of the sequence + 1. For instance:
x = [6, 7, 8, 9, 10, 11, 12, 13, 14, 15]
y = 16

Since it's a very simple forecasting task, I expected the model to work well, but I observe very poor performance: the model predicts a constant value per batch that keeps increasing during training.

I am wondering what I am missing. Below is the code I've written; any help would be highly appreciated.

from torch.utils.data import Dataset, TensorDataset, DataLoader, RandomSampler, SequentialSampler
import torch.nn as nn
import torch

class MyDataset(Dataset):

    def __init__(self):
        pass

    def __getitem__(self, index):
        x = torch.arange(index - 9, index + 1)  # the 10 consecutive integers ending at index
        y = torch.tensor(index + 1)
        return x,y

    def __len__(self):
        return 1000
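
# Sanity check: a single sample matches the example above.
print(MyDataset()[15])
# >>> (tensor([ 6,  7,  8,  9, 10, 11, 12, 13, 14, 15]), tensor(16))
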
class LSTM(nn.Module):
    def __init__(self, hidden_layer_size=1, batch_size = 1):

        super().__init__()
        self.hidden_layer_size = hidden_layer_size
        self.batch_size = batch_size 
        self.lstm = nn.LSTM(1, hidden_layer_size)
        self.linear = nn.Linear(10, 1)
        self.hidden_cell = (torch.zeros(1,self.batch_size,self.hidden_layer_size),
                            torch.zeros(1,self.batch_size,self.hidden_layer_size))

    def forward(self, input_seq):

        lstm_out, self.hidden_cell = self.lstm(input_seq.view(10 ,self.batch_size, -1), self.hidden_cell)
        predictions = self.linear(lstm_out.squeeze().T)
        return predictions

batch_size = 32
epochs = 1000

train = MyDataset()
sampler = RandomSampler(train)
train_dataloader = DataLoader(train, sampler=sampler, batch_size= batch_size , drop_last = True)

model = LSTM(batch_size = batch_size)
loss_function = nn.MSELoss()
optimizer = torch.optim.Adam(model.parameters(), lr=0.001)

for e in range(epochs):
    for step, batch in enumerate(train_dataloader) :

        seq, labels = batch
        optimizer.zero_grad()

        model.hidden_cell = (torch.zeros(1, batch_size, model.hidden_layer_size),
                             torch.zeros(1, batch_size, model.hidden_layer_size))

        y_pred = model(seq.float())

        print(y_pred)

        single_loss = loss_function(y_pred, labels.float())
        single_loss.backward()
        optimizer.step()

Solution

  • There are multiple issues in your forward function. Take a look at the input that you are passing to the LSTM:

    input_seq = input_seq.view(10 ,self.batch_size, -1)
    print(input_seq[:, 0])
    
    >>> tensor([[168.],
                [ 21.],
                [450.],
                [436.],
                [789.],
                [941.],
                [ -7.],
                [811.],
                [789.],
                [992.]])
    

    This is a series of random numbers, not one of your input sequences: view(10, self.batch_size, -1) reads the contiguous [batch_size, 10] memory row by row instead of swapping the axes, so every time step mixes values from different samples. You either have to transpose the input_seq or, even better, pass batch_first=True to the LSTM constructor and just unsqueeze the input_seq before passing it to the LSTM.
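
    For example, with batch_first=True a single unsqueeze is enough (input_seq here is the raw [batch_size, 10] batch from the DataLoader), and each slice along the batch dimension is then one coherent sequence; values shown for the sample x = [6, ..., 15] from the question:

    input_seq = input_seq.unsqueeze(2)  # [batch_size, 10, 1] = [batch, seq_len, input_size]
    print(input_seq[0, :, 0])

    >>> tensor([ 6.,  7.,  8.,  9., 10., 11., 12., 13., 14., 15.])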

    You also have to update the handling of lstm_out; with batch_first=True, the only operation needed is reshaping it to [batch_size, 10 * hidden_size] before the linear layer.

    Lastly, you need to squeeze the output of the linear layer from [batch_size, 1] to [batch_size] so that it matches the shape of the labels passed to MSELoss.

    Aside from those, the hidden size of the LSTM is too small; use 10 (or even 100) instead of 1. Only then does the model converge within 1000 epochs. Here is the updated code:

    class LSTM(nn.Module):
        def __init__(self, hidden_layer_size=100, batch_size=1):
            super().__init__()
            self.hidden_layer_size = hidden_layer_size
            self.batch_size = batch_size
            self.lstm = nn.LSTM(1, hidden_layer_size, batch_first=True)
            self.linear = nn.Linear(10 * hidden_layer_size, 1)
            self.hidden_cell = (torch.zeros(1, self.batch_size, self.hidden_layer_size),
                                torch.zeros(1, self.batch_size, self.hidden_layer_size))

        def forward(self, input_seq):
            batch_size = input_seq.size(0)
            # [batch_size, 10] -> [batch_size, 10, 1] = [batch, seq_len, input_size]
            input_seq = input_seq.unsqueeze(2)
            lstm_out, self.hidden_cell = self.lstm(input_seq, self.hidden_cell)
            # [batch_size, 10, hidden_size] -> [batch_size, 10 * hidden_size]
            lstm_out = lstm_out.reshape(batch_size, -1)
            # [batch_size, 1] -> [batch_size]
            predictions = self.linear(lstm_out).squeeze()
            return predictions
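
    The training loop from the question then works essentially unchanged; the per-step reset of the hidden state just has to use the actual batch size. A minimal sketch, reusing train_dataloader, loss_function and optimizer from the question, followed by a quick sanity check on the example sequence:

    for e in range(epochs):
        for seq, labels in train_dataloader:
            optimizer.zero_grad()
            # reset the hidden state for the current batch
            model.hidden_cell = (torch.zeros(1, seq.size(0), model.hidden_layer_size),
                                 torch.zeros(1, seq.size(0), model.hidden_layer_size))
            y_pred = model(seq.float())
            single_loss = loss_function(y_pred, labels.float())
            single_loss.backward()
            optimizer.step()

    # After training: the prediction for [6, ..., 15] should approach 16.
    with torch.no_grad():
        model.hidden_cell = (torch.zeros(1, 1, model.hidden_layer_size),
                             torch.zeros(1, 1, model.hidden_layer_size))
        x = torch.arange(6, 16).float().unsqueeze(0)  # shape [1, 10]
        print(model(x))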