python pytorch lstm recurrent-neural-network

Expected hidden[0] size (2, 8, 256), got [8, 256]

I have correct shape of hidden layer for printing as below.

print(h0.shape)
print(x.shape)

torch.Size([2, 8, 256])
torch.Size([8, 300, 300])

But I still have error as Expected hidden[0] size (2, 8, 256), got [8, 256]

What could be wrong?

The whole code is as follows.

import torch
import torch.nn as nn
import torchvision
import matplotlib.pyplot as plt
import torchvision.transforms as tt
from torchvision.datasets import ImageFolder
from PIL import Image
import numpy as np
from torch.autograd import Variable

seq_len = input_size
hidden_size = 256 #size of hidden layers
num_classes = 5
num_epochs = 20
batch_size = 8
learning_rate = 0.001
# Fully connected neural network with one hidden layer
num_layers = 2 # 2 RNN layers are stacked  
class LSTM(nn.Module):
    def __init__(self, input_size, hidden_size, num_layers, num_classes):
        super(LSTM, self).__init__()
        self.num_layers = num_layers
        self.hidden_size = hidden_size
        self.lstm = nn.LSTM(input_size, hidden_size, num_layers, batch_first=True)#batch must have first dimension
        #our inpyt needs to have shape
        #x -> (batch_size, seq, input_size)
        self.fc = nn.Linear(hidden_size, num_classes)#this fc is after RNN. So needs the last hidden size of RNN

    def forward(self, x):
        #according to ducumentation of RNN in pytorch
        #rnn needs input, h_0 for inputs at RNN (h_0 is initial hidden state)

        #the following one is initial hidden layer
        h0 = torch.zeros(self.num_layers, x.size(0), self.hidden_size).to(device)#first one is number of layers and second one is batch size
        #output has two outputs. The first tensor contains the output features of the hidden last layer for all time steps
        #the second one is hidden state f
        print(h0.shape)
        print(x.shape)
        out, _ = self.lstm(x, h0)
        print(out.shape)
        #output has batch_size, seq_len, hidden size
        #we need to decode hidden state only the last time step
        #out (N, 30, 128)
        #Since we need only the last time step
        #Out (N, 128)
        out = out[:, -1, :] #-1 for last time step, take all for N and 128
        out = self.fc(out)
        return out
        
        

stacked_lstm_model = LSTM(input_size, hidden_size, num_layers, num_classes).to(device)

# Loss and optimizer
criterion = nn.CrossEntropyLoss()#cross entropy has softmax at output
optimizer = torch.optim.Adam(stacked_lstm_model.parameters(), lr=learning_rate) #optimizer used gradient optimization using Adam 

# Train the model
n_total_steps = len(train_dl)
for epoch in range(num_epochs):
    t_losses=[]
    for i, (images, labels) in enumerate(train_dl):  
        # origin shape: [8, 1, 300, 300]
        # resized: [8, 300, 300]
        images = images.reshape(-1, seq_len, input_size).to(device)
        labels = labels.to(device)
        
        # Forward pass
        outputs = stacked_lstm_model(images)
        loss = criterion(outputs, labels)
        t_losses.append(loss)
        # Backward and optimize
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        
        if (i+1) % 100 == 0:
            print (f'Epoch [{epoch+1}/{num_epochs}], Step [{i+1}/{n_total_steps}], Loss: {loss.item():.4f}')
    avgd_trainloss = sum(t_losses)/len(t_losses)
    acc=0
    v_losses=[]
    with torch.no_grad():
      n_correct = 0
      n_samples = 0      
      for v_images, v_labels in valid_dl:
          v_images = v_images.reshape(-1, seq_len, input_size).to(device)
          v_labels = v_labels.to(device)
          v_outputs = stacked_lstm_model(v_images)
          v_loss = criterion(v_outputs, v_labels)
          v_losses.append(v_loss)
          # max returns (value ,index)
          _, v_predicted = torch.max(v_outputs.data, 1)
          n_samples += v_labels.size(0)
          n_correct += (v_predicted == v_labels).sum().item()

      acc = 100.0 * n_correct / n_samples
    avgd_validloss = sum(v_losses)/len(v_losses)
    print (f'Epoch [{epoch+1}/{num_epochs}], Train loss: {avgd_trainloss.item():.4f}, Valid loss: {avgd_validloss.item():.4f}, Valid accu: {acc.item():.2f}')

# Test the model
# In test phase, we don't need to compute gradients (for memory efficiency)
with torch.no_grad():
    n_correct = 0
    n_samples = 0
    for images, labels in test_dl:
        images = images.reshape(-1, seq_len, input_size).to(device)
        labels = labels.to(device)
        outputs = stacked_lstm_model(images)
        # max returns (value ,index)
        _, predicted = torch.max(outputs.data, 1)
        n_samples += labels.size(0)
        n_correct += (predicted == labels).sum().item()

    acc = 100.0 * n_correct / n_samples
    print(f'Accuracy of the network on test images: {acc} %')

Solution

The LSTM requires two hidden states, not one. So instead of

h0 = torch.zeros(self.num_layers, x.size(0), self.hidden_size).to(device)

use

h0 = (torch.zeros(self.num_layers, x.size(0), self.hidden_size).to(device), torch.zeros(self.num_layers, x.size(0), self.hidden_size).to(device))

So you need two hidden states in a tuple.