I have correct shape of hidden layer for printing as below.
print(h0.shape)
print(x.shape)
torch.Size([2, 8, 256])
torch.Size([8, 300, 300])
But I still have error as Expected hidden[0] size (2, 8, 256), got [8, 256]
What could be wrong?
The whole code is as follows.
import torch
import torch.nn as nn
import torchvision
import matplotlib.pyplot as plt
import torchvision.transforms as tt
from torchvision.datasets import ImageFolder
from PIL import Image
import numpy as np
from torch.autograd import Variable
seq_len = input_size
hidden_size = 256 #size of hidden layers
num_classes = 5
num_epochs = 20
batch_size = 8
learning_rate = 0.001
# Fully connected neural network with one hidden layer
num_layers = 2 # 2 RNN layers are stacked
class LSTM(nn.Module):
def __init__(self, input_size, hidden_size, num_layers, num_classes):
super(LSTM, self).__init__()
self.num_layers = num_layers
self.hidden_size = hidden_size
self.lstm = nn.LSTM(input_size, hidden_size, num_layers, batch_first=True)#batch must have first dimension
#our inpyt needs to have shape
#x -> (batch_size, seq, input_size)
self.fc = nn.Linear(hidden_size, num_classes)#this fc is after RNN. So needs the last hidden size of RNN
def forward(self, x):
#according to ducumentation of RNN in pytorch
#rnn needs input, h_0 for inputs at RNN (h_0 is initial hidden state)
#the following one is initial hidden layer
h0 = torch.zeros(self.num_layers, x.size(0), self.hidden_size).to(device)#first one is number of layers and second one is batch size
#output has two outputs. The first tensor contains the output features of the hidden last layer for all time steps
#the second one is hidden state f
print(h0.shape)
print(x.shape)
out, _ = self.lstm(x, h0)
print(out.shape)
#output has batch_size, seq_len, hidden size
#we need to decode hidden state only the last time step
#out (N, 30, 128)
#Since we need only the last time step
#Out (N, 128)
out = out[:, -1, :] #-1 for last time step, take all for N and 128
out = self.fc(out)
return out
stacked_lstm_model = LSTM(input_size, hidden_size, num_layers, num_classes).to(device)
# Loss and optimizer
criterion = nn.CrossEntropyLoss()#cross entropy has softmax at output
optimizer = torch.optim.Adam(stacked_lstm_model.parameters(), lr=learning_rate) #optimizer used gradient optimization using Adam
# Train the model
n_total_steps = len(train_dl)
for epoch in range(num_epochs):
t_losses=[]
for i, (images, labels) in enumerate(train_dl):
# origin shape: [8, 1, 300, 300]
# resized: [8, 300, 300]
images = images.reshape(-1, seq_len, input_size).to(device)
labels = labels.to(device)
# Forward pass
outputs = stacked_lstm_model(images)
loss = criterion(outputs, labels)
t_losses.append(loss)
# Backward and optimize
optimizer.zero_grad()
loss.backward()
optimizer.step()
if (i+1) % 100 == 0:
print (f'Epoch [{epoch+1}/{num_epochs}], Step [{i+1}/{n_total_steps}], Loss: {loss.item():.4f}')
avgd_trainloss = sum(t_losses)/len(t_losses)
acc=0
v_losses=[]
with torch.no_grad():
n_correct = 0
n_samples = 0
for v_images, v_labels in valid_dl:
v_images = v_images.reshape(-1, seq_len, input_size).to(device)
v_labels = v_labels.to(device)
v_outputs = stacked_lstm_model(v_images)
v_loss = criterion(v_outputs, v_labels)
v_losses.append(v_loss)
# max returns (value ,index)
_, v_predicted = torch.max(v_outputs.data, 1)
n_samples += v_labels.size(0)
n_correct += (v_predicted == v_labels).sum().item()
acc = 100.0 * n_correct / n_samples
avgd_validloss = sum(v_losses)/len(v_losses)
print (f'Epoch [{epoch+1}/{num_epochs}], Train loss: {avgd_trainloss.item():.4f}, Valid loss: {avgd_validloss.item():.4f}, Valid accu: {acc.item():.2f}')
# Test the model
# In test phase, we don't need to compute gradients (for memory efficiency)
with torch.no_grad():
n_correct = 0
n_samples = 0
for images, labels in test_dl:
images = images.reshape(-1, seq_len, input_size).to(device)
labels = labels.to(device)
outputs = stacked_lstm_model(images)
# max returns (value ,index)
_, predicted = torch.max(outputs.data, 1)
n_samples += labels.size(0)
n_correct += (predicted == labels).sum().item()
acc = 100.0 * n_correct / n_samples
print(f'Accuracy of the network on test images: {acc} %')
The LSTM requires two hidden states, not one. So instead of
h0 = torch.zeros(self.num_layers, x.size(0), self.hidden_size).to(device)
use
h0 = (torch.zeros(self.num_layers, x.size(0), self.hidden_size).to(device), torch.zeros(self.num_layers, x.size(0), self.hidden_size).to(device))
So you need two hidden states in a tuple.