I am having trouble reproducing this PyTorch tutorial.
The model they introduce is:
class RNN(nn.Module):
    def __init__(self, input_size, hidden_size, output_size):
        super(RNN, self).__init__()
        self.hidden_size = hidden_size
        self.i2h = nn.Linear(input_size + hidden_size, hidden_size)
        self.i2o = nn.Linear(input_size + hidden_size, output_size)
        self.softmax = nn.LogSoftmax(dim=1)

    def forward(self, input, hidden):
        combined = torch.cat((input, hidden), 1)
        hidden = self.i2h(combined)
        output = self.i2o(combined)
        output = self.softmax(output)
        return output, hidden

    def initHidden(self, batch_size):
        return torch.zeros(batch_size, self.hidden_size, dtype=torch.float32, requires_grad=True)
This model reproduces what happens inside a single RNN cell.
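For context, the tutorial applies this cell repeatedly over a sequence, feeding the hidden state from one step into the next. A rough sketch of that loop (seq_tensor, n_steps and batch_size are placeholder names of mine, not taken verbatim from the tutorial):

hidden = rnn.initHidden(batch_size)
for t in range(n_steps):
    # the hidden state returned at step t is fed back in at step t + 1
    output, hidden = rnn(seq_tensor[t], hidden)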
While coding, I ran into trouble with the gradients inside the model.
The code reproducing the issue is the following:
import torch
import torch.nn as nn
# Toy data to reproduce the issue
toy_data_batch = torch.tensor([[0, 1], [1, 0], [1, 0]], dtype=torch.float32)
toy_label_batch = torch.tensor([2, 0, 3], dtype=torch.long)
class RNN(nn.Module):
    def __init__(self, input_size, hidden_size, output_size):
        super(RNN, self).__init__()
        self.hidden_size = hidden_size
        self.i2h = nn.Linear(input_size + hidden_size, hidden_size)
        self.i2o = nn.Linear(input_size + hidden_size, output_size)
        self.softmax = nn.LogSoftmax(dim=1)

    def forward(self, input, hidden):
        combined = torch.cat((input, hidden), 1)
        hidden = self.i2h(combined)
        output = self.i2o(combined)
        output = self.softmax(output)
        return output, hidden

    def initHidden(self, batch_size):
        return torch.zeros(batch_size, self.hidden_size, dtype=torch.float32, requires_grad=True)
# Model initialization
input_size = 2
hidden_size = 2
output_size = 4 # Targets in {0, 1, 2, 3}
batch_size = 3 # 3 data points in the batch
learning_rate = 5e-3
rnn = RNN(input_size, hidden_size, output_size)
hidden = rnn.initHidden(batch_size)  # init hidden state with zeros
# Negative log likelihood, since this is a classification task
criterion = nn.NLLLoss()
# Forward pass
output, hidden = rnn(toy_data_batch, hidden)
#output, hidden = rnn(toy_data_batch, hidden)  ### BUG: if I uncomment this line, it works
# Loss computation
loss = criterion(output, toy_label_batch)
# Backward pass
loss.backward()
print(rnn.i2o.weight.grad) # This one is fine
print(rnn.i2h.weight.grad) # This one isn't (has type None)
# This will fail, because of the None gradient
for weight in rnn.parameters():
    weight.data.add_(weight.grad.data, alpha=-learning_rate)
The output is:
tensor([[-0.1892,  0.0462,  0.0000,  0.0000],
        [ 0.1274,  0.1133,  0.0000,  0.0000],
        [ 0.1455, -0.2525,  0.0000,  0.0000],
        [-0.0837,  0.0930,  0.0000,  0.0000]])
None
---------------------------------------------------------------------------
AttributeError Traceback (most recent call last)
<ipython-input-3-5f02113fddf6> in <module>
54 # This will fail, because of the None gradient
55 for weight in rnn.parameters():
---> 56 weight.data.add_(weight.grad.data, alpha=-learning_rate)
AttributeError: 'NoneType' object has no attribute 'data'
I've noticed that if I uncomment the line marked ### BUG,
output, hidden = rnn(toy_data_batch, hidden)
everything works without a problem. It seems to me that the problem comes from the initialization of the variable hidden. Since I've already set requires_grad=True on it, I don't know what else to try.
Thank you in advance; any help will be appreciated.
self.i2h has no gradient because it is not used in the first step of your model. When you backpropagate, your model only uses self.i2o in the first step, so self.i2h has no effect on the output. However, when you get to the second step, it uses a hidden which has been calculated using self.i2h, so there is then a traceable gradient through that layer.
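A quick way to see this with the toy code from your question (just a sketch reusing its variables, not a full training loop): run the cell a second time before computing the loss, so the output depends on the hidden state that self.i2h produced.

hidden = rnn.initHidden(batch_size)
output, hidden = rnn(toy_data_batch, hidden)  # step 1: the loss would not depend on i2h here
output, hidden = rnn(toy_data_batch, hidden)  # step 2: output now depends on the hidden produced by i2h
loss = criterion(output, toy_label_batch)
loss.backward()
print(rnn.i2h.weight.grad)  # no longer None

Also note that requires_grad=True in initHidden only makes the initial hidden tensor itself track gradients; the weights of i2h and i2o are nn.Linear parameters and require grad by default, so that flag is not what is missing here. If you really only perform a single step, you can simply skip parameters whose .grad is None in the update loop.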