Tags: python, pytorch, lstm, gradient-descent

PyTorch LSTM has nan for MSELoss


My model is:

import torch
import torch.nn as nn
from torch.autograd import Variable


class BaselineModel(nn.Module):
    def __init__(self, feature_dim=5, hidden_size=5, num_layers=2, batch_size=32):
        super(BaselineModel, self).__init__()
        self.num_layers = num_layers
        self.hidden_size = hidden_size

        self.lstm = nn.LSTM(input_size=feature_dim,
                            hidden_size=hidden_size, num_layers=num_layers)

    def forward(self, x, hidden):
        lstm_out, hidden = self.lstm(x, hidden)
        return lstm_out, hidden

    def init_hidden(self, batch_size):
        # .new(sizes) allocates uninitialized tensors with the same
        # dtype/device as the model parameters
        hidden = Variable(next(self.parameters()).data.new(
            self.num_layers, batch_size, self.hidden_size))
        cell = Variable(next(self.parameters()).data.new(
            self.num_layers, batch_size, self.hidden_size))
        return (hidden, cell)

Training looks like:

import torch.optim as optim
import torch.utils.data

train_loader = torch.utils.data.DataLoader(
    train_set, batch_size=BATCH_SIZE, shuffle=True, **params)

model = BaselineModel(batch_size=BATCH_SIZE)
optimizer = optim.Adam(model.parameters(), lr=0.01, weight_decay=0.0001)
loss_fn = torch.nn.MSELoss(reduction='sum')

for epoch in range(250):

    # hidden = (torch.zeros(2, 13, 5),
    #           torch.zeros(2, 13, 5))
    # model.hidden = hidden
    for i, data in enumerate(train_loader):
        hidden = model.init_hidden(13)
        inputs = data[0]
        outputs = data[1]

        print('inputs',  inputs.size())
        # print('outputs', outputs.size())

        # optimizer.zero_grad()
        model.zero_grad()

        # print('inputs', inputs)
        pred, hidden = model(inputs, hidden)

        loss = loss_fn(pred, outputs)

        loss.backward()
        torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
        optimizer.step()

        print('Epoch: ', epoch, '\ti: ', i, '\tLoss: ', loss)

I already have gradient clipping set up, which seems to be the recommended solution. But even after the first step, I get:

Epoch: 0 i: 0 Loss: tensor(nan, grad_fn=<MseLossBackward>)


Solution

  • I suspect your issue has to do with your outputs / data[1] (it would help if you showed examples of your train_set). Running the following piece of code gives no nan, but note that I forced the shape of the output by hand before calling loss_fn(pred, outputs); see also the data checks sketched after the note at the end of this answer:

    import torch
    import torch.nn as nn
    import torch.optim as optim
    from torch.autograd import Variable


    class BaselineModel(nn.Module):
        def __init__(self, feature_dim=5, hidden_size=5, num_layers=2, batch_size=32):
            super(BaselineModel, self).__init__()
            self.num_layers = num_layers
            self.hidden_size = hidden_size
    
            self.lstm = nn.LSTM(input_size=feature_dim,
                                hidden_size=hidden_size, num_layers=num_layers)
    
        def forward(self, x, hidden):
            lstm_out, hidden = self.lstm(x, hidden)
            return lstm_out, hidden
    
        def init_hidden(self, batch_size):
            hidden = Variable(next(self.parameters()).data.new(
                self.num_layers, batch_size, self.hidden_size))
            cell = Variable(next(self.parameters()).data.new(
                self.num_layers, batch_size, self.hidden_size))
            return (hidden, cell)
    
    model = BaselineModel(batch_size=32)
    optimizer = optim.Adam(model.parameters(), lr=0.01, weight_decay=0.0001)
    loss_fn = torch.nn.MSELoss(reduction='sum')
    
    hidden = model.init_hidden(10)
    model.zero_grad()
    pred, hidden = model(torch.randn(2,10,5), hidden)
    print(pred.size())  # torch.Size([2, 10, 5])
    outputs = torch.zeros(2,10,5)
    
    loss = loss_fn(pred, outputs)
    print(loss)  # finite value, no nan at this point
    
    loss.backward()
    torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
    optimizer.step()
    print(loss)
    

    Please note that a common cause of nan values is numerical instability in the learning phase, but in that case you usually see finite loss values for the first few steps before the divergence appears, which is apparently not what is happening here.
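    If you want to verify that on your side, a quick check along the lines below (it reuses the train_loader from your question, so the variable names are my assumption) tells you whether data[1] already contains nan or has a shape that does not match pred:

    import torch

    for i, data in enumerate(train_loader):
        inputs, targets = data[0], data[1]
        # MSELoss returns nan on the very first step if any target is nan
        if torch.isnan(inputs).any() or torch.isnan(targets).any():
            print('nan found in batch', i)
        if i == 0:
            # compare these against pred.size() from the forward pass;
            # a silent broadcast on mismatched shapes distorts the loss
            print('inputs:', inputs.size(), 'targets:', targets.size())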
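    If the data turns out to be clean, torch.autograd.set_detect_anomaly can help you locate the operation that first produces a nan; a minimal sketch, reusing the objects from your training loop:

    import torch

    torch.autograd.set_detect_anomaly(True)  # debugging only, slows training

    pred, hidden = model(inputs, hidden)
    loss = loss_fn(pred, outputs)
    # with anomaly detection enabled, backward() raises an error whose
    # traceback points at the forward operation that produced the first nan
    loss.backward()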