I am still new to neural networks and machine learning, and I am having trouble understanding an error I am getting in PyTorch and how to solve it.
My dataset, once stored in inputs and outputs, is a 10000x125x6 array representing 6 time-dependent variables with 125 time steps each, across 10000 independent sets. I am attempting to model this with the code below, but I am getting an error when backpropagating the gradients. I have seen answers that involve detach()'ing or inserting model_opt.zero_grad() after model_opt.step(); however, I do not understand what is happening well enough to know whether these are the correct solutions (or how to get them to work), so I am looking for more clarification and help.
Just to clarify what I intend my code to do: within train(), I manually group batches of 100 independent sets out of the 10000. For each batch, I compute its loss, add it to the loss accumulated over all batches in the current epoch, and then compute the average loss of the epoch so far. I then backpropagate this averaged loss and step the optimizer.
Here is a minimal reproducible example:
from pathlib import Path

import numpy as np
import h5py
import torch
from torch import nn
from torch import optim


class RNN(nn.Module):
    def __init__(self, input_size, hidden_size, num_classes, num_layers, batch_size=1):
        super(RNN, self).__init__()
        self.input_size = input_size
        self.hidden_size = hidden_size
        self.num_classes = num_classes
        self.batch_size = batch_size
        self.num_layers = num_layers
        self.rnn = nn.LSTM(self.input_size, self.hidden_size, self.num_layers, batch_first=True)
        self.fc = nn.Linear(hidden_size, num_classes)

    def forward(self, x, device):
        hidden = torch.zeros(self.num_layers, self.batch_size, self.hidden_size, dtype=torch.float64).to(device)
        cell_state = torch.zeros(self.num_layers, self.batch_size, self.hidden_size, dtype=torch.float64).to(device)
        output, (hidden, cell_state) = self.rnn(x, (hidden, cell_state))
        output = self.fc(output)
        return output, hidden, cell_state


def train(epochs, rnn_model, model_loss, model_opt, inputs, outputs, batch_size, device):
    for epoch in range(epochs):
        rnn_model.train()
        model_opt.zero_grad()
        total_loss = 0.0
        num_batches = np.ceil(inputs.shape[0] / batch_size)
        for batch_i in range(int(num_batches)):
            start = batch_i * batch_size
            if batch_i == num_batches - 1:
                end = inputs.shape[0]
            else:
                end = (batch_i + 1) * batch_size
            inp = inputs[start:end, :, :]
            target = outputs[start:end, :, :]
            out, hidden, cell_state = rnn_model(inp, device)
            total_loss += model_loss(out, target)
            loss = total_loss / end
            loss.backward()
            model_opt.step()
    return


def main(fname, input_size, hidden_size, output_size, num_layers, batch_size, num_epochs, learning_rate):
    data_dir = Path(r'path\to\my\data')
    # load data
    train_file = data_dir / f'NN_{fname}.h5'
    f = h5py.File(train_file, 'r')
    inputs = np.swapaxes(np.array(f['series']['input']), 0, 2)
    outputs = np.swapaxes(np.array(f['series']['output']), 0, 2)
    # Define model, optimizer, and loss
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    model = RNN(input_size, hidden_size, output_size, num_layers, batch_size=batch_size).to(device)
    optimizer = optim.Adam(model.parameters(), lr=learning_rate)
    loss_func = nn.MSELoss()
    # send data to computation device
    inputs = torch.from_numpy(inputs).to(device)
    outputs = torch.from_numpy(outputs).to(device)
    # pre-training
    i_temp = inputs[:, range(25), :]
    o_temp = outputs[:, range(25), :]
    train(int(num_epochs * 0.01), model, loss_func, optimizer, i_temp, o_temp, batch_size, device)
    return


if __name__ == '__main__':
    torch.set_default_dtype(torch.float64)
    input_size = 6
    hidden_size = 7
    output_size = 6
    num_epochs = 2500
    batch_size = 100
    learning_rate = 0.0001
    num_layers = 3
    f_name = 'data'
    main(f_name, input_size, hidden_size, output_size, num_layers, batch_size, num_epochs, learning_rate)
The error below is produced by loss.backward() within train():
RuntimeError: Trying to backward through the graph a second time (or directly access saved tensors after they have already been freed). Saved intermediate values of the graph are freed when you call .backward() or autograd.grad(). Specify retain_graph=True if you need to backward through the graph a second time or if you need to access saved tensors after calling backward.
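For reference, here is a minimal, self-contained snippet (separate from the model code above) that triggers the same RuntimeError. As far as I can tell, once backward() has been called on a tensor, the intermediate values saved in its graph are freed, so backpropagating a second time through anything that still references that graph fails:

import torch

w = torch.ones(3, requires_grad=True)

loss_a = (w * w).sum()            # builds a graph that saves w for the backward pass
loss_a.backward()                 # frees the saved tensors of that graph

total = loss_a + (w * 2.0).sum()  # total still references the freed graph of loss_a
total.backward()                  # RuntimeError: Trying to backward through the graph a second time ...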
Edit: I have added total_loss = total_loss.detach() immediately after model_opt.step(), and it now runs without error. However, I would still like to know whether this is correct given my intentions stated above.
Adding total_loss = total_loss.detach() after model_opt.step() is indeed the solution, as clarified by @c p, to properly update the model from the averaged loss after each batch.
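To make the placement concrete, here is a sketch of the train() loop from the question with that single line added after model_opt.step(); everything else, including the once-per-epoch zero_grad() call and the averaging by end, is left as in the original (the numpy import is only there to make the sketch self-contained):

import numpy as np

def train(epochs, rnn_model, model_loss, model_opt, inputs, outputs, batch_size, device):
    for epoch in range(epochs):
        rnn_model.train()
        model_opt.zero_grad()
        total_loss = 0.0
        num_batches = int(np.ceil(inputs.shape[0] / batch_size))
        for batch_i in range(num_batches):
            start = batch_i * batch_size
            end = inputs.shape[0] if batch_i == num_batches - 1 else (batch_i + 1) * batch_size
            inp = inputs[start:end, :, :]
            target = outputs[start:end, :, :]
            out, hidden, cell_state = rnn_model(inp, device)
            total_loss += model_loss(out, target)
            loss = total_loss / end           # average loss of the epoch so far
            loss.backward()
            model_opt.step()
            total_loss = total_loss.detach()  # the added line: stop tracking graphs that were already backpropagated
    return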