I am still new to neural networks and machine learning, and I am having trouble understanding an error I am getting in PyTorch and how to solve it.
My dataset, once stored in inputs and outputs, is a 10000x125x6 array representing 6 time-dependent variables with 125 time steps each, across 10000 independent sets. I am attempting to model this with the code below, but I am getting an error when backpropagating the gradients. I have seen answers that involve detach()'ing or inserting model_opt.zero_grad() after model_opt.step(); however, I do not understand what is happening well enough to know whether these are the correct solutions (or how to get them to work), so I am looking for more clarification and help.
Just to clarify what I intend my code to do: within train(), I manually group batches of 100 independent sets out of the 10000. For each batch, I compute its loss, add it to the loss accumulated over all batches in the current epoch, and then compute the average loss of the epoch so far. I then backpropagate this averaged loss and step the optimizer.
Here is a minimal reproducible example:
from pathlib import Path

import numpy as np
import h5py
import torch
from torch import nn
from torch import optim


class RNN(nn.Module):
    def __init__(self, input_size, hidden_size, num_classes, num_layers, batch_size=1):
        super(RNN, self).__init__()
        self.input_size = input_size
        self.hidden_size = hidden_size
        self.num_classes = num_classes
        self.batch_size = batch_size
        self.num_layers = num_layers
        self.rnn = nn.LSTM(self.input_size, self.hidden_size, self.num_layers, batch_first=True)
        self.fc = nn.Linear(hidden_size, num_classes)

    def forward(self, x, device):
        hidden = torch.zeros(self.num_layers, self.batch_size, self.hidden_size, dtype=torch.float64).to(device)
        cell_state = torch.zeros(self.num_layers, self.batch_size, self.hidden_size, dtype=torch.float64).to(device)
        output, (hidden, cell_state) = self.rnn(x, (hidden, cell_state))
        output = self.fc(output)
        return output, hidden, cell_state


def train(epochs, rnn_model, model_loss, model_opt, inputs, outputs, batch_size, device):
    for epoch in range(epochs):
        rnn_model.train()
        model_opt.zero_grad()
        total_loss = 0.0
        num_batches = np.ceil(inputs.shape[0] / batch_size)
        for batch_i in range(int(num_batches)):
            start = batch_i * batch_size
            if batch_i == num_batches - 1:
                end = inputs.shape[0]
            else:
                end = (batch_i + 1) * batch_size
            inp = inputs[start:end, :, :]
            target = outputs[start:end, :, :]
            out, hidden, cell_state = rnn_model(inp, device)
            total_loss += model_loss(out, target)
            loss = total_loss / end
            loss.backward()
            model_opt.step()
    return


def main(fname, input_size, hidden_size, output_size, num_layers, batch_size, num_epochs, learning_rate):
    data_dir = Path(r'path\to\my\data')
    # load data
    train_file = data_dir / f'NN_{fname}.h5'
    f = h5py.File(train_file, 'r')
    inputs = np.swapaxes(np.array(f['series']['input']), 0, 2)
    outputs = np.swapaxes(np.array(f['series']['output']), 0, 2)
    # Define model, optimizer, and loss
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    model = RNN(input_size, hidden_size, output_size, num_layers, batch_size=batch_size).to(device)
    optimizer = optim.Adam(model.parameters(), lr=learning_rate)
    loss_func = nn.MSELoss()
    # send data to computation device
    inputs = torch.from_numpy(inputs).to(device)
    outputs = torch.from_numpy(outputs).to(device)
    # pre-training
    i_temp = inputs[:, range(25), :]
    o_temp = outputs[:, range(25), :]
    train(int(num_epochs * 0.01), model, loss_func, optimizer, i_temp, o_temp, batch_size, device)
    return


if __name__ == '__main__':
    torch.set_default_dtype(torch.float64)
    input_size = 6
    hidden_size = 7
    output_size = 6
    num_epochs = 2500
    batch_size = 100
    learning_rate = 0.0001
    num_layers = 3
    f_name = 'data'
    main(f_name, input_size, hidden_size, output_size, num_layers, batch_size, num_epochs, learning_rate)
The error below is produced by loss.backward() within train():
RuntimeError: Trying to backward through the graph a second time (or directly access saved tensors after they have already been freed). Saved intermediate values of the graph are freed when you call .backward() or autograd.grad(). Specify retain_graph=True if you need to backward through the graph a second time or if you need to access saved tensors after calling backward.
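For reference, here is a minimal, self-contained snippet (separate from the model code above) that triggers the same RuntimeError. As far as I can tell, once backward() has been called on a tensor, the intermediate values saved in its graph are freed, so backpropagating a second time through anything that still references that graph fails:

import torch

w = torch.ones(3, requires_grad=True)

loss_a = (w * w).sum()            # builds a graph that saves w for the backward pass
loss_a.backward()                 # frees the saved tensors of that graph

total = loss_a + (w * 2.0).sum()  # total still references the freed graph of loss_a
total.backward()                  # RuntimeError: Trying to backward through the graph a second time ...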
Edit: I have added total_loss = total_loss.detach() immediately after model_opt.step(), and it now runs without error. However, I would still like to know whether this is correct given my intentions stated above.
Adding total_loss = total_loss.detach() after model_opt.step() is indeed the solution, as clarified by @c p, to properly update the model from the averaged loss after each batch.
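To make the placement concrete, here is a sketch of the train() loop from the question with that single line added after model_opt.step(); everything else, including the once-per-epoch zero_grad() call and the averaging by end, is left as in the original (the numpy import is only there to make the sketch self-contained):

import numpy as np

def train(epochs, rnn_model, model_loss, model_opt, inputs, outputs, batch_size, device):
    for epoch in range(epochs):
        rnn_model.train()
        model_opt.zero_grad()
        total_loss = 0.0
        num_batches = int(np.ceil(inputs.shape[0] / batch_size))
        for batch_i in range(num_batches):
            start = batch_i * batch_size
            end = inputs.shape[0] if batch_i == num_batches - 1 else (batch_i + 1) * batch_size
            inp = inputs[start:end, :, :]
            target = outputs[start:end, :, :]
            out, hidden, cell_state = rnn_model(inp, device)
            total_loss += model_loss(out, target)
            loss = total_loss / end           # average loss of the epoch so far
            loss.backward()
            model_opt.step()
            total_loss = total_loss.detach()  # the added line: stop tracking graphs that were already backpropagated
    return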