Here is a simple example that results in an in-place operation error.
import torch
import torch.nn as nn
import torch.nn.functional as F
from collections import OrderedDict
from torch import optim
torch.autograd.set_detect_anomaly(True)
class Loss(nn.Module):
def __init__(self):
super(Loss, self).__init__()
def forward(self, x, target):
return x[0,0,0,0]
def block(in_channels, features, name):
return nn.Conv2d(in_channels=in_channels,
out_channels=features,
kernel_size=3,
padding=1,
bias=False)
class SharedNetwork(nn.Module):
def __init__(self):
super().__init__()
self.shared_layer = block(in_channels=3, features=1, name="wow")
def forward(self, x):
x = self.shared_layer(x)
return x
class Network1(nn.Module):
def __init__(self):
super().__init__()
self.conv = block(in_channels=1, features=1, name="wow-1")
def forward(self, x):
return self.conv(x)
class Network2(nn.Module):
def __init__(self):
super().__init__()
self.conv = block(in_channels=1, features=1, name="wow-2")
def forward(self, x):
return torch.sigmoid(self.conv(x))
shared_net = SharedNetwork()
net_1 = Network1()
segmentor = Network2()
optimizer = optim.Adam(list(shared_net.parameters()) + list(segmentor.parameters()), lr=1e-6)
optimizer_conf = optim.Adam(list(shared_net.parameters()), lr=1e-6)
loss_fn = Loss()
# 2. Run a forward pass
fake_data = torch.randint(0,255,(1, 3, 256, 256))/255
target_data_1 = torch.randint(0,255,(1, 3, 256, 256))/255
target_data_2 = torch.randint(0,255,(1, 3, 256, 256))/255
optimizer.zero_grad()
optimizer_conf.zero_grad()
features = shared_net(fake_data)
segmented = segmentor(features)
s_loss = loss_fn(segmented, target_data_2)
s_loss.backward(retain_graph=True)
optimizer.step()
out_1 = net_1(features)
loss = loss_fn(out_1, target_data_1)
loss.backward(retain_graph=False)
optimizer_conf.step()
Error message:
UserWarning: Error detected in ConvolutionBackward0. No forward pass information available. Enable detect anomaly during forward pass for more information. (Triggered internally at C:\cb\pytorch_1000000000000\work\torch\csrc\autograd\python_anomaly_mode.cpp:97.)
Variable._execution_engine.run_backward( # Calls into the C++ engine to run the backward pass
one of the variables needed for gradient computation has been modified by an inplace operation: [torch.FloatTensor [1, 3, 3, 3]] is at version 2; expected version 1 instead. Hint: the backtrace further above shows the operation that failed to compute its gradient. The variable in question was changed in there or anywhere later. Good luck!
I was able to solve the problem by changing the order of running the step
function of optimizers.
optimizer_conf.zero_grad()
optimizer.zero_grad()
features = shared_net(fake_data)
segmented = segmentor(features)
s_loss = loss_fn(segmented, target_data_2)
s_loss.backward(retain_graph=True)
out_1 = net_1(features)
loss = loss_fn(out_1, target_data_1)
loss.backward(retain_graph=False)
optimizer_conf.step()
optimizer.step()
The following questions, however, remain:
step
method cause an inplace operation in convolution?NOTE: The loss function is used for simplicity, using dice-loss also results in the same error!
Before answering the question, I have to mention that it seems having multiple optimizers for one set of parameters is anti-pattern and it's better to be avoided.
param.weight += param.grad
which can be interpreted as an in place operation
To sum up, it's best to have only one optimizer for one set of parameters, the previous example could coded in the following way:
import torch
import torch.nn as nn
import torch.nn.functional as F
from collections import OrderedDict
from torch import optim
torch.autograd.set_detect_anomaly(True)
class Loss(nn.Module):
def __init__(self):
super(Loss, self).__init__()
def forward(self, x, target):
return x[0,0,0,0]
def block(in_channels, features, name):
return nn.Conv2d(in_channels=in_channels,
out_channels=features,
kernel_size=(3,3),
padding=1,
bias=False)
class SharedNetwork(nn.Module):
def __init__(self):
super().__init__()
self.shared_layer = block(in_channels=3, features=1, name="wow")
def forward(self, x):
x = self.shared_layer(x)
return x
class Network1(nn.Module):
def __init__(self):
super().__init__()
self.conv = block(in_channels=1, features=1, name="wow-1")
def forward(self, x):
return self.conv(x)
class Network2(nn.Module):
def __init__(self):
super().__init__()
self.conv = block(in_channels=1, features=1, name="wow-2")
def forward(self, x):
return torch.sigmoid(self.conv(x))
torch.manual_seed(0)
shared_net = SharedNetwork()
net_1 = Network1()
net_2 = Network2()
shared_optimizer = optim.Adam(list(shared_net.parameters()), lr=1e-6)
net_1_optimizer = optim.Adam(list(net_1.parameters()), lr=1e-6)
net_2_optimizer = optim.Adam(list(segmentor.parameters()), lr=1e-6)
loss_fn = Loss()
# 2. Run a forward pass
fake_data = torch.randint(0,255,(1, 3, 256, 256))/255
target_data_1 = torch.randint(0,255,(1, 3, 256, 256))/255
target_data_2 = torch.randint(0,255,(1, 3, 256, 256))/255
net_2_optimizer.zero_grad()
features = shared_net(fake_data)
net_2_out = net_2(features)
s_loss = loss_fn(net_2_out, target_data_2)
s_loss.backward(retain_graph=True)
net_2_optimizer.step()
net_1_optimizer.zero_grad()
shared_optimizer.zero_grad()
out_1 = net_1(features)
loss = loss_fn(out_1, target_data_1)
loss.backward(retain_graph=False)
net_1_optimizer.step()
shared_optimizer.step()
Note: If you want to have two different learning rates for different losses applied to one set of parameters, you can multiply the losses based on their importance by a value. For example, you can multiply loss_1 by 0.1 and loss_1 by 0.5. Or, you can use backward hooks as mentioned in this comment: backward-hook