I cannot understand why this error keeps popping out. I start specifying the device variable
if torch.cuda.is_available():
device = torch.device("cuda")
n_gpu = torch.cuda.device_count()
torch.cuda.get_device_name(0)
else:
device = torch.device("cpu")
The model is the following
class CNN(nn.Module):
def __init__(self, initial_num_channels, num_channels):
'''
Args:
initial_num_channels (int): size of the incoming feature vector
num_classes (int): size of the output prediction vector
num_channels (int): constant channel size to use throughout network
'''
super(CNN, self).__init__()
self.convnet = nn.Sequential(
nn.Conv1d(in_channels=initial_num_channels,
out_channels=num_channels, kernel_size=3),
nn.ELU(),
nn.Conv1d(in_channels=num_channels, out_channels=num_channels,
kernel_size=3, stride=2),
nn.ELU(),
nn.Conv1d(in_channels=num_channels, out_channels=num_channels,
kernel_size=3, stride=2),
nn.ELU(),
nn.Conv1d(in_channels=num_channels, out_channels=num_channels,
kernel_size=3),
nn.ELU() )
def forward(self, x, apply_softmax=False):
"""The forward pass of the classifier
Args:
x (torch.Tensor): an input data tensor.
x.shape should be (batch, dataset._max_seq_length)
apply_softmax (bool): a flag for the softmax activation
should be false if used with the Cross Entropy losses
Returns:
the resulting tensor. tensor.shape should be (batch, num_classes)
"""
# input tensor: batch_size x channels x signal_length
x = self.convnet(x)
# average through maxpooling
x = F.avg_pool1d(x, kernel_size=3)
x = F.dropout(x, p=0.1)
# go back to 2 dimensions: batch_size x features
x = x.view(x.size(dim=0), -1)
# compute the input size of linear combination layer
num_features = x.size(dim=1)
# final linear comb layers
self.fc1 = nn.Linear(num_features, 100)
self.fc2 = nn.Linear(100, 2)
# mlp classifier
x = F.relu(F.dropout(self.fc1(x), p=0.1))
x = self.fc2(x)
if apply_softmax:
x = F.softmax(x, dim=1)
return x
Here I move it onto the GPU
epochs = 10
model = CNN(initial_num_channels=1, num_channels=256)
# model = MultilayerPerceptron(input_dim=MAX_LEN, hidden_dim=100, output_dim=2)
model = model.to(device)
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=0.001)
Finally I run the training loop
train_loss_set = []
executed = False
for epoch in range(1, epochs + 1):
train_loop = tqdm(train_dataloader)
model.train()
train_steps = 0
train_loss = 0
for (idx,(train_input, train_label)) in enumerate(train_loop):
# add dimension=1 in position 1 to have channels=1
# ONLY FOR CNN
train_input = torch.unsqueeze(train_input, 1)
train_input = train_input.clone().detach().requires_grad_(True).to(device)
train_label = train_label.clone().detach().to(device)
train_output = model(train_input)
loss = criterion(train_output, train_label)
loss.backward()
train_loss_set.append(loss.item())
optimizer.step()
model.zero_grad()
train_steps += 1
train_loss += loss.item()
train_loss_set.append(loss.item())
print("Train loss: {}".format(train_loss/train_steps))
The error is triggered during the forward pass of the model, hence I assume, the problem is related to the input. I tried either the classic assignment input = input.to(device) and also the "fancier" one (which you can see above)
The problem is with the fc1
and fc2
layers which you create in your forward
function. These are only created when the model is called on an input, and are not moved to the GPU. To fix this, you can use:
self.fc1 = nn.Linear(num_features, 100, device=x.device)
self.fc2 = nn.Linear(100, 2, device=x.device)
The definition of fc2
could also be moved to the __init__
function to avoid recreating it each time.