Expected all tensors to be on the same device, but found at least two devices, cpu and cuda:0

I cannot understand why this error keeps popping out. I start specifying the device variable

if torch.cuda.is_available():
    device = torch.device("cuda")
    n_gpu = torch.cuda.device_count()
    torch.cuda.get_device_name(0)
else:
    device = torch.device("cpu")

The model is the following

class CNN(nn.Module):
def __init__(self, initial_num_channels, num_channels):
    '''
    Args:
    initial_num_channels (int): size of the incoming feature vector 
    num_classes (int): size of the output prediction vector 
    num_channels (int): constant channel size to use throughout network
    '''
    super(CNN, self).__init__()
    self.convnet = nn.Sequential( 
        
        nn.Conv1d(in_channels=initial_num_channels,
                  out_channels=num_channels, kernel_size=3),

        nn.ELU(),

        nn.Conv1d(in_channels=num_channels, out_channels=num_channels,
                  kernel_size=3, stride=2),

        nn.ELU(), 
        
        nn.Conv1d(in_channels=num_channels, out_channels=num_channels,
                  kernel_size=3, stride=2),

        nn.ELU(), 
        
        nn.Conv1d(in_channels=num_channels, out_channels=num_channels,
                  kernel_size=3),

        nn.ELU() )
    
    
def forward(self, x, apply_softmax=False):

    """The forward pass of the classifier
    
    Args:
        x (torch.Tensor): an input data tensor. 
            x.shape should be (batch, dataset._max_seq_length)
        apply_softmax (bool): a flag for the softmax activation
            should be false if used with the Cross Entropy losses
    Returns:
        the resulting tensor. tensor.shape should be (batch, num_classes)
    """
    
    # input tensor: batch_size x channels x signal_length
    x = self.convnet(x)

    # average through maxpooling
    x = F.avg_pool1d(x, kernel_size=3)
    x = F.dropout(x, p=0.1)

    # go back to 2 dimensions: batch_size x features
    x = x.view(x.size(dim=0), -1)

    # compute the input size of linear combination layer
    num_features = x.size(dim=1)

    # final linear comb layers
    self.fc1 = nn.Linear(num_features, 100)
    self.fc2 = nn.Linear(100, 2) 
    
    # mlp classifier
    x = F.relu(F.dropout(self.fc1(x), p=0.1))
    x = self.fc2(x)

    if apply_softmax:
        x = F.softmax(x, dim=1)

    return x

Here I move it onto the GPU

epochs = 10
model = CNN(initial_num_channels=1, num_channels=256)
# model = MultilayerPerceptron(input_dim=MAX_LEN, hidden_dim=100, output_dim=2)
model = model.to(device)
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=0.001)

Finally I run the training loop

train_loss_set = []
executed = False

for epoch in range(1, epochs + 1):
    train_loop = tqdm(train_dataloader)

    model.train()

    train_steps = 0
    train_loss = 0

    for (idx,(train_input, train_label)) in enumerate(train_loop):

        # add dimension=1 in position 1 to have channels=1
        # ONLY FOR CNN
        train_input = torch.unsqueeze(train_input, 1)

        train_input = train_input.clone().detach().requires_grad_(True).to(device)
        train_label = train_label.clone().detach().to(device)
        train_output = model(train_input)
        loss = criterion(train_output, train_label)
        loss.backward()
        train_loss_set.append(loss.item())
        optimizer.step()
        model.zero_grad()

        train_steps += 1
        train_loss += loss.item()
        train_loss_set.append(loss.item())

    print("Train loss: {}".format(train_loss/train_steps))

The error is triggered during the forward pass of the model, hence I assume, the problem is related to the input. I tried either the classic assignment input = input.to(device) and also the "fancier" one (which you can see above)

Solution

The problem is with the fc1 and fc2 layers which you create in your forward function. These are only created when the model is called on an input, and are not moved to the GPU. To fix this, you can use:

self.fc1 = nn.Linear(num_features, 100, device=x.device)
self.fc2 = nn.Linear(100, 2, device=x.device)

The definition of fc2 could also be moved to the __init__ function to avoid recreating it each time.