Tags: deep-learning, pytorch

Both PyTorch model and tensor on GPU but getting all tensors not on same device error


I'm running inference with a ConvNet built in PyTorch and getting a RuntimeError on the following line:

outputs = model(X_batch)

The error is

RuntimeError: Expected all tensors to be on the same device, but found at least two devices, cpu and cuda:0! (when checking argument for argument mat1 in method wrapper_CUDA_addmm)

My code structure is as follows. I put both the model and the input tensor on the GPU, and added a few checks to confirm that they are indeed both on the GPU.

print(args.device) # returns 'cuda'
print(torch.cuda.is_available()) # returns True

model = CNN_MLP(args)
model.to(args.device)

def inference(model, test_dataloader, device='cpu'):
    model.eval()

    metrics = []
    for _, (X_batch, y_batch) in enumerate(tqdm(test_dataloader)):
        X_batch = X_batch.to(device)

        print(next(model.parameters()).is_cuda) # returns True
        print(X_batch.is_cuda) # returns True

        outputs = model(X_batch) # Error raised here
        metrics.append(calc_metrics(outputs, y_batch)) # simplified version of the actual code

    return aggregate(metrics)

zero_shot_metrics = inference(model, test_dataloader, device=args.device)

Other details:

  • I'm using a single NVIDIA RTX A4000 GPU, and can confirm this with nvidia-smi.
  • The torch version is 2.0.1+cu118. nvidia-smi shows the CUDA version is 12.0. I'm not sure if this mismatch is significant, but torch.cuda.is_available() seems to be working fine.
  • The CNN_MLP class is defined as follows:
import math
import torch

class CNN_MLP(torch.nn.Module):
    """ CNN-MLP with 1 Conv layer, 1 Max Pool layer, and 1 Linear layer. """

    def __init__(self, seq_len=220, embed_size=64, vocab_size=45, pad_index=0, 
                 stride=1, kernel_size=3, conv_out_size=64, hidden_layer_sizes=[128, 64, 32, 8, 1], dropout_rate=0.25):    
        super(CNN_MLP, self).__init__()
        
        # Embedding layer parameters
        self.seq_len = seq_len
        self.embed_size = embed_size
        self.vocab_size = vocab_size
        self.pad_index = pad_index
        self.hidden_layer_sizes = hidden_layer_sizes
        
        # Conv layer parameters
        self.stride = stride
        self.kernel_size = kernel_size
        self.conv_out_size = conv_out_size
        
        # Misc
        self.dropout_rate = dropout_rate
        
        # Conv Layers
        self.embedding = torch.nn.Embedding(self.vocab_size, self.embed_size, padding_idx=self.pad_index)
        
        self.conv = torch.nn.Conv1d(self.seq_len, self.conv_out_size, self.kernel_size, self.stride)
        self.hidden_act = torch.relu
        self.max_pool = torch.nn.MaxPool1d(self.kernel_size, self.stride)        
        self.flatten = lambda x: x.view(x.shape[0], x.shape[1]*x.shape[2])
        
        # MLP layers
        self.fc_layers = []
        self.hidden_layer_sizes.insert(0, self._linear_layer_in_size())
        for i in range(len(self.hidden_layer_sizes) - 1):
            self.fc_layers.append(torch.nn.Linear(self.hidden_layer_sizes[i], self.hidden_layer_sizes[i+1]))
            self.fc_layers.append(torch.nn.ReLU())
            if self.dropout_rate and i != len(self.hidden_layer_sizes) - 2:
                self.fc_layers.append(torch.nn.Dropout(self.dropout_rate))
        self.fc_layers.append(torch.sigmoid)
        
    def _linear_layer_in_size(self):
        out_conv_1 = ((self.embed_size - 1 * (self.kernel_size - 1) - 1) / self.stride) + 1
        out_conv_1 = math.floor(out_conv_1)
        out_pool_1 = ((out_conv_1 - 1 * (self.kernel_size - 1) - 1) / self.stride) + 1
        out_pool_1 = math.floor(out_pool_1)
                            
        return out_pool_1*self.conv_out_size
    
    def forward(self, x):
        x = self.embedding(x)
        
        x = self.conv(x)
        x = self.hidden_act(x)
        x = self.max_pool(x)

        x = self.flatten(x)
        
        for layer in self.fc_layers:
            x = layer(x)
        
        return x.squeeze()

    def embed(self, x):
        x = self.embedding(x)
        
        x = self.conv(x)
        x = self.hidden_act(x)
        x = self.max_pool(x)

        x = self.flatten(x)

        for i, layer in enumerate(self.fc_layers):
            if i != len(self.fc_layers) - 1:
                x = layer(x)
        
        return x

Solution

  • In CNN_MLP, the fc_layers member needs to be registered as a submodule because it contains other modules. When you call .to on your model, .to is applied recursively to every member that inherits from nn.Module. Since fc_layers is a plain Python list (not an nn.Module), the parameters of the Linear layers it contains are never moved to the GPU; they stay on the CPU, which is exactly what the error reports (addmm is the matrix multiplication inside nn.Linear, so a mat1 on cpu points at an unmoved Linear layer). The simplest fix would be to replace

            self.fc_layers = []
    

    with

            self.fc_layers = torch.nn.ModuleList()
    

    (see torch.nn.ModuleList for more information). Note that an nn.ModuleList can only hold nn.Module instances, so the torch.sigmoid function appended at the end of __init__ must also be replaced with the torch.nn.Sigmoid() module.

    A better solution would be to make fc_layers an nn.Sequential and then call it like a normal nn.Module during forward, as sketched below.
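
    A minimal sketch of that approach, assuming the same hyperparameters as in the question. The ReLU that the original loop places after the final Linear layer is kept so the behavior matches, and the list default is swapped for a tuple so that building the sizes no longer mutates a shared default argument (which the original insert(0, ...) does):

        import math
        import torch

        class CNN_MLP(torch.nn.Module):
            """Same model as in the question, with the MLP head in an nn.Sequential."""

            def __init__(self, seq_len=220, embed_size=64, vocab_size=45, pad_index=0,
                         stride=1, kernel_size=3, conv_out_size=64,
                         hidden_layer_sizes=(128, 64, 32, 8, 1), dropout_rate=0.25):
                super().__init__()
                self.seq_len = seq_len
                self.embed_size = embed_size
                self.stride = stride
                self.kernel_size = kernel_size
                self.conv_out_size = conv_out_size

                self.embedding = torch.nn.Embedding(vocab_size, embed_size, padding_idx=pad_index)
                self.conv = torch.nn.Conv1d(seq_len, conv_out_size, kernel_size, stride)
                self.max_pool = torch.nn.MaxPool1d(kernel_size, stride)

                # Same loop as in the question, but the layers are collected into an
                # nn.Sequential, which registers each one as a submodule so that
                # model.to(device) reaches all of their parameters.
                sizes = [self._linear_layer_in_size()] + list(hidden_layer_sizes)
                layers = []
                for i in range(len(sizes) - 1):
                    layers.append(torch.nn.Linear(sizes[i], sizes[i + 1]))
                    layers.append(torch.nn.ReLU())
                    if dropout_rate and i != len(sizes) - 2:
                        layers.append(torch.nn.Dropout(dropout_rate))
                layers.append(torch.nn.Sigmoid())  # a Module, unlike torch.sigmoid
                self.fc_layers = torch.nn.Sequential(*layers)

            def _linear_layer_in_size(self):
                out_conv = math.floor((self.embed_size - (self.kernel_size - 1) - 1) / self.stride + 1)
                out_pool = math.floor((out_conv - (self.kernel_size - 1) - 1) / self.stride + 1)
                return out_pool * self.conv_out_size

            def forward(self, x):
                x = self.embedding(x)
                x = torch.relu(self.conv(x))
                x = self.max_pool(x)
                x = x.flatten(1)          # equivalent to the view-based flatten
                x = self.fc_layers(x)     # one call replaces the Python-list loop
                return x.squeeze()

            def embed(self, x):
                x = self.embedding(x)
                x = torch.relu(self.conv(x))
                x = self.max_pool(x)
                x = x.flatten(1)
                # nn.Sequential supports slicing: run everything except the Sigmoid.
                return self.fc_layers[:-1](x)

    One way to confirm the fix: with the plain Python list, the Linear layers never appear in model.parameters() at all (so .to(), state_dict(), and any optimizer built from model.parameters() silently miss them). After the change, every parameter should report the expected device:

        model = CNN_MLP()
        model.to('cuda')
        for name, p in model.named_parameters():
            print(name, p.device)  # fc_layers.* entries now show up, all on cuda:0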