Tags: python, pytorch, pytorch-lightning

CUDA PyTorch Error: Expected all tensors to be on the same device, but found at least two devices, cpu and cuda:0


I am trying to train a deep neural network on a CUDA GPU. I found some code and am trying to make it CUDA-compatible. My code is below:

class UnsupervisedModel(torch.nn.Module):

    def __init__(self):
        super(UnsupervisedModel, self).__init__()

        # Encoder
        encoder_net = get_learning_net("cnn1d_fe",
                                     {"input_channels": 1,
                                      "dropout": 0,
                                      "kernel_size": 3,
                                      "stride": 1,
                                      "mid_channels": 32,
                                      "final_out_channels": 64},
                                     state_dict=None,
                                     freeze=False)
        # Linear Classifier (single layer)
        #classifier_net = get_neural_net(name='LinearNN',
                                        #args={"input_dim": 64,
                                              #"output_dim": 2},
                                        #state_dict=None)

        self.encoder_net = encoder_net
        #self.classifier_net = classifier_net

    def forward(self, x):
        x = self.encoder_net(x)
        #x = self.classifier_net(x)
        return x

def main(args):

    # reproducibility
    random.seed(args.seed)
    np.random.seed(args.seed)
    torch.manual_seed(args.seed)
    torch.cuda.manual_seed_all(args.seed)
    torch.backends.cudnn.enabled = False
    torch.use_deterministic_algorithms(True)

    # set GPU device ID
    device = "cuda:0"
    args.device = device
    logging.info('Torch, using device: %s' % device)
    criter_cl = NTXentLoss(device, args.batch_size, 0.5, True)

    # Data loaders
    #train_loader_1, validation_loader_1, test_loader_1 = prepare_data_loaders(args)
    train_loader, validation_loader, test_loader = prepare_unseen_data_loaders(args)
    from models.helpers import proj_head
    proj_head = proj_head(args.input_length)
    #print(proj_head)
    feature_extractor = UnsupervisedModel()
    temporal_encoder = cnn1d_temporal()
    #projet(feature_extractor)
    from torch import nn
    network = nn.Sequential(feature_extractor, proj_head)
    network.to(device)

    # Model definition
    
    #model = SupervisedModel()
    #model.to(device)
    logging.info(
        'Model initialization, the number of trainable parameters: %d' %
        count_parameters(network))

    # Optimizer and criterion
    optimizer = torch.optim.Adam(
        [{"params": network.parameters()}],
        lr=args.learning_rate, weight_decay=args.weight_decay,
        betas=(0.9, 0.999), eps=1e-08)
    #optimizer = torch.optim.Adam([
        #{"params": model.encoder_net.parameters()},
        #{"params": model.classifier_net.parameters()}],
        #lr=args.learning_rate, weight_decay=args.weight_decay, betas=(0.9, 0.999), eps=1e-08)
    loss_func_cl = criter_cl
    #loss_function = torch.nn.CrossEntropyLoss()
    #metrics = {"loss": Loss(loss_func_cl)}
    metrics = {"loss": Average(output_transform=lambda x: x['loss'])}
    #metrics = {"accuracy": Accuracy(), "loss": Loss(criter_cl)}

    # Features
    # 0: ACC_x
    # 1: ACC_y
    # 2: ACC_z
    # 3: ACC_abs
    # 4: BVP
    # 5: EDA
    # 6: TEMP
    #features = {'EDA':5, 'BVP':4, 'All':[4,5,6]}
    features = {'EDA': 2, 'BVP': 1}
    #[3,4,5,6]
    feature_idx = features[args.feature_type]
    # TODO: Later, adapt the network input size/architecture to use all features.

    def train_step(engine, train_batch):
        loss = 0
        output_dic = {}

        data, data1, data2, _, _ = train_batch
        #print(data1.shape)
        aug1 = data1[:, feature_idx, :].type(torch.float)
        #print(aug1.shape)
        aug1 = torch.unsqueeze(aug1, dim=1)
        #print(aug1.shape)
        aug2 = data2[:, feature_idx, :].type(torch.float)
        aug2 = torch.unsqueeze(aug2, dim=1)
        
        aug1, aug2 = aug1.to(device), aug2.to(device)

        network.train()
        optimizer.zero_grad()
        features1 = feature_extractor(aug1)
        #print(features1.shape)
        z1 = proj_head(features1)
        #print(z1.shape)
        features2 = feature_extractor(aug2)
        z2 = proj_head(features2)
        # normalize projection feature vectors
        z1 = F.normalize(z1, dim=1)
        z2 = F.normalize(z2, dim=1)
        loss = criter_cl(z1, z2)
        #print(loss)
        loss.backward()
        optimizer.step()
        #print(loss.item())
        #loss += los.detach().item()
        output_dic['loss'] = loss.item()
        #print(total_loss)
        #net=[feature_extractor, temporal_encoder, proj_head]

        return output_dic

    def validation_step(engine, val_batch):
        output_val = {}

        data, data1, data2, _, _ = val_batch
        #print(data1.shape)
        aug1 = data1[:, feature_idx, :].type(torch.float)
        aug1 = torch.unsqueeze(aug1, dim=1)
        #print(aug1.shape)
        aug2 = data2[:, feature_idx, :].type(torch.float)
        aug2 = torch.unsqueeze(aug2, dim=1)
        
        aug1, aug2 = aug1.to(device), aug2.to(device)

        network.eval()
        with torch.no_grad():
             features1 = feature_extractor(aug1)
             z1 = proj_head(features1)
             features2 = feature_extractor(aug2)
             z2 = proj_head(features2)
             z1 = F.normalize(z1, dim=1)
             z2 = F.normalize(z2, dim=1)
             loss = criter_cl(z1, z2)
             output_val['loss'] = loss
            
        return output_val

    # Initialize trainer and evaluators
    trainer = Engine(train_step)
    lr_scheduler = LRScheduler(
        torch.optim.lr_scheduler.MultiStepLR(
            optimizer, milestones=[
                2, 10], gamma=0.1))
    trainer.add_event_handler(Events.EPOCH_STARTED, lr_scheduler)
    trainer.logger = setup_logger("Trainer")

    train_evaluator = Engine(validation_step)
    validation_evaluator = Engine(validation_step)

    trainer.run(train_loader, max_epochs=args.max_epoch)

    # Evaluate the latest snapshot on the entire WESAD dataset
    # Load trained weights
    weight_files = [p for p in list(pathlib.Path(args.log_dir).rglob('*.pt'))]
    print('Loading trained weights: %s' % weight_files[-1].as_posix())
    network.load_state_dict(torch.load(weight_files[-1].as_posix()))
    network.to(device)
    logging.info(
        'Model initialization, the number of trainable parameters: %d' %
        count_parameters(network))

    for param in network.parameters():  # freeze baseline
        param.requires_grad = False

    train_loader_1, validation_loader_1, test_loader = verbio_data_loaders(args)
    model = SupervisedModel()

    optimizer = torch.optim.Adam(model.parameters(), lr=args.learning_rate, weight_decay=args.weight_decay, betas=(0.9, 0.999), eps=1e-08)
    loss_function = torch.nn.CrossEntropyLoss()
    print('loss')
    print(Loss(loss_function))
    metrics = {"accuracy": Accuracy(), "loss": Loss(loss_function)}
    
    def train_step_1(engine, train_batch):

        data, labels, metadata = train_batch
        data, labels = data[:, feature_idx, :], labels.squeeze().long()
        data, labels = data.to(device), labels.to(device)
        
        #print(data.shape)
        data = torch.unsqueeze(data, dim=1)

        network.train()

        # forward pass
        emb = network(data)
        output = model(emb)

        # calculate loss
        loss = loss_function(output, labels)
        

        # compute gradient and do SGD step
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        #print(loss.item())

        return loss.item()

    def validation_step_1(engine, val_batch):

        data, labels, metadata = val_batch
        data, labels = data[:, feature_idx, :], labels.squeeze().long()
        data, labels = data.to(device), labels.to(device)
        data = torch.unsqueeze(data, dim=1)

        model.eval()
        with torch.no_grad():
            # forward pass
            emb = network(data)
            predicted = model(emb)

        return predicted, labels

    # Initialize trainer and evaluators
    loss_function = torch.nn.CrossEntropyLoss()
    metrics = {"accuracy": Accuracy(), "loss": Loss(loss_function)}
    trainer = Engine(train_step_1)
    lr_scheduler = LRScheduler(
        torch.optim.lr_scheduler.MultiStepLR(
            optimizer, milestones=[
                2, 10], gamma=0.1))
    trainer.add_event_handler(Events.EPOCH_STARTED, lr_scheduler)
    trainer.logger = setup_logger("Trainer")

    train_evaluator = Engine(validation_step_1)
    validation_evaluator = Engine(validation_step_1)

    # Kick everything off
    trainer.run(train_loader_1, max_epochs=args.max_epoch)
    # Evaluate the latest snapshot on the entire WESAD dataset
    # Load trained weights for the supervised task (freezing)
    weight_files_1 = [p for p in list(pathlib.Path(args.log_dir_1).rglob('*.pt'))]
    #print('Loading trained weights: %s' % weight_files_1[-1].as_posix())
    network.load_state_dict(torch.load(weight_files[-1].as_posix()))
    network.to(device)

    for param in network.parameters():  # freeze baseline
        param.requires_grad = False

    model = SupervisedModel()
    model.load_state_dict(torch.load(weight_files_1[-1].as_posix()))
    model.to(device)

    test_labels = np.zeros(shape=(0,), dtype=np.float32)
    test_predictions = np.zeros(shape=(0, 2), dtype=np.float32)
    for iteration, test_batch in enumerate(test_loader):
        data, labels, metadata = test_batch
        data, labels = data[:, 5, :], labels.squeeze().long()
        #print(data.shape)
        data = torch.unsqueeze(data, dim=1)
        #print(data.shape)
        data, labels = data.to(device), labels.to(device)
        #print(labels.shape)

        # forward pass
        with torch.no_grad():
            features = network(data)
            predicted = model(features)
            #print(predicted.shape)

        test_labels = np.concatenate(
            (test_labels, labels.cuda().numpy()), axis=0)
        test_predictions = np.concatenate(
            (test_predictions, predicted.cuda().numpy()), axis=0)

Any help would be appreciated. I tried to send all the data and labels to the device (which is cuda:0), but apparently something remains on the CPU. I am a newbie in PyTorch, so sorry if this is too obvious.


Solution

  • I can see that the model variable is not being copied to the CUDA device near the train_step_1 function definition. I am not sure why everything is being redefined there, but the following may solve the issue:

    train_loader_1, validation_loader_1, test_loader = verbio_data_loaders(args)
    model = SupervisedModel()
    model.to(device) # send the model to device here
    
    optimizer = torch.optim.Adam(model.parameters(), lr=args.learning_rate, weight_decay=args.weight_decay, betas=(0.9, 0.999), eps=1e-08)
    loss_function = torch.nn.CrossEntropyLoss()
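
  • A separate issue, not the cause of this error but one that will surface later: the final evaluation loop calls labels.cuda().numpy() and predicted.cuda().numpy(). NumPy can only convert CPU tensors, so a CUDA tensor has to be moved back with .cpu() first. A minimal sketch of that fix, keeping the variable names from your loop:

    # move tensors back to the CPU before converting to NumPy
    test_labels = np.concatenate(
        (test_labels, labels.cpu().numpy()), axis=0)
    test_predictions = np.concatenate(
        (test_predictions, predicted.cpu().numpy()), axis=0)

    While debugging device mismatches, a quick sanity check is to print where a module's weights live, e.g. print(next(model.parameters()).device); after model.to(device) this should report cuda:0, matching the device of the input batches.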