Tags: python, image-processing, pytorch, tensor, motion-blur

Pytorch transfer learning error: The size of tensor a (16) must match the size of tensor b (128) at non-singleton dimension 2


Currently, I'm working on an image motion-deblurring problem with PyTorch. I have two kinds of images: blurry images (variable = blur_image), which are the input, and the sharp versions of the same images (variable = sharp_image), which should be the output. Now I wanted to try out transfer learning, but I can't get it to work.

Here is the code for my dataloaders:

train_loader = torch.utils.data.DataLoader(train_dataset,
                                           batch_size=batch_size,
                                           shuffle=True)
validation_loader = torch.utils.data.DataLoader(valid_dataset,
                                                batch_size=batch_size,
                                                shuffle=False)
test_loader = torch.utils.data.DataLoader(test_dataset,
                                          batch_size=batch_size,
                                          shuffle=False)

Their shape:

Trainloader - Shape of blur_image [N, C, H, W]:  torch.Size([16, 3, 128, 128])
Trainloader - Shape of sharp_image [N, C, H, W]:  torch.Size([16, 3, 128, 128]) torch.float32
Validationloader - Shape of blur_image [N, C, H, W]:  torch.Size([16, 3, 128, 128])
Validationloader - Shape of sharp_image [N, C, H, W]:  torch.Size([16, 3, 128, 128]) torch.float32
Testloader - Shape of blur_image [N, C, H, W]:  torch.Size([16, 3, 128, 128])
Testloader - Shape of sharp_image [N, C, H, W]:  torch.Size([16, 3, 128, 128]) torch.float32
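
(For context, the dataset classes themselves aren't shown here; they simply return (blur, sharp) tensor pairs. A minimal sketch of such a paired dataset, with hypothetical directory names, looks like this:)

import os
from PIL import Image
from torch.utils.data import Dataset
from torchvision import transforms

class PairedDeblurDataset(Dataset):
    # Hypothetical paired dataset: returns (blur_image, sharp_image) tensors
    def __init__(self, blur_dir, sharp_dir, size=128):
        # blur_dir and sharp_dir are assumed to hold identically named files
        self.blur_dir = blur_dir
        self.sharp_dir = sharp_dir
        self.files = sorted(os.listdir(blur_dir))
        self.transform = transforms.Compose([
            transforms.Resize((size, size)),
            transforms.ToTensor(),   # scales to [0, 1], shape [3, H, W]
        ])

    def __len__(self):
        return len(self.files)

    def __getitem__(self, idx):
        name = self.files[idx]
        blur = Image.open(os.path.join(self.blur_dir, name)).convert("RGB")
        sharp = Image.open(os.path.join(self.sharp_dir, name)).convert("RGB")
        return self.transform(blur), self.transform(sharp)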

The way I use transfer learning (I thought that for 'in_features' I had to put in the number of pixels):

model = models.alexnet(pretrained=True)
model.classifier[6] = torch.nn.Linear(model.classifier[6].in_features, 128)
device_string = "cuda" if torch.cuda.is_available() else "cpu"
device = torch.device(device_string)
model = model.to(device)

The way I define my training process:

# Define the loss function (MSE was chosen due to the comparison of pixels
# between blurred and sharp images)
criterion = nn.MSELoss()

# Define the optimizer and learning rate
optimizer = optim.Adam(model.parameters(), lr=0.001)

# Learning rate schedule - If the loss value does not improve after 5 epochs
# back-to-back then the new learning rate will be:  previous_rate*0.5

#scheduler = torch.optim.lr_scheduler.StepLR(optimizer, step_size=7, gamma=0.1)

scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau( 
        optimizer,
        mode='min',
        patience=5,
        factor=0.5,
        verbose=True
    )

def training(model, trainDataloader, epoch):
  """ Function to define the model training
  
    Args:
        model (Model object): The model that is going to be trained.
        trainDataloader (Dataloader object): Dataloader object of the trainset.
        epoch (Integer): Number of training epochs.
  
  """
  # Switch the model into training mode
  model.train()
  # Running totals used to report the loss and PSNR for each epoch
  running_loss = 0.0
  running_psnr = 0.0
  for i, data in tqdm(enumerate(trainDataloader), 
                      total=int(len(train_dataset)/trainDataloader.batch_size)):
    blur_image = data[0]
    sharp_image = data[1]
        
    # Transfer the blurred and sharp image instance to the device
    blur_image = blur_image.to(device)
    sharp_image = sharp_image.to(device)

    # Sets the gradient of tensors to zero
    optimizer.zero_grad()
    outputs = model(blur_image)
    loss = criterion(outputs, sharp_image)

    # Perform backpropagation
    loss.backward()
    # Update the weights 
    optimizer.step()

    # Accumulate the loss that was calculated during the training run
    running_loss += loss.item()

    # Calculate the PSNR for this batch
    batch_psnr =  psnr(sharp_image, blur_image)
    running_psnr += batch_psnr

  # Display the training loss and metrics for this epoch
  trainings_loss = running_loss/len(trainDataloader.dataset)
  final_psnr = running_psnr/int(len(train_dataset)/trainDataloader.batch_size)
  final_ssim = ssim(sharp_image, blur_image, data_range=1, size_average=True)
  print(f"Training loss: {trainings_loss:.5f}")
  print(f"Train PSNR: {final_psnr:.5f}")
  print(f"Train SSIM: {final_ssim:.5f}")

  return trainings_loss, final_psnr, final_ssim
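
(My validation function is analogous: evaluation mode, no gradient updates, and it returns the same (loss, PSNR, SSIM) triple, using the same psnr/ssim helpers. Roughly:)

def validation(model, validDataloader, epoch):
  model.eval()
  running_loss = 0.0
  running_psnr = 0.0
  with torch.no_grad():
    for i, data in enumerate(validDataloader):
      blur_image = data[0].to(device)
      sharp_image = data[1].to(device)
      outputs = model(blur_image)
      loss = criterion(outputs, sharp_image)
      running_loss += loss.item()
      running_psnr += psnr(sharp_image, blur_image)
  validation_loss = running_loss/len(validDataloader.dataset)
  final_psnr = running_psnr/len(validDataloader)
  final_ssim = ssim(sharp_image, blur_image, data_range=1, size_average=True)
  print(f"Validation loss: {validation_loss:.5f}")
  return validation_loss, final_psnr, final_ssim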

And here is how I start the training:

train_loss  = []
val_loss = []
train_PSNR_score  = []
train_SSIM_score  = []
val_PSNR_score  = []
val_SSIM_score  = []

start = time.time()
for epoch in range(nb_epochs):
    print(f"Epoch {epoch+1}\n-------------------------------")
    train_epoch_loss = training(model, train_loader, nb_epochs)
    val_epoch_loss = validation(model, validation_loader, nb_epochs)
    train_loss.append(train_epoch_loss[0])
    val_loss.append(val_epoch_loss[0])

    train_PSNR_score.append(train_epoch_loss[1])
    train_SSIM_score.append(train_epoch_loss[2])

    val_PSNR_score.append(val_epoch_loss[1])
    val_SSIM_score.append(val_epoch_loss[2])

    # Step the plateau scheduler once per epoch on the monitored metric
    scheduler.step(val_epoch_loss[0])
end = time.time()
print(f"Took {((end-start)/60):.3f} minutes to train")

But every time I try to run the training, I receive the following error:

 0%|          | 0/249 [00:00<?, ?it/s]Epoch 1
-------------------------------
/usr/local/lib/python3.7/dist-packages/torch/nn/modules/loss.py:528: UserWarning: Using a target size (torch.Size([16, 3, 128, 128])) that is different to the input size (torch.Size([16, 128])). This will likely lead to incorrect results due to broadcasting. Please ensure they have the same size.
  return F.mse_loss(input, target, reduction=self.reduction)
---------------------------------------------------------------------------
RuntimeError                              Traceback (most recent call last)
<ipython-input-195-ff0214e227cd> in <module>()
      9 for epoch in range(nb_epochs):
     10     print(f"Epoch {epoch+1}\n-------------------------------")
---> 11     train_epoch_loss = training(model, train_loader, nb_epochs)
     12     val_epoch_loss = validation(model, validation_loader, nb_epochs)
     13     train_loss.append(train_epoch_loss[0])

<ipython-input-170-dfa2c212ad23> in training(model, trainDataloader, epoch)
     25     optimizer.zero_grad()
     26     outputs = model(blur_image)
---> 27     loss = criterion(outputs, sharp_image)
     28 
     29     # Perform backpropagation

/usr/local/lib/python3.7/dist-packages/torch/nn/modules/module.py in _call_impl(self, *input, **kwargs)
    887             result = self._slow_forward(*input, **kwargs)
    888         else:
--> 889             result = self.forward(*input, **kwargs)
    890         for hook in itertools.chain(
    891                 _global_forward_hooks.values(),

/usr/local/lib/python3.7/dist-packages/torch/nn/modules/loss.py in forward(self, input, target)
    526 
    527     def forward(self, input: Tensor, target: Tensor) -> Tensor:
--> 528         return F.mse_loss(input, target, reduction=self.reduction)
    529 
    530 

/usr/local/lib/python3.7/dist-packages/torch/nn/functional.py in mse_loss(input, target, size_average, reduce, reduction)
   2926         reduction = _Reduction.legacy_get_string(size_average, reduce)
   2927 
-> 2928     expanded_input, expanded_target = torch.broadcast_tensors(input, target)
   2929     return torch._C._nn.mse_loss(expanded_input, expanded_target, _Reduction.get_enum(reduction))
   2930 

/usr/local/lib/python3.7/dist-packages/torch/functional.py in broadcast_tensors(*tensors)
     72     if has_torch_function(tensors):
     73         return handle_torch_function(broadcast_tensors, tensors, *tensors)
---> 74     return _VF.broadcast_tensors(tensors)  # type: ignore
     75 
     76 

RuntimeError: The size of tensor a (16) must match the size of tensor b (128) at non-singleton dimension 2

Model structure:

AlexNet(
  (features): Sequential(
    (0): Conv2d(3, 64, kernel_size=(11, 11), stride=(4, 4), padding=(2, 2))
    (1): ReLU(inplace=True)
    (2): MaxPool2d(kernel_size=3, stride=2, padding=0, dilation=1, ceil_mode=False)
    (3): Conv2d(64, 192, kernel_size=(5, 5), stride=(1, 1), padding=(2, 2))
    (4): ReLU(inplace=True)
    (5): MaxPool2d(kernel_size=3, stride=2, padding=0, dilation=1, ceil_mode=False)
    (6): Conv2d(192, 384, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    (7): ReLU(inplace=True)
    (8): Conv2d(384, 256, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    (9): ReLU(inplace=True)
    (10): Conv2d(256, 256, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    (11): ReLU(inplace=True)
    (12): MaxPool2d(kernel_size=3, stride=2, padding=0, dilation=1, ceil_mode=False)
  )
  (avgpool): AdaptiveAvgPool2d(output_size=(6, 6))
  (classifier): Sequential(
    (0): Dropout(p=0.5, inplace=False)
    (1): Linear(in_features=9216, out_features=4096, bias=True)
    (2): ReLU(inplace=True)
    (3): Dropout(p=0.5, inplace=False)
    (4): Linear(in_features=4096, out_features=4096, bias=True)
    (5): ReLU(inplace=True)
    (6): Linear(in_features=4096, out_features=128, bias=True)
  )
)

I'm a newbie in terms of using PyTorch (and image deblurring in general), so I'm rather confused about the meaning of the error message and how to fix it. I tried changing my parameters, and nothing worked. Does anyone have any advice on how to solve this problem?

I would appreciate any input :)


Solution

  • You can't use AlexNet for this task, because the output of your model and sharp_image must have the same shape. The convolutional layers encode your image into an embedding, and fully connected layers cannot turn that embedding back into a full-size image: your classifier ends in Linear(4096, 128), so the model outputs a [16, 128] tensor while the target is [16, 3, 128, 128], which is exactly the mismatch the error reports. You cannot use fully connected layers for decoding; to get back to the input size you need ConvTranspose2d() layers.
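
    You can see the mismatch with a quick shape check (a sketch, using the model from the question):

    import torch
    from torchvision import models

    model = models.alexnet(pretrained=True)
    model.classifier[6] = torch.nn.Linear(model.classifier[6].in_features, 128)

    x = torch.randn(16, 3, 128, 128)   # a dummy batch shaped like blur_image
    print(model(x).shape)              # torch.Size([16, 128]), not [16, 3, 128, 128]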

    Your encoder should look like this:

    import torch
    import torch.nn as nn

    class ConvEncoder(nn.Module):
        """
        A simple Convolutional Encoder Model
        """

        def __init__(self):
            super().__init__()

            self.conv1 = nn.Conv2d(3, 16, (3, 3), padding=(1, 1))
            self.relu1 = nn.ReLU(inplace=True)
            self.maxpool1 = nn.MaxPool2d((2, 2))

            self.conv2 = nn.Conv2d(16, 32, (3, 3), padding=(1, 1))
            self.relu2 = nn.ReLU(inplace=True)
            self.maxpool2 = nn.MaxPool2d((2, 2))

            self.conv3 = nn.Conv2d(32, 64, (3, 3), padding=(1, 1))
            self.relu3 = nn.ReLU(inplace=True)
            self.maxpool3 = nn.MaxPool2d((2, 2))

            self.conv4 = nn.Conv2d(64, 128, (3, 3), padding=(1, 1))
            self.relu4 = nn.ReLU(inplace=True)
            self.maxpool4 = nn.MaxPool2d((2, 2))

        def forward(self, x):
            # Downscale the image step by step with conv + relu + maxpool
            # (shape comments assume [N, 3, 128, 128] inputs)
            x = self.conv1(x)
            x = self.relu1(x)
            x = self.maxpool1(x)    # [N, 16, 64, 64]

            x = self.conv2(x)
            x = self.relu2(x)
            x = self.maxpool2(x)    # [N, 32, 32, 32]

            x = self.conv3(x)
            x = self.relu3(x)
            x = self.maxpool3(x)    # [N, 64, 16, 16]

            x = self.conv4(x)
            x = self.relu4(x)
            x = self.maxpool4(x)    # [N, 128, 8, 8]

            return x
    

    And your decoder, whose channel counts must mirror the encoder's, should be:

    class ConvDecoder(nn.Module):
        """
        A simple Convolutional Decoder Model
        """

        def __init__(self):
            super().__init__()
            # The first deconv must accept the encoder's 128 output channels,
            # and the last must produce a 3-channel image again
            self.deconv1 = nn.ConvTranspose2d(128, 64, (2, 2), stride=(2, 2))
            self.relu1 = nn.ReLU(inplace=True)

            self.deconv2 = nn.ConvTranspose2d(64, 32, (2, 2), stride=(2, 2))
            self.relu2 = nn.ReLU(inplace=True)

            self.deconv3 = nn.ConvTranspose2d(32, 16, (2, 2), stride=(2, 2))
            self.relu3 = nn.ReLU(inplace=True)

            self.deconv4 = nn.ConvTranspose2d(16, 3, (2, 2), stride=(2, 2))
            self.relu4 = nn.ReLU(inplace=True)

        def forward(self, x):
            # Upscale the image step by step with transposed convolutions
            x = self.deconv1(x)
            x = self.relu1(x)       # [N, 64, 16, 16]

            x = self.deconv2(x)
            x = self.relu2(x)       # [N, 32, 32, 32]

            x = self.deconv3(x)
            x = self.relu3(x)       # [N, 16, 64, 64]

            x = self.deconv4(x)
            x = self.relu4(x)       # [N, 3, 128, 128], same shape as sharp_image
            return x
    
    encoder = ConvEncoder().to(device)
    decoder = ConvDecoder().to(device)
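
    A quick sanity check (a sketch with a random batch) confirms that the shapes now line up with sharp_image:

    dummy = torch.randn(16, 3, 128, 128).to(device)
    out = decoder(encoder(dummy))
    print(out.shape)    # torch.Size([16, 3, 128, 128])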
    

    You can train your model like this:

    import torch.optim as optim

    loss_fn = nn.MSELoss()
    # Optimize the parameters of both networks with a single optimizer
    optimizer = optim.Adam(list(encoder.parameters()) + list(decoder.parameters()),
                           lr=0.001)

    def train_step(encoder, decoder, train_loader):
        encoder.train()
        decoder.train()

        for batch_idx, (train_img, target_img) in enumerate(train_loader):
            # Move images to the device
            train_img = train_img.to(device)
            target_img = target_img.to(device)

            # Zero the gradients of the optimizer
            optimizer.zero_grad()
            # Feed the train images to the encoder
            enc_output = encoder(train_img)
            # The output of the encoder is the input to the decoder!
            dec_output = decoder(enc_output)

            # The decoder output is the reconstructed image;
            # compute the loss between it and the original (target) image
            loss = loss_fn(dec_output, target_img)
            # Backpropagate
            loss.backward()
            # Apply the optimizer to the networks by calling step
            optimizer.step()
        # Return the loss of the last batch
        return loss.item()
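
    Since you also track PSNR, note that it should compare the model output with the sharp target, not the blurred input. If you need a psnr helper, a minimal version in plain PyTorch (a sketch, assuming images scaled to [0, 1]) could be:

    def psnr(target, output, max_val=1.0):
        # Peak signal-to-noise ratio between two image batches
        mse = torch.mean((target - output) ** 2)
        if mse == 0:
            return float("inf")
        return (20 * torch.log10(torch.tensor(max_val))
                - 10 * torch.log10(mse)).item()

    # For example, inside the loop above:
    # batch_psnr = psnr(target_img, dec_output)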
    

    You might want to visit this for more help with your project.