Tags: python, neural-network, pytorch

How to train a network on images of different sizes in PyTorch


I am trying to feed a dataset of images to a neural network and I am getting this error. I don't know what the cause might be, as all the images have different sizes. I have also tried to change the batch sizes and kernels, but I had no success with this.

 File "c:\Users\david\Desktop\cs_agent\main.py", line 49, in <module>
    for i, data in enumerate(train_loader, 0):
  File "C:\Users\david\AppData\Local\Programs\Python\Python310\lib\site-packages\torch\utils\data\dataloader.py", line 530, in __next__
    data = self._next_data()
  File "C:\Users\david\AppData\Local\Programs\Python\Python310\lib\site-packages\torch\utils\data\dataloader.py", line 570, in _next_data
    data = self._dataset_fetcher.fetch(index)  # may raise StopIteration
  File "C:\Users\david\AppData\Local\Programs\Python\Python310\lib\site-packages\torch\utils\data\_utils\fetch.py", line 52, in fetch
    return self.collate_fn(data)
  File "C:\Users\david\AppData\Local\Programs\Python\Python310\lib\site-packages\torch\utils\data\_utils\collate.py", line 172, in default_collate
    return [default_collate(samples) for samples in transposed]  # Backwards compatibility.
  File "C:\Users\david\AppData\Local\Programs\Python\Python310\lib\site-packages\torch\utils\data\_utils\collate.py", line 172, in <listcomp>
    return [default_collate(samples) for samples in transposed]  # Backwards compatibility.
  File "C:\Users\david\AppData\Local\Programs\Python\Python310\lib\site-packages\torch\utils\data\_utils\collate.py", line 138, in default_collate
    return torch.stack(batch, 0, out=out)
RuntimeError: stack expects each tensor to be equal size, but got [3, 300, 535] at entry 0 and [3, 1080, 1920] at entry 23

This is my main file:


import numpy as np
import matplotlib.pyplot as plt
import torch
import dataset 
import os 
from torch.utils.data import DataLoader
import torch.nn as nn

import torchvision
import check_device

import neural_network
import torch.optim as optim

EPS = 1.e-7
LR = 0.5
WEIGHT_DECAY = 0.5
batch_size = 50
#DATA LOADING ###################################################################################################################



test_dataset = dataset.csHeadBody(csv_file="images\\test_labels.csv", root_dir="images\\test")
train_dataset = dataset.csHeadBody(csv_file="images\\train_labels.csv", root_dir="images\\train")
train_loader = DataLoader(dataset=train_dataset, batch_size=batch_size, shuffle=True)
test_loader = DataLoader(dataset=test_dataset, batch_size=batch_size, shuffle=True)




#DATA LOADING ###################################################################################################################END


#NEURAL NET #####################################################################################################################################################

net=neural_network.Net()
criterion = nn.CrossEntropyLoss()
optimizer = optim.SGD(net.parameters(), lr=0.001, momentum=0.9)


#NEURAL NET END ######################################################################################



for epoch in range(2):  # loop over the dataset multiple times

    running_loss = 0.0
    for i, data in enumerate(train_loader, 0):
        # get the inputs; data is a list of [inputs, labels]
        print(data)
        inputs, labels = data

        # zero the parameter gradients
        optimizer.zero_grad()

        # forward + backward + optimize
        outputs = net(inputs)
        loss = criterion(outputs, labels)
        loss.backward()
        optimizer.step()

        # print statistics
        running_loss += loss.item()
        if i % 2000 == 1999:    # print every 2000 mini-batches
            print(f'[{epoch + 1}, {i + 1:5d}] loss: {running_loss / 2000:.3f}')
            running_loss = 0.0

print('Finished Training')



And this is my dataset file:

import os

import pandas as pd
from torch.utils.data import Dataset
from torchvision.io import read_image


class csHeadBody(Dataset):
    def __init__(self, csv_file, root_dir, transform=None, target_transform=None):
        self.img_labels = pd.read_csv(csv_file)
        self.root_dir = root_dir
        self.transform = transform
        self.target_transform = target_transform

    def __len__(self):
        return len(self.img_labels)

    def __getitem__(self, idx):
        img_path = os.path.join(self.root_dir, self.img_labels.iloc[idx, 0])
        
        image = read_image(img_path)
        label = self.img_labels.iloc[idx, 1]
        if self.transform:
            image = self.transform(image)
        if self.target_transform:
            label = self.target_transform(label)
        return image, label 

And this is my neural network architecture:

import torch.nn.functional as F
import torch.nn as nn
import torch


class Net(nn.Module):
    def __init__(self):
        super().__init__()
        self.conv1 = nn.Conv2d(3, 535, 535)
        self.pool = nn.MaxPool2d(2, 2)
        self.conv2 = nn.Conv2d(6, 16, 5)
        self.fc1 = nn.Linear(16 * 5 * 5, 120)
        self.fc2 = nn.Linear(120, 84)
        self.fc3 = nn.Linear(84, 10)

    def forward(self, x):
        x = self.pool(F.relu(self.conv1(x)))
        x = self.pool(F.relu(self.conv2(x)))
        x = torch.flatten(x, 1) # flatten all dimensions except batch
        x = F.relu(self.fc1(x))
        x = F.relu(self.fc2(x))
        x = self.fc3(x)
        return x



Solution

  • TLDR: The error was due to inputs of different sizes being passed to collate_fn. If you need to work with different input sizes, you must either handle that in the transforms (e.g. SmallestMaxSize + Crop or LongestMaxSize + Pad) or write a custom batch sampler.

    The error was actually not related to the neural network architecture. It was caused by trying to create a batch out of images of different sizes (the error says: stack expects each tensor to be equal size, but got [3, 300, 535] at entry 0 and [3, 1080, 1920] at entry 23). When you use torch.utils.data.DataLoader with only batch_size and shuffle specified (all other args left at their defaults), the default sampler and collate_fn are used. The default collate_fn tries to stack the data returned by the dataset's __getitem__ method. In your case __getitem__ returned images of different sizes ([3, 300, 535] and [3, 1080, 1920]), those were passed to torch.stack, and the error was thrown there.
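
    You can reproduce the failure in isolation with two dummy tensors of the shapes from your traceback:

    import torch

    a = torch.zeros(3, 300, 535)
    b = torch.zeros(3, 1080, 1920)
    # default_collate ends up doing exactly this; stacking requires identical shapes
    torch.stack([a, b], 0)  # RuntimeError: stack expects each tensor to be equal size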


    There are 2 common ways to deal with the multiple-input-size case:

    1. Create a proper transform pipeline which ensures that inputs of the same size are returned, for example by resizing and cropping (see the sketch right after this list).
    2. Use a custom batch sampler (passed to the DataLoader) to ensure that only images of the same size are loaded in a single batch.
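
    For the first option, here is a minimal sketch using torchvision transforms (the 224x224 target size is an assumption, pick whatever fits your data; since the question's __getitem__ loads images with read_image, the transforms below operate on uint8 tensors):

    import torch
    import torchvision.transforms as T

    transform = T.Compose([
        T.Resize(224),                       # scale so that the smaller side is 224
        T.CenterCrop(224),                   # crop the longer side to a fixed 224x224
        T.ConvertImageDtype(torch.float32),  # uint8 [0, 255] -> float32 [0, 1]
    ])

    train_dataset = dataset.csHeadBody(
        csv_file="images\\train_labels.csv",
        root_dir="images\\train",
        transform=transform,
    )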

    The second option is a bit harder to implement, but in my opinion it is better, since we neither add nor remove any information from the image (which is especially good for generative AI purposes). To implement it, we need to store the image size for each loaded filepath in the dataset class. The custom sampler then uses that information to create batches whose indices refer only to images of the same size.

    1. Dataset class
    from typing import Callable

    import imagesize
    import torchvision.datasets
    from PIL import Image
    from torch import Tensor
    
    class FilepathsDataset(torchvision.datasets.VisionDataset):
        def __init__(
            self,
            image_filepaths: list[str],
            transform: Callable,
        ):
            self.image_filepaths = image_filepaths
    
            imgsz_idxs = {}
            for idx, path in enumerate(image_filepaths):
                # imagesize package loads only the file headers so it works really fast
                width, height = imagesize.get(path) 
                size = (width, height)
                if size not in imgsz_idxs:
                    imgsz_idxs[size] = [idx]
                else:
                    imgsz_idxs[size].append(idx)
    
            self.imgsz_idxs = imgsz_idxs
            self.transform = transform

        def get_raw_data(self, idx: int) -> Image.Image:
            """Return raw image"""
            image_filepath = self.image_filepaths[idx]
            image = Image.open(image_filepath).convert("RGB")
            return image
    
        def __len__(self) -> int:
            return len(self.image_filepaths)
    
        def __getitem__(self, idx: int) -> tuple[Tensor, Tensor]:
            """Return transformed image and mask"""
            target = <some_target> # TODO: load your target here
            image = self.get_raw_data(idx)
            image = self.transform(image)
            return image, target
    
    2. Custom Sampler class

    This sampler shuffles the batches and, by default, drops the last incomplete batch for each resolution (to make sure all batches have the same number of samples).

    import copy
    import random
    from typing import Iterator

    from torch.utils.data import Sampler
    
    class SameResolutionSampler(Sampler):
        def __init__(self, batch_size: int, imgsz_idxs: dict[tuple[int, int], list[int]]):
            self.imgsz_idxs = imgsz_idxs
            self.batch_size = batch_size
    
        def __iter__(self) -> Iterator[list[int]]:
            imgsz_idxs = copy.deepcopy(self.imgsz_idxs)
    
            batches = []
            for size in imgsz_idxs:
                while len(imgsz_idxs[size]) >= self.batch_size:  # drop last
                    batch_idxs = []
                    for i in range(self.batch_size):
                        idx = imgsz_idxs[size].pop()
                        batch_idxs.append(idx)
                    batches.append(batch_idxs)
            random.shuffle(batches)
            return iter(batches)
    
        def __len__(self):
            batches_per_resolution = [
                len(idxs) // self.batch_size for idxs in self.imgsz_idxs.values()
            ]
            return int(sum(batches_per_resolution))
    
    3. Create DataLoader instance
    filepaths = _  # TODO: paths to your images 
    transform = _  # TODO: your transforms
    batch_size = _  # TODO: your batch_size
    
    ds = FilepathsDataset(filepaths, transform)
    sampler = SameResolutionSampler(batch_size, ds.imgsz_idxs)
    dataloader = DataLoader(ds, batch_sampler=sampler)
    

    NOTE: Also, if you use the second option (custom sampler) and you train a convolutional neural network with a linear head, you must check whether the network works with different input sizes, or add adaptive pooling before the linear layers to ensure that the flattened feature size is always the same (see the sketch below).
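
    For example, here is a minimal sketch of the question's Net with nn.AdaptiveAvgPool2d inserted before flattening (the conv channel widths are assumptions, chosen so that conv1's output matches conv2's input, which it does not in the original code):

    import torch
    import torch.nn as nn
    import torch.nn.functional as F

    class Net(nn.Module):
        def __init__(self):
            super().__init__()
            self.conv1 = nn.Conv2d(3, 6, 5)
            self.pool = nn.MaxPool2d(2, 2)
            self.conv2 = nn.Conv2d(6, 16, 5)
            # Collapse whatever spatial size remains to a fixed 5x5 grid,
            # so the linear head sees 16 * 5 * 5 features for any input size.
            self.adaptive_pool = nn.AdaptiveAvgPool2d((5, 5))
            self.fc1 = nn.Linear(16 * 5 * 5, 120)
            self.fc2 = nn.Linear(120, 84)
            self.fc3 = nn.Linear(84, 10)

        def forward(self, x):
            x = self.pool(F.relu(self.conv1(x)))
            x = self.pool(F.relu(self.conv2(x)))
            x = self.adaptive_pool(x)
            x = torch.flatten(x, 1)  # flatten all dimensions except batch
            x = F.relu(self.fc1(x))
            x = F.relu(self.fc2(x))
            return self.fc3(x)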