pytorch computer-vision object-detection pytorch-dataloader

Object Detection - RuntimeError: stack expects each tensor to be equal size

I created a custom dataset for object detection named ReceiptDataset as below.

from torch.nn.utils.rnn import pad_sequence
import torch.nn.functional as F

class ReceiptDataset(torch.utils.data.Dataset):
  """Object-detection dataset pairing images with bounding-box annotations.

  Each sample is an ``(image, target)`` pair. ``image`` is the RGB image
  resized to ``(height, width)`` and scaled to ``[0, 1]`` as float32 (or
  whatever ``transforms`` returns under ``"image"``). ``target`` is a dict
  with the keys ``"boxes"``, ``"labels"``, ``"image_id"``, ``"area"`` and
  ``"iscrowd"``.

  The number of boxes differs from image to image, so samples cannot be
  stacked by the default DataLoader collate function; use a ``collate_fn``
  that keeps the targets in a list.
  """

  def __init__(self, train_dir, width, height, labels, transforms=None):
    """Store the configuration and list the entries of ``train_dir``.

    Args:
      train_dir: Directory whose entries are used as the dataset's images.
      width: Width (in pixels) every image is resized to.
      height: Height (in pixels) every image is resized to.
      labels: Mapping from image file name to an iterable of
        ``(x, y, box_width, box_height, label)`` annotations expressed in
        the coordinates of the original (un-resized) image.
      transforms: Optional callable invoked as
        ``transforms(image=..., bboxes=..., labels=...)`` that returns a
        mapping with at least ``"image"`` and ``"bboxes"``.
    """
    self.images = os.listdir(train_dir)
    self.width = width
    self.height = height
    self.train_dir = train_dir
    self.labels = labels
    self.transforms = transforms

  @staticmethod
  def _as_box_tensor(boxes):
    """Return ``boxes`` as a float32 tensor, shaped ``(0, 4)`` when empty."""
    tensor = torch.as_tensor(boxes, dtype=torch.float32)
    if tensor.numel() == 0:
      # An empty list converts to shape (0,); column indexing needs (0, 4).
      tensor = tensor.reshape(0, 4)
    return tensor

  @staticmethod
  def _box_stats(boxes):
    """Return ``(area, iscrowd)`` tensors for an ``(N, 4)`` box tensor."""
    area = (boxes[:, 3] - boxes[:, 1]) * (boxes[:, 2] - boxes[:, 0])
    iscrowd = torch.zeros((boxes.shape[0],), dtype=torch.int64)
    return area, iscrowd

  def _build_target(self, annot, orig_width, orig_height, idx):
    """Build the target dict for one image.

    Args:
      annot: Iterable of ``(x, y, box_width, box_height, label)`` items in
        original-image coordinates.
      orig_width: Width of the original image.
      orig_height: Height of the original image.
      idx: Dataset index, stored as ``"image_id"``.

    Returns:
      Dict with ``[x_min, y_min, x_max, y_max]`` boxes rescaled to the
      resized image, class indices looked up in the module-level
      ``classes`` list, and the derived ``area`` / ``iscrowd`` tensors.
    """
    boxes = []
    lbls = []
    for x, y, box_wt, box_ht, lbl in annot:
      # Convert (x, y, w, h) to corner format, then rescale each corner
      # from the original image size to the resized image size.
      boxes.append([
          (x / orig_width) * self.width,
          (y / orig_height) * self.height,
          ((x + box_wt) / orig_width) * self.width,
          ((y + box_ht) / orig_height) * self.height,
      ])
      lbls.append(classes.index(str(lbl)))

    boxes = self._as_box_tensor(boxes)
    area, iscrowd = self._box_stats(boxes)
    return {
        "boxes": boxes,
        "labels": torch.as_tensor(lbls, dtype=torch.int64),
        "image_id": torch.as_tensor(idx),
        "area": area,
        "iscrowd": iscrowd,
    }

  def __getitem__(self, idx):
    """Load, resize and annotate the image at position ``idx``.

    Returns:
      ``(image, target)`` as described in the class docstring.

    Raises:
      OSError: If ``cv2.imread`` cannot read the file.
    """
    img_name = self.images[idx]
    img_path = os.path.join(self.train_dir, img_name)

    img = cv2.imread(img_path)
    if img is None:
      # cv2.imread signals failure by returning None instead of raising.
      raise OSError(f"Could not read image: {img_path}")

    img_rgb = cv2.cvtColor(img, cv2.COLOR_BGR2RGB).astype(np.float32)
    # The third positional parameter of cv2.resize is ``dst``, so the
    # interpolation flag has to be passed by keyword to take effect.
    img_res = cv2.resize(img_rgb, (self.width, self.height),
                         interpolation=cv2.INTER_AREA)
    img_res /= 255.0

    ht, wt, _ = img.shape
    target = self._build_target(self.labels[str(img_name)], wt, ht, idx)

    if self.transforms:
      trans = self.transforms(image=img_res,
                              bboxes=target["boxes"],
                              labels=target["labels"])
      img_res = trans["image"]
      target["boxes"] = self._as_box_tensor(trans["bboxes"])
      # NOTE(review): assumes the transform reports surviving labels under
      # "labels" when it filters boxes (as an Albumentations-style pipeline
      # with label_fields would) - confirm for the pipeline in use.
      if "labels" in trans:
        target["labels"] = torch.as_tensor(trans["labels"], dtype=torch.int64)
      # Boxes may have moved or been dropped, so derived fields are rebuilt.
      target["area"], target["iscrowd"] = self._box_stats(target["boxes"])

    return img_res, target

  def __len__(self):
    """Return the number of entries found in ``train_dir``."""
    return len(self.images)

and I created an instance with:

# Build the training dataset; images are resized to (width, height) on load.
train_dataset = ReceiptDataset(
  train_dir="label-detector/images",
  width=width,
  height=height,
  labels=plabels,
)

and my training snippet is :

from engine import train_one_epoch, evaluate

# Per epoch: one training pass, one LR-scheduler step, then an evaluation
# pass over test_loader.
for epoch in range(num_epochs):
  train_one_epoch(model, optim, train_loader, device, epoch, print_freq=2)
  lr_scheduler.step()
  evaluate(model, test_loader, device)

but anytime I run the training loop, I’m getting a runtime error:

RuntimeError: stack expects each tensor to be equal size, but got [11,4] at entry 0 and [9,4] at entry 1

There are 17 classes in total, and each image has at least 4 annotations. The problem seems to come from the boxes/labels tensors returned by my dataset class: their size varies with the number of annotated items in an image (11 boxes in one image and 9 in another in the error above), but I can't figure out how to fix this.

Thank you!

Solution

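I solved it by implementing a custom collate function for the dataloader. The default collate function tries to `torch.stack` the per-image targets into a single tensor, which fails because each image has a different number of boxes. The custom function instead returns the batch as a list of image tensors and a list of target dicts, which is the format my model expects.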

def collate_fn_seq(batch):
  """Collate detection samples into lists instead of stacked tensors.

  Args:
    batch: Sequence of samples whose first element is a NumPy image array
      with three axes and whose second element is a target dict holding
      "boxes", "labels", "image_id", "area" and "iscrowd".

  Returns:
    ``(imgs, tars)``: a list of image tensors with the last axis moved to
    the front (H, W, C -> C, H, W), and a list of new target dicts limited
    to the five keys above. Nothing is stacked, so every sample may carry a
    different number of boxes.
  """
  wanted_keys = ("boxes", "labels", "image_id", "area", "iscrowd")

  # torch.from_numpy shares memory with the array; permute only reorders axes.
  imgs = [torch.from_numpy(sample[0]).permute(2, 0, 1) for sample in batch]

  # Rebuild each target so that only the expected keys are passed on.
  tars = [{key: sample[1][key] for key in wanted_keys} for sample in batch]

  return imgs, tars

and included it in my dataloaders using:

# collate_fn_seq returns lists, so samples with different box counts are
# never stacked into one tensor.
train_loader = torch.utils.data.DataLoader(
  train_dataset,
  batch_size=8,
  shuffle=True,
  collate_fn=collate_fn_seq,
)