I created a custom dataset for object detection named ReceiptDataset as below.
from torch.nn.utils.rnn import pad_sequence
import torch.nn.functional as F
class ReceiptDataset(torch.utils.data.Dataset):
    """Object-detection dataset of receipt images with box annotations.

    Each item is an ``(image, target)`` pair. The image is read with OpenCV,
    converted to RGB ``float32``, resized to ``(width, height)`` and scaled
    to ``[0, 1]``. ``target`` is a dict holding ``boxes`` (``[N, 4]`` in
    ``x_min, y_min, x_max, y_max`` order, rescaled to the resized image),
    ``labels``, ``image_id``, ``area`` and ``iscrowd`` tensors.

    Args:
        train_dir: Directory whose entries are the image files.
        width: Target image width after resizing.
        height: Target image height after resizing.
        labels: Mapping from image file name to an iterable of
            ``(x, y, box_width, box_height, label)`` annotations.
        transforms: Optional callable invoked as
            ``transforms(image=..., bboxes=..., labels=...)`` and returning a
            mapping with ``"image"`` and ``"bboxes"`` entries.

    Note:
        Label names are converted to indices through a module-level
        ``classes`` sequence that must be defined elsewhere in the file.
    """

    def __init__(self, train_dir, width, height, labels, transforms=None):
        self.images = os.listdir(train_dir)
        self.width = width
        self.height = height
        self.train_dir = train_dir
        self.labels = labels
        self.transforms = transforms

    def __getitem__(self, idx):
        """Return the resized image and its detection target for ``idx``.

        Raises:
            OSError: If OpenCV cannot read the image file.
        """
        img_name = self.images[idx]
        img_path = os.path.join(self.train_dir, img_name)

        img = cv2.imread(img_path)
        if img is None:
            # cv2.imread signals failure by returning None instead of raising.
            raise OSError(f"cv2.imread could not read image: {img_path!r}")

        img_rgb = cv2.cvtColor(img, cv2.COLOR_BGR2RGB).astype(np.float32)
        # The interpolation flag must be passed by keyword: the third
        # positional parameter of cv2.resize is ``dst``, not ``interpolation``.
        img_res = cv2.resize(
            img_rgb,
            (self.width, self.height),
            interpolation=cv2.INTER_AREA,
        )
        img_res /= 255.0

        # Original image size, used to rescale boxes to the resized image.
        ht, wt, _ = img.shape

        boxes = []
        lbls = []
        for x, y, box_wt, box_ht, lbl in self.labels[str(img_name)]:
            x_min, y_min = x, y
            x_max, y_max = x + box_wt, y + box_ht
            boxes.append([
                (x_min / wt) * self.width,
                (y_min / ht) * self.height,
                (x_max / wt) * self.width,
                (y_max / ht) * self.height,
            ])
            lbls.append(classes.index(str(lbl)))

        # reshape keeps a [0, 4] shape when an image has no annotations, so
        # the column indexing below still works.
        boxes = torch.as_tensor(boxes, dtype=torch.float32).reshape(-1, 4)
        lbls = torch.as_tensor(lbls, dtype=torch.int64)

        target = {
            "boxes": boxes,
            "labels": lbls,
            "image_id": torch.as_tensor(idx),
            "area": (boxes[:, 3] - boxes[:, 1]) * (boxes[:, 2] - boxes[:, 0]),
            "iscrowd": torch.zeros((boxes.shape[0],), dtype=torch.int64),
        }

        if self.transforms:
            # NOTE(review): the original call was cut off after ``bboxes``;
            # ``labels`` assumes the transform pipeline declares a label field
            # with that name - confirm against the transform definition.
            trans = self.transforms(
                image=img_res,
                bboxes=target["boxes"],
                labels=target["labels"],
            )
            img_res = trans["image"]
            target["boxes"] = torch.as_tensor(
                trans["bboxes"], dtype=torch.float32
            ).reshape(-1, 4)

        return img_res, target

    def __len__(self):
        """Return the number of entries found in ``train_dir``."""
        return len(self.images)
and I created an instance with:
# Build the training dataset from the image folder and the annotation mapping.
train_dataset = ReceiptDataset(
    "label-detector/images",
    width=width,
    height=height,
    labels=plabels,
)
and my training snippet is:
from engine import train_one_epoch, evaluate
# NOTE(review): the loop body is not shown in this snippet; presumably it calls
# train_one_epoch / evaluate imported above - confirm against the full script.
for epoch in range(num_epochs):
but anytime I run the training loop, I’m getting a runtime error:
RuntimeError: stack expects each tensor to be equal size, but got [11,4] at entry 0 and [9,4] at entry 1
There are 17 classes in total, and each image has a minimum of 4 annotations. The problem seems to come from the per-image annotation tensors in the dataset class: the number of boxes and labels (the [N, 4] shapes in the error) varies with the number of annotated items in an image, but I can’t figure out how to fix this.
Thank you!
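I solved it by implementing a custom collate function for the DataLoader. Instead of stacking the per-image tensors, which fails when images have different numbers of boxes, it returns each batch as a list of images and a list of target dicts, as needed by my model.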
def collate_fn_seq(batch):
    """Collate detection samples into parallel lists instead of stacked tensors.

    Each image may carry a different number of boxes, so the targets are kept
    as a list of per-image dicts rather than being stacked into one tensor.

    Args:
        batch: Sequence of ``(image, target)`` pairs, where ``image`` is an
            ``H x W x C`` NumPy array and ``target`` is a dict with the keys
            ``boxes``, ``labels``, ``image_id``, ``area`` and ``iscrowd``.

    Returns:
        A tuple ``(imgs, tars)``: ``imgs`` is a list of ``C x H x W`` tensors
        and ``tars`` is a list of target dicts, both in batch order.
    """
    target_keys = ("boxes", "labels", "image_id", "area", "iscrowd")

    imgs = []
    tars = []
    for image, target in batch:
        # Channels-last array -> channels-first tensor.
        imgs.append(torch.from_numpy(image).permute(2, 0, 1))
        # Copy only the expected keys into a fresh dict for each image.
        tars.append({key: target[key] for key in target_keys})
    return imgs, tars
and included it in my dataloaders using:
# Batch with the custom collate function so per-image targets stay in a list.
train_loader = torch.utils.data.DataLoader(
    dataset=train_dataset,
    batch_size=8,
    shuffle=True,
    collate_fn=collate_fn_seq,
)