During training it takes ages to load one batch of data. What can cause this problem? I am new to PyTorch; I had been working with TensorFlow for a while, and this is my first attempt to create something like this. I wrote a custom dataset which gets its images from folders; the file paths and labels are stored in a DataFrame, which is then split into train, val and test sets.
class CustomDataset(torch.utils.data.Dataset):
    """Image dataset built from a folder tree, using folder names as labels.

    Every file found under ``root`` becomes one sample: ``x`` is the absolute
    file path and ``y`` is the name of the folder that directly contains it.
    The samples are sorted by path and cut into contiguous train / val / test
    ranges; the instance keeps only the range selected by ``split``.

    NOTE(review): because the rows are sorted by path before being cut, each
    split is a contiguous run of files and the splits are not stratified by
    label. Shuffle before splitting if balanced splits are required.

    Args:
        root: Directory that is walked recursively for sample files.
        split: Which part to expose: ``'train'``, ``'val'`` or ``'test'``.
        train_ratio: Fraction of all samples assigned to the train split.
        val_ratio: Fraction of all samples assigned to the val split; the
            remainder goes to the test split.
        transform: Optional callable applied to each loaded image.

    Raises:
        ValueError: If ``split`` is not one of the three supported names.
    """

    def __init__(self, root, split, train_ratio=0.85, val_ratio=0.1, transform=None):
        self.root = root
        self.train_ratio = train_ratio
        self.val_ratio = val_ratio
        self.test_ratio = 1 - (self.train_ratio + self.val_ratio)
        df = self.folder2pandas()
        self.split = split
        self.data = self.splitDf(df)
        self.transform = transform

    def __len__(self):
        """Return the number of samples in the selected split."""
        return len(self.data)

    def __getitem__(self, idx):
        """Load the image at position ``idx`` and return ``(image, label)``.

        Raises:
            RuntimeError: If ``cv2.imread`` cannot read the file.
        """
        path = self.data.iat[idx, 0]
        label = self.data.iat[idx, 1]
        image = cv2.imread(path)
        if image is None:
            # cv2.imread signals failure by returning None instead of raising.
            raise RuntimeError(f"cv2.imread could not read image: {path}")
        # The constructor stores the callable as ``self.transform``.
        if self.transform is not None:
            image = self.transform(image)
        return image, label

    def folder2pandas(self):
        """Walk ``self.root`` and return a DataFrame with columns ``x`` and ``y``.

        ``x`` is the absolute path of each file; ``y`` is the name of the
        folder that contains it.
        """
        tuples = []
        for folder, _subs, files in os.walk(self.root):
            # basename of the normalised path works with any path separator
            # and ignores a trailing slash on the root.
            label = os.path.basename(os.path.normpath(folder))
            for filename in files:
                path = os.path.abspath(os.path.join(folder, filename))
                tuples.append((path, label))
        return pd.DataFrame(tuples, columns=["x", "y"])

    def splitDf(self, df):
        """Return the rows of ``df`` that belong to ``self.split``.

        Rows are sorted by path, then cut into three contiguous ranges whose
        sizes follow ``train_ratio`` and ``val_ratio``; the test range takes
        whatever is left.

        Raises:
            ValueError: If ``self.split`` is not 'train', 'val' or 'test'.
        """
        df = df.sort_values(by=["x"], ascending=True).reset_index(drop=True)
        total = len(df)
        train_end = int(self.train_ratio * total)
        val_end = train_end + int(self.val_ratio * total)
        bounds = {
            "train": (0, train_end),
            "val": (train_end, val_end),
            "test": (val_end, total),
        }
        if self.split not in bounds:
            raise ValueError(
                f"split must be one of {sorted(bounds)}, got {self.split!r}"
            )
        start, stop = bounds[self.split]
        return df.iloc[start:stop]
# Training pipeline: colour jitter and Gaussian blur for augmentation, then
# normalisation. `res` is defined outside this snippet; its two entries are
# passed to Normalize as mean and std.
# NOTE(review): ColorJitter and Normalize are documented for PIL images or
# tensors, while the dataset loads images with cv2.imread. A conversion step
# such as transforms.ToTensor() is probably needed first; confirm against how
# `res` was computed.
train_transforms = transforms.Compose([
    transforms.ColorJitter(brightness=0.3, contrast=0.5, saturation=0.1, hue=0.1),
    transforms.GaussianBlur(kernel_size=(5, 5), sigma=(0.1, 2.0)),
    transforms.Normalize(res[0].numpy(), res[1].numpy()),
])
# Validation pipeline: normalisation only, no augmentation. `res` is defined
# outside this snippet; its two entries are passed to Normalize as mean and std.
val_transforms = transforms.Compose([
    transforms.Normalize(res[0].numpy(), res[1].numpy()),
])
Initializing the datasets:
In the 'resources' folder there are two folders whose names represent the labels (binary classification).
# One dataset per split, both reading from the same root folder.
train_set = CustomDataset(
    root="resources/",
    split="train",
    transform=train_transforms,
)
val_set = CustomDataset(
    root="resources/",
    split="val",
    transform=val_transforms,
)
Passing the datasets to the DataLoaders:
# Both loaders shuffle, use batches of 32 and start 4 worker processes.
trainloader = torch.utils.data.DataLoader(
    train_set,
    batch_size=32,
    shuffle=True,
    num_workers=4,
)
testloader = torch.utils.data.DataLoader(
    val_set,
    batch_size=32,
    shuffle=True,
    num_workers=4,
)
Putting the solution from the comments in a cleaner way:
The creation of several workers was taking a large amount of time. It seems that on Windows the creation of worker processes can be unexpectedly slow.
As __getitem__() is not called, the problem is not in the data loading per se.
Try removing the num_workers argument:
# No num_workers argument: the loader is built with DataLoader's default.
testloader = torch.utils.data.DataLoader(
    val_set,
    batch_size=32,
    shuffle=True,
)
Then, if this works, try increasing num_workers gradually and check how the loading time behaves.