I have a dataset directory containing hundreds of subdirectories, each named with a UUID. Inside each subdirectory there are four files: an image (PNG), an HTML file, a JSON file, and a TXT file.
The image, the HTML, and the TXT file form the sample, and the JSON file contains the corresponding label.
Here's the __getitem__() function of the Dataset subclass that I defined:
def __getitem__(self, idx):
    """Load the sample (html, url, optionally image) at position *idx*.

    Returns:
        (sample, label): sample is a dict with keys 'html' and 'url'
        (both str) and, when self.load_img is set, 'img' holding the
        result of cv2.imread on the sample's screenshot.
    """
    ID = self.list_IDs[idx]
    label = self.labels[ID]
    # Use context managers so the file handles are closed deterministically
    # instead of leaking until garbage collection (the original left both
    # open() handles unclosed).
    with open(os.path.join(DATA_PATH, ID, 'html_dirty.html'),
              encoding='utf-8') as f:
        html = f.read()
    with open(os.path.join(DATA_PATH, ID, 'url.txt'),
              encoding='utf-8') as f:
        url = f.read()
    sample = {'html': html, 'url': url}
    if self.load_img:
        # NOTE(review): cv2.imread returns None (no exception) when the file
        # is missing or unreadable; default_collate cannot batch None, so
        # consider checking the result here — confirm files always exist.
        sample['img'] = cv2.imread(os.path.join(DATA_PATH, ID, 'ss.png'))
    return sample, label
But when I run:
# Build the dataset over the training-split UUIDs and wrap it in a DataLoader.
x = CustomDataset(partitions['train'], labels) # partitions['train'] is just a list of UUIDs
train_generator = DataLoader(x, batch_size=32)
# Iterating triggers default_collate on each batch of 32 samples;
# this is where the TypeError below is raised.
for i, batch in enumerate(train_generator):
    print(i)
It errors out. Here's the full stack trace:
Traceback (most recent call last):
File "C:\Users\LENOVO\Desktop\Work\ml-project-1\data_loader.py", line 67, in <module>
for i, batch in train_generator:
File "C:\Users\LENOVO\Desktop\Work\ml-project-1\ml_proj_env\lib\site-packages\torch\utils\data\dataloader.py", line 628, in __next__
data = self._next_data()
File "C:\Users\LENOVO\Desktop\Work\ml-project-1\ml_proj_env\lib\site-packages\torch\utils\data\dataloader.py", line 671, in _next_data
data = self._dataset_fetcher.fetch(index) # may raise StopIteration
File "C:\Users\LENOVO\Desktop\Work\ml-project-1\ml_proj_env\lib\site-packages\torch\utils\data\_utils\fetch.py", line 61, in fetch
return self.collate_fn(data)
File "C:\Users\LENOVO\Desktop\Work\ml-project-1\ml_proj_env\lib\site-packages\torch\utils\data\_utils\collate.py", line 271, in default_collate
return collate(batch, collate_fn_map=default_collate_fn_map)
File "C:\Users\LENOVO\Desktop\Work\ml-project-1\ml_proj_env\lib\site-packages\torch\utils\data\_utils\collate.py", line 147, in collate
return [collate(samples, collate_fn_map=collate_fn_map) for samples in transposed]
File "C:\Users\LENOVO\Desktop\Work\ml-project-1\ml_proj_env\lib\site-packages\torch\utils\data\_utils\collate.py", line 147, in <listcomp>
return [collate(samples, collate_fn_map=collate_fn_map) for samples in transposed]
File "C:\Users\LENOVO\Desktop\Work\ml-project-1\ml_proj_env\lib\site-packages\torch\utils\data\_utils\collate.py", line 132, in collate
return {key: collate([d[key] for d in batch], collate_fn_map=collate_fn_map) for key in elem}
File "C:\Users\LENOVO\Desktop\Work\ml-project-1\ml_proj_env\lib\site-packages\torch\utils\data\_utils\collate.py", line 132, in <dictcomp>
return {key: collate([d[key] for d in batch], collate_fn_map=collate_fn_map) for key in elem}
File "C:\Users\LENOVO\Desktop\Work\ml-project-1\ml_proj_env\lib\site-packages\torch\utils\data\_utils\collate.py", line 155, in collate
raise TypeError(default_collate_err_msg_format.format(elem_type))
TypeError: default_collate: batch must contain tensors, numpy arrays, numbers, dicts or lists; found <class 'NoneType'>
Which is weird, because:
any([x.__getitem__(i) == None for i in range(32)])
returns False.
I resolved it. Apparently, some keys in the JSON files had None values, and default_collate doesn't accept None.
Interestingly, while trying to create an MRE, I came up with this:
from torch.utils.data import Dataset, DataLoader
class testdataset(Dataset):
    """Minimal dataset reproducing default_collate's dict-batching behaviour.

    Each item is (sample_dict, label_dict). The sample dicts all share the
    keys 'a' and 'b', but every label dict has a *different* key, which
    default_collate cannot batch (it looks up the first element's keys in
    every other element's dict).
    """

    def __init__(self):
        # Ten sample dicts with identical keys, plus ten label dicts
        # whose single key differs per item.
        self.data = [{'a': str(i), 'b': i + 1} for i in range(10)]
        self.labels = [{str(i): i} for i in range(10)]

    def __len__(self):
        # Derive the length from the data instead of hard-coding 10,
        # so the class stays consistent if the ranges above change.
        return len(self.data)

    def __getitem__(self, idx):
        return self.data[idx], self.labels[idx]
test = testdataset()
x = DataLoader(test, batch_size=2)
# default_collate batches dicts by iterating the FIRST element's keys and
# looking them up in every other element; since each label dict here has a
# different key, the lookup fails (KeyError: '0' below).
for i, batch in enumerate(x):
    print(i)
Which ran into:
Traceback (most recent call last):
File "C:\Users\LENOVO\Desktop\Work\ml-project-1\test.py", line 19, in <module>
for i, batch in enumerate(x):
File "C:\Users\LENOVO\Desktop\Work\ml-project-1\ml_proj_env\lib\site-packages\torch\utils\data\dataloader.py", line 628, in __next__
data = self._next_data()
File "C:\Users\LENOVO\Desktop\Work\ml-project-1\ml_proj_env\lib\site-packages\torch\utils\data\dataloader.py", line 671, in _next_data
data = self._dataset_fetcher.fetch(index) # may raise StopIteration
File "C:\Users\LENOVO\Desktop\Work\ml-project-1\ml_proj_env\lib\site-packages\torch\utils\data\_utils\fetch.py", line 61, in fetch
return self.collate_fn(data)
File "C:\Users\LENOVO\Desktop\Work\ml-project-1\ml_proj_env\lib\site-packages\torch\utils\data\_utils\collate.py", line 271, in default_collate
return collate(batch, collate_fn_map=default_collate_fn_map)
File "C:\Users\LENOVO\Desktop\Work\ml-project-1\ml_proj_env\lib\site-packages\torch\utils\data\_utils\collate.py", line 147, in collate
return [collate(samples, collate_fn_map=collate_fn_map) for samples in transposed]
File "C:\Users\LENOVO\Desktop\Work\ml-project-1\ml_proj_env\lib\site-packages\torch\utils\data\_utils\collate.py", line 147, in <listcomp>
return [collate(samples, collate_fn_map=collate_fn_map) for samples in transposed]
File "C:\Users\LENOVO\Desktop\Work\ml-project-1\ml_proj_env\lib\site-packages\torch\utils\data\_utils\collate.py", line 129, in collate
return elem_type({key: collate([d[key] for d in batch], collate_fn_map=collate_fn_map) for key in elem})
File "C:\Users\LENOVO\Desktop\Work\ml-project-1\ml_proj_env\lib\site-packages\torch\utils\data\_utils\collate.py", line 129, in <dictcomp>
return elem_type({key: collate([d[key] for d in batch], collate_fn_map=collate_fn_map) for key in elem})
File "C:\Users\LENOVO\Desktop\Work\ml-project-1\ml_proj_env\lib\site-packages\torch\utils\data\_utils\collate.py", line 129, in <listcomp>
return elem_type({key: collate([d[key] for d in batch], collate_fn_map=collate_fn_map) for key in elem})
KeyError: '0'
Changing the labels to be a list like so:
self.labels = [[str(i), i+1] for i in range(10)]
Resolved the issue. I was almost misled into thinking that labels are expected to be lists; still, playing around with the label structure is what eventually led me to the solution.