python pytorch dataloader pytorch-dataloader

Pytorch DataLoader for custom dataset

I am trying to write a custom data loader for a dataset whose directory structure is as follows:

All_data
|
->Numpy_dat
| |
|  -> dat_0
|      -> dat_{0}_{0}.npy
|      .
|      .
| -> dat_1
|      -> dat_{0}_{0}.npy
|      -> dat_{0}_{1}.npy
|      .
|      .
|->mask_numpy
  |
  -> mask_0
     -> mask_{0}_{0}.npy
     -> mask_{0}_{1}.npy
     .
     .
  -> mask_1
     -> mask_{0}_{0}.npy
     -> mask_{0}_{1}.npy
     .
     .

I am new to PyTorch and am finding the tutorials difficult to follow, in particular how to implement the `__getitem__` method of the dataset used by the loader.

Solution

This would look like this:

import numpy
import torch
from torch.utils.data import Dataset
from pathlib import Path


class MyDataset(Dataset):
    """Dataset of (patch, mask) tensor pairs stored as ``.npy`` files.

    Expected layout under ``folder``::

        folder/Numpy_dat/dat_z/dat_x_y.npy
        folder/mask_numpy/mask_z/mask_x_y.npy

    Each patch is paired with the mask that has the same ``z`` sub-folder id
    and the same ``x_y`` file id.
    """

    def __init__(self, folder: str) -> None:
        """Collect every patch file found below ``folder``.

        Args:
            folder: Root directory of the dataset (``~`` is expanded).
        """
        super().__init__()

        self.folder = Path(folder).expanduser().resolve()
        # `Path.glob` returns a one-shot generator, which cannot be indexed in
        # `__getitem__`, so it is materialized here. Sorting makes the
        # index -> file mapping stable across runs.
        self.patches = sorted(self.folder.glob("**/dat_*.npy"))

    def __len__(self) -> int:
        """Return the number of patches found in the dataset folder."""
        return len(self.patches)

    def __getitem__(self, index: int) -> tuple[torch.Tensor, torch.Tensor]:
        """Load the patch at ``index`` together with its mask.

        Args:
            index: Position of the patch in the sorted list of patch files.

        Returns:
            A ``(patch, mask)`` pair of tensors built from the Numpy arrays.

        Raises:
            IndexError: If ``index`` is out of range. This is also what ends a
                plain ``for patch, mask in dataset`` loop.
        """
        patch_path = self.patches[index]  # `folder/Numpy_dat/dat_z/dat_x_y.npy`

        patch_id = patch_path.name.split("_", maxsplit=1)[1]  # 'x_y.npy'
        sub_folder_id = patch_path.parent.name.split("_", maxsplit=1)[1]  # 'z'

        # `folder/mask_numpy/mask_z/mask_x_y.npy`
        mask_path = self.folder / Path(
            "mask_numpy", f"mask_{sub_folder_id}", f"mask_{patch_id}"
        )

        patch = numpy.load(patch_path)
        mask = numpy.load(mask_path)

        return torch.from_numpy(patch), torch.from_numpy(mask)


def main():
    """Print the patch and mask shapes of every sample in ``All_data``."""
    samples = MyDataset(folder="All_data")

    for pair in samples:
        image, target = pair
        print(image.shape, target.shape)


# Run the demo only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()

The key part is to first list all the patch filenames and store them in the dataset object. Then, you implement the `__getitem__` method, which returns a pair of (patch, mask) tensors. The bulk of the code derives the correct mask filename from the patch name. Finally, you read the NumPy arrays and convert them to PyTorch tensors before returning them.

I suggest you improve your dataset's directory structure. You do not need the sub-folders if the image id (the number identifying the image the patch corresponds to) is part of the filename. From there, if you simplify further and use the same name for the patch and the mask, such as:

data
  patches
    patch_0_0.npy
    patch_0_1.npy
    patch_1_0.npy
    patch_1_1.npy
    ...
  masks
    patch_0_0.npy
    patch_0_1.npy
    patch_1_0.npy
    patch_1_1.npy
    ...

then, this greatly simplifies the dataset logic:

class MyDataset(Dataset):
    """Dataset of (patch, mask) tensor pairs sharing the same file name.

    Expected layout under ``folder``::

        folder/patches/<name>.npy
        folder/masks/<name>.npy
    """

    def __init__(self, folder: str) -> None:
        """Collect every ``.npy`` patch file in ``folder/patches``.

        Args:
            folder: Root directory of the dataset (``~`` is expanded).
        """
        super().__init__()

        self.patches_folder = Path(folder, "patches").expanduser().resolve()
        self.masks_folder = Path(folder, "masks").expanduser().resolve()

        # Glob order is unspecified; sorting keeps the index -> file mapping
        # stable across runs and machines.
        self.patches = sorted(self.patches_folder.glob("*.npy"))

    def __len__(self) -> int:
        """Return the number of patches found in the patches folder."""
        return len(self.patches)

    def __getitem__(self, index: int) -> tuple[torch.Tensor, torch.Tensor]:
        """Load the patch at ``index`` and the mask with the same file name.

        Args:
            index: Position of the patch in the sorted list of patch files.

        Returns:
            A ``(patch, mask)`` pair of tensors built from the Numpy arrays.

        Raises:
            IndexError: If ``index`` is out of range.
        """
        patch_path = self.patches[index]
        mask_path = self.masks_folder / patch_path.name

        patch = numpy.load(patch_path)
        mask = numpy.load(mask_path)

        return torch.from_numpy(patch), torch.from_numpy(mask)