Search code examples
python pytorch dataset dataloader

Setting random seed in Torch dataloader


I'm trying to get the torch dataloader to load the data in a specific order determined by the random seed 1. Here's my code:

import random
import torch.utils.data.dataset as Dataset
import torch.utils.data.dataloader as DataLoader
from torch.utils.data.sampler import Sampler


class MyDataset(Dataset.Dataset):
    """Toy dataset of ten samples where sample ``i`` has data ``i`` and label ``i``."""

    def __init__(self):
        # Data and labels are two separate lists, each holding 0..9.
        self.Data = list(range(10))
        self.Label = list(range(10))

    def __len__(self):
        """Return the number of stored samples."""
        return len(self.Data)

    def __getitem__(self, index):
        """Return the ``(data, label)`` pair stored at ``index``."""
        return self.Data[index], self.Label[index]

class RandSeqSampler(Sampler):
    """Sampler that yields every index of ``data_source`` once, in random order.

    Each pass builds a fresh ordered index list and shuffles it with the
    global ``random`` module state, so every epoch starts from ``0..len-1``.
    """

    def __init__(self, data_source):
        super().__init__(data_source)
        self.data_source = data_source

    def __len__(self):
        """Return the number of indices produced per pass."""
        return len(self.data_source)

    def __iter__(self):
        """Return an iterator over a newly built, newly shuffled index list."""
        sample_count = len(self.data_source)
        order = list(range(sample_count))
        random.shuffle(order)
        return iter(order)


# Seed Python's global RNG, which the sampler's shuffle draws from.
random.seed(1)
dataset = MyDataset()
sampler = RandSeqSampler(dataset)
dataloader = DataLoader.DataLoader(dataset=dataset, batch_size=1, sampler=sampler)

# First pass over the loader.
for data, label in dataloader:
    print(data, label)
print("\n\n\n\n\n")
# Second pass over the same loader; the sampler shuffles again.
for data, label in dataloader:
    print(data, label)

random.seed(1)
a = list(range(10))
# Shuffle the same list twice in a row: the second shuffle starts from the
# already-shuffled order rather than from 0..9.
for _ in range(2):
    random.shuffle(a)
    print(a)

The output is

tensor([6]) tensor([6])
tensor([8]) tensor([8])
tensor([9]) tensor([9])
tensor([7]) tensor([7])
tensor([5]) tensor([5])
tensor([3]) tensor([3])
tensor([0]) tensor([0])
tensor([4]) tensor([4])
tensor([1]) tensor([1])
tensor([2]) tensor([2])






tensor([4]) tensor([4])
tensor([8]) tensor([8])
tensor([2]) tensor([2])
tensor([6]) tensor([6])
tensor([5]) tensor([5])
tensor([9]) tensor([9])
tensor([0]) tensor([0])
tensor([7]) tensor([7])
tensor([1]) tensor([1])
tensor([3]) tensor([3])
[6, 8, 9, 7, 5, 3, 0, 4, 1, 2]
[5, 1, 9, 0, 3, 2, 6, 4, 8, 7]

You can see that the dataloader loads data in the same order as the shuffled order in the first iteration (both 6, 8, 9, 7, 5, 3, 0, 4, 1, 2), but the data loaded in the second iteration follows a different order than the shuffled order ([4,8,2,6,5,9,0,7,1,3] and [5, 1, 9, 0, 3, 2, 6, 4, 8, 7]). I would like the loaded data to follow the same order as the shuffled order, which means that instead of loading [4,8,2,6,5,9,0,7,1,3], I would like to load [5, 1, 9, 0, 3, 2, 6, 4, 8, 7]. Any ideas on how to achieve that? Any help is appreciated.


Solution

  • You get different results because you are running different code.

    In your sampling, you first create an ordered list of indices via indices = list(range(len(self.data_source))), then you shuffle them.

    With your list example, you shuffle the same list twice.

    In the dataloader, the first shuffle maps [0, 1, 2, ...] -> [6, 8, 9, ...]. The second shuffle maps [0, 1, 2, ...] -> [4, 8, 2, ...].

    In your list example, the first shuffle maps [0, 1, 2, ...] -> [6, 8, 9, ...]. The second shuffle maps [6, 8, 9, ...] -> [5, 1, 9, ...].

    The difference is due to the sampler shuffle always starting with an ordered list, while the list example shuffles a shuffled list a second time.

    You can reproduce the sampler results by starting with an ordered list:

    random.seed(1)
    for _ in range(2):
        # Rebuild the ordered list before every shuffle, as the sampler does.
        a = list(range(10))
        random.shuffle(a)
        print(a)
    
    > [6, 8, 9, 7, 5, 3, 0, 4, 1, 2]
    > [4, 8, 2, 6, 5, 9, 0, 7, 1, 3]
    

    Similarly, you can get the sampler to mimic the list example by keeping one persistent index list and reshuffling it in place each epoch, instead of resetting the index order every epoch:

    class RandSeqSampler(Sampler):
        """Sampler whose random order carries over from one epoch to the next.

        The index list is created once in ``__init__`` and reshuffled in place
        on every ``__iter__`` call, so each epoch's order is a shuffle of the
        previous epoch's order rather than of ``0..len-1``.
        """

        def __init__(self, data_source):
            super().__init__(data_source)
            self.data_source = data_source
            # Persistent index list: built once here, never reset afterwards.
            self.indices = list(range(len(self.data_source)))

        def __iter__(self):
            """Reshuffle the persistent index list and iterate over the result."""
            random.shuffle(self.indices)
            # Iterate over a snapshot: a later __iter__ call reshuffles
            # self.indices in place and would otherwise reorder an iterator
            # that is still being consumed.
            return iter(tuple(self.indices))

        def __len__(self):
            """Return the number of samples in ``data_source``."""
            return len(self.data_source)
    
    ...
    
    # Seed Python's global RNG, which the sampler's shuffle draws from.
    random.seed(1)
    dataset = MyDataset()
    sampler = RandSeqSampler(dataset)
    dataloader = DataLoader.DataLoader(dataset=dataset, batch_size=1, sampler=sampler)

    # First pass over the loader.
    for data, label in dataloader:
        print(data, label)
    print("\n")
    # Second pass over the same loader; the sampler shuffles again.
    for data, label in dataloader:
        print(data, label)
    
    tensor([6]) tensor([6])
    tensor([8]) tensor([8])
    tensor([9]) tensor([9])
    tensor([7]) tensor([7])
    tensor([5]) tensor([5])
    tensor([3]) tensor([3])
    tensor([0]) tensor([0])
    tensor([4]) tensor([4])
    tensor([1]) tensor([1])
    tensor([2]) tensor([2])
    
    
    tensor([5]) tensor([5])
    tensor([1]) tensor([1])
    tensor([9]) tensor([9])
    tensor([0]) tensor([0])
    tensor([3]) tensor([3])
    tensor([2]) tensor([2])
    tensor([6]) tensor([6])
    tensor([4]) tensor([4])
    tensor([8]) tensor([8])
    tensor([7]) tensor([7])
    

    Now that said, if you are relying on random seeds to produce the same results from different code, I can guarantee that will become a source of pain and misery and you probably shouldn't do that.