python-3.x, pytorch, torch

Pytorch reading tensors from file of tensors (stream training from disk)


I have some really big input tensors and I was running into memory issues while building them, so I write them out one by one to a .pt file as they are built. As the script that generates and saves them runs, the file gets bigger and bigger, so I assume the tensors are being saved correctly. Here is that code:

with open(a_sync_save, "ab") as f:
     print("saved")
     torch.save(torch.unsqueeze(torch.cat(tensors, dim=0), dim=0), f)

I want to read a certain number of these tensors from the file at a time, because I do not want to run into memory issues again. But when I try to read the tensors back from the file, I only manage to get the first one.

with open(a_sync_save, "rb") as f:
    for tensor in torch.load(f):
        print(tensor.shape)

The output here is the shape of the first tensor, and then the loop exits peacefully.


Solution

  • Here is some code that I used to answer this question. A lot of it is specific to what I am doing, but the gist of it can be used by others who are facing the same problem I was.

    import pickle

    import torch
    from torch import nn, optim

    # BATCH_SIZE is a constant defined elsewhere; make_dataset builds a DataLoader
    # for each batch (see the sketch after this function); the Model class is shown further down.
    def stream_training(filepath, epochs=100):
        """
        :param filepath: path to the pickle file of [tensor, expected_value] records
        :param epochs: number of epochs to run
        """
        def training(train_dataloader, model_obj, criterion, optimizer):
            for j, data in enumerate(train_dataloader, start=0):
                # get the inputs; data is a list of [inputs, labels]
                inputs, labels = data
                inputs, labels = inputs.cuda(), labels.cuda()
                outputs = model_obj(inputs.float())
                outputs = torch.flatten(outputs)
                loss = criterion(outputs, labels.float())
                print(loss)
                # zero the parameter gradients
                optimizer.zero_grad()
                loss.backward()
                torch.nn.utils.clip_grad_norm_(model_obj.parameters(), max_norm=1)
                optimizer.step()
    
        tensors = []
        expected_values = []
        model = Model(1000, 1, 256, 1)
        model.cuda()
        criterion = nn.BCELoss()
        optimizer = optim.Adam(model.parameters(), lr=0.00001, betas=(0.9, 0.99999), eps=1e-08, weight_decay=0.001,
                               amsgrad=True)
        for i in range(epochs):
            with open(filepath, 'rb') as openfile:
                while True:
                    try:
                        # each pickled record is a [tensor, expected_value] pair
                        data_list = pickle.load(openfile)
                        tensors.append(data_list[0])
                        expected_values.append(data_list[1])
                        if len(tensors) % BATCH_SIZE == 0:
                            tensors = torch.cat(tensors, dim=0)
                            tensors = torch.reshape(tensors, (tensors.shape[0], tensors.shape[1], -1))
                            train_loader = make_dataset(tensors, expected_values) # makes a dataloader for the batch that comes in
                            training(train_loader, model, criterion, optimizer)  #Performs forward and back prop
                            tensors = [] # washes out the batch to conserve memory on my computer.
                            expected_values = []
                    except EOFError:
                        print("This file has finished training")
                        break
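
    make_dataset just builds a DataLoader for the batch that was read in; it is not shown above. A minimal sketch of what such a helper could look like, assuming the labels are plain Python numbers and that each call serves exactly one batch:

    import torch
    from torch.utils.data import DataLoader, TensorDataset

    def make_dataset(batch_tensors, batch_labels):
        # Pair up the batch of inputs with their labels so the training
        # loop above can unpack each item as [inputs, labels].
        dataset = TensorDataset(batch_tensors, torch.tensor(batch_labels))
        return DataLoader(dataset, batch_size=BATCH_SIZE, shuffle=True)

    With batch_size=BATCH_SIZE the DataLoader yields the whole incoming batch in a single step, which matches how stream_training accumulates exactly BATCH_SIZE records before calling it.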
    

    Here is the model for fun.

    import torch
    from torch import nn
    import torch.nn.functional as F

    class Model(nn.Module):
        def __init__(self, input_size, output_size, hidden_dim, n_layers):
            super(Model, self).__init__()
            # dimensions
            self.hidden_dim = hidden_dim
            self.n_layers = n_layers
    
            #Define the layers
            #GRU
            self.gru = nn.GRU(input_size, hidden_dim, n_layers, batch_first=True)
            self.fc1 = nn.Linear(hidden_dim, hidden_dim)
            self.bn1 = nn.BatchNorm1d(num_features=hidden_dim)
            self.fc2 = nn.Linear(hidden_dim, hidden_dim)
            self.bn2 = nn.BatchNorm1d(num_features=hidden_dim)
            self.fc3 = nn.Linear(hidden_dim, hidden_dim)
            self.bn3 = nn.BatchNorm1d(num_features=hidden_dim)
            self.fc4 = nn.Linear(hidden_dim, hidden_dim)
            self.bn4 = nn.BatchNorm1d(num_features=hidden_dim)
            self.fc5 = nn.Linear(hidden_dim, hidden_dim)
            self.output = nn.Linear(hidden_dim, output_size)
    
        def forward(self, x):
            x = x.float()
            x = F.relu(self.gru(x)[1])
            x = x[-1,:,:] # hidden state of the last GRU layer (drops the layer dimension)
            # pass training=self.training so dropout is disabled in eval mode
            x = F.dropout(x, 0.5, training=self.training)
            x = F.relu(self.bn1(self.fc1(x)))
            x = F.dropout(x, 0.5, training=self.training)
            x = F.relu(self.bn2(self.fc2(x)))
            x = F.dropout(x, 0.5, training=self.training)
            x = F.relu(self.bn3(self.fc3(x)))
            x = F.dropout(x, 0.5, training=self.training)
            x = F.relu(self.bn4(self.fc4(x)))
            x = F.dropout(x, 0.5, training=self.training)
            x = F.relu(self.fc5(x))
            return torch.sigmoid(self.output(x))
    
        def init_hidden(self, batch_size):
            hidden = torch.zeros(self.n_layers, batch_size, self.hidden_dim)
            return hidden
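
    Finally, the reading loop above expects each pickled record to be a [tensor, expected_value] pair rather than the single tensors saved with torch.save in the question. A minimal sketch of a matching writer, reusing the question's variables (the label variable is a placeholder for whatever expected value goes with each tensor), followed by a call that streams the file through the model:

    import pickle

    import torch

    # Append one [tensor, label] record per example; pickle.load in
    # stream_training() reads them back one at a time until EOFError.
    with open(a_sync_save, "ab") as f:
        pickle.dump([torch.unsqueeze(torch.cat(tensors, dim=0), dim=0), label], f)

    # Later, stream the whole file through the model:
    stream_training(a_sync_save, epochs=100)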