Search code examples
pytorchbatchsize

Network stops learning once batchsize is set to > 1


I started switching from Keras to Pytorch and played around with some simple feedforward network today. It is supposed to learn the squaring operation, i.e. f(x) = x^2. However, my network only learns reasonably if I set the batchsize to 1. Any other batchsize yields very poor results. I tried also different learning rates between 1 and 0.0001 to see if this somehow fixed it and also tested a few changes to the network but to no avail. Could anyone tell me what I am doing wrong, i.e. why does my network not learn once I set the batchsize to any value above 1? Find a minimal working example below. Thank you for your help!

import numpy as np
from random import randint
import random
import time
from multiprocessing import Pool
import torch
from torch import nn
from torch.utils.data import Dataset, DataLoader
from torchvision import datasets, transforms

class SquareDataset(Dataset):
     def __init__(self, num_samples):
         super(Dataset, self).__init__()
         self.num_samples = num_samples
         self.train  = [None] * num_samples
         self.target = [None] * num_samples
         
         for i in range(0, num_samples):
             self.train[i]  = random.random() * randint(1, 10)
             self.target[i] =  self.train[i] ** 2
             
     def __len__(self):
         return self.num_samples
        
     def __getitem__(self, index):
        return self.train[index], self.target[index]



def trainNetwork(epochs=50):
    data_train = SquareDataset(num_samples=1000)
    data_train_loader = DataLoader(data_train, batch_size=1, shuffle=False)

    model = nn.Sequential(nn.Linear(1, 32),
                      nn.ReLU(),
                      nn.Linear(32, 32),
                      nn.ReLU(),
                      nn.Linear(32, 1))
    # Define the loss
    criterion = nn.MSELoss()
    # Optimizers require the parameters to optimize and a learning rate
    optimizer = torch.optim.Adam(model.parameters(), lr=0.001)
    for e in range(epochs):
        running_loss = 0
        for number, labels in data_train_loader:
            optimizer.zero_grad()
            number = number.view(number.size(0), -1)
            output = model(number.float())
            loss = criterion(output, labels.float())
            loss.backward()
            optimizer.step()
            
            running_loss += loss.item()
        else:
            print(f"Training loss: {running_loss/len(data_train_loader)}")
    # some test outputs
    sample = torch.tensor([0.2])
    out = model(sample.float())
    print("Out:")
    print(out.item())
    sample = torch.tensor([1])
    out = model(sample.float())
    print("Out:")
    print(out.item())

trainNetwork()

Solution

  • On line loss = criterion(output, labels.float()) first tensor has shape (batch_size, 1) while labels has shape (batch_size, ). Hence when batch_size > 1 broadcasting occurs and this lead to wrong objective, case similar to this. To overcome issue rewrite loss line but with equal shapes, like:

    loss = criterion(output.squeeze(-1), labels.float())