I started switching from Keras to PyTorch and played around with a simple feedforward network today. It is supposed to learn the squaring operation, i.e. f(x) = x^2. However, my network only learns reasonably well if I set the batch size to 1; any other batch size yields very poor results. I also tried different learning rates between 1 and 0.0001 to see if that would fix it, and tested a few changes to the network, but to no avail. Could anyone tell me what I am doing wrong, i.e. why does my network not learn once I set the batch size to any value above 1? Find a minimal working example below. Thank you for your help!
import numpy as np
from random import randint
import random
import time
from multiprocessing import Pool
import torch
from torch import nn
from torch.utils.data import Dataset, DataLoader
from torchvision import datasets, transforms
class SquareDataset(Dataset):
    """Synthetic regression dataset of ``(x, x ** 2)`` pairs.

    Each input ``x`` is drawn as ``random.random() * randint(1, 10)``, so it
    lies in ``[0, 10)``; the matching target is its square. Samples are
    generated once, at construction time, using the global ``random`` state.

    Args:
        num_samples: Number of ``(input, target)`` pairs to generate.
    """

    def __init__(self, num_samples):
        # Zero-argument super() starts the MRO lookup right after
        # SquareDataset, i.e. at Dataset. The former super(Dataset, self)
        # started *after* Dataset and therefore skipped it entirely.
        super().__init__()
        self.num_samples = num_samples
        # Inputs first, then targets derived from them. The RNG is consumed
        # in the same order as before: random() then randint() per sample.
        self.train = [random.random() * randint(1, 10) for _ in range(num_samples)]
        self.target = [x ** 2 for x in self.train]

    def __len__(self):
        """Return the number of generated samples."""
        return self.num_samples

    def __getitem__(self, index):
        """Return the ``(input, target)`` pair of plain floats at ``index``."""
        return self.train[index], self.target[index]
def trainNetwork(epochs=50, batch_size=1):
    """Train a small MLP to approximate f(x) = x ** 2 and print its progress.

    Builds a 1000-sample ``SquareDataset``, fits a 1-32-32-1 ReLU network with
    Adam (lr=0.001) under an MSE loss, prints the mean per-batch training loss
    after every epoch, and finally prints the predictions for 0.2 and 1.

    Args:
        epochs: Number of full passes over the training data.
        batch_size: Number of samples per optimisation step. Defaults to 1,
            the value that used to be hard-coded.

    Returns:
        None. All results are reported via ``print``.
    """
    data_train = SquareDataset(num_samples=1000)
    data_train_loader = DataLoader(data_train, batch_size=batch_size, shuffle=False)

    model = nn.Sequential(nn.Linear(1, 32),
                          nn.ReLU(),
                          nn.Linear(32, 32),
                          nn.ReLU(),
                          nn.Linear(32, 1))

    # Define the loss
    criterion = nn.MSELoss()
    # Optimizers require the parameters to optimize and a learning rate
    optimizer = torch.optim.Adam(model.parameters(), lr=0.001)

    for _ in range(epochs):
        running_loss = 0.0
        for number, labels in data_train_loader:
            optimizer.zero_grad()
            # Give the inputs an explicit feature axis: one row per sample,
            # as expected by nn.Linear(1, ...).
            inputs = number.float().view(number.size(0), -1)
            output = model(inputs)
            # The model emits one column per sample while the labels are a
            # flat vector. Without dropping the trailing axis, MSELoss
            # broadcasts the pair to (batch, batch) and compares every
            # prediction with every label, which only happens to be correct
            # when the batch holds a single sample.
            loss = criterion(output.squeeze(-1), labels.float())
            loss.backward()
            optimizer.step()
            running_loss += loss.item()
        # Reported once per epoch, after all batches have been processed.
        print(f"Training loss: {running_loss/len(data_train_loader)}")

    # some test outputs (inference only, so no gradient tracking is needed)
    with torch.no_grad():
        for value in (0.2, 1.0):
            out = model(torch.tensor([value]))
            print("Out:")
            print(out.item())
# Script entry point: run the training demo with its default arguments.
trainNetwork()
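On the line `loss = criterion(output, labels.float())`, the first tensor, `output`, has shape `(batch_size, 1)`, while `labels` has shape `(batch_size,)`. Hence, when `batch_size > 1`, broadcasting occurs: both tensors are expanded to shape `(batch_size, batch_size)`, so every prediction is compared against every label in the batch, and this leads to a wrong objective (a case similar to this). To overcome the issue, rewrite the loss line so that both arguments have equal shapes, like:
loss = criterion(output.squeeze(-1), labels.float())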