I tried to implement simple gradient descent without using an optimizer, but the following MLP doesn't train: the loss always stays around 2.30. I can't figure out what's wrong with my code. Any advice on what I am doing wrong would be appreciated. (Sorry if the structure of the code is weird; it was written in a notebook.)
import torch
import numpy as np
import torchvision.datasets as datasets
from torchvision.transforms import ToTensor
# Download the raw training images once, only to measure their pixel statistics.
from torchvision.transforms import Compose, Normalize
raw_train = datasets.MNIST(root='.data', train=True, download=True)
# calculate mean and std on the [0, 1] scale that ToTensor() produces
pixels = raw_train.data.double() / 255
mean = pixels.mean().item()
std = pixels.std().item()
print(f"Mean: {mean}, std: {std}")
# standardize data inside the transform pipeline rather than by overwriting
# `.data`: MNIST.__getitem__ builds an 8-bit PIL image from `.data`, so a float
# tensor stored there is misread and the samples come out corrupted.
# The training-set statistics are reused for the test set on purpose.
preprocess = Compose([ToTensor(), Normalize((mean,), (std,))])
train = datasets.MNIST(root='.data', train=True, download=True, transform=preprocess)
test = datasets.MNIST(root='.data', train=False, download=True, transform=preprocess)
from torch.utils.data import DataLoader
# Two shuffled views of the training set: one sample per step, and
# mini-batches of 256 samples per step.
train_loader1 = DataLoader(dataset=train, batch_size=1, shuffle=True)
train_loader2 = DataLoader(dataset=train, batch_size=256, shuffle=True)
# Use the GPU when PyTorch can see one, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"
print(f"Device: {device}")
import torch.nn as nn
class SimpleNN(nn.Module):
    """Two-layer perceptron: 784 inputs -> 784 tanh units -> 10 softmax outputs.

    NOTE(review): ``forward`` returns softmax probabilities. When the
    ``loss_fn`` given to ``train_single_epoch`` is ``nn.CrossEntropyLoss``
    (which applies log-softmax to its input itself), the softmax is applied
    twice; the loss then stays close to ln(10) ~= 2.30 and learning is very
    slow. The behaviour is left as written here.
    """

    def __init__(self):
        super().__init__()
        # turns each 28x28 image into a 784-feature vector
        self.flatten = nn.Flatten()
        # hidden layer: 784 features in, 784 neurons out
        self.in_to_hidden = nn.Linear(784, 784)
        # output layer: 784 hidden activations in, 10 class scores out
        self.hidden_to_out = nn.Linear(784, 10)
        # normalises the 10 scores of every sample into a probability distribution
        self.softmax = nn.Softmax(dim=1)

    def forward(self, x):
        """Return the per-class probabilities for a batch of images."""
        flat = self.flatten(x)
        hidden = torch.tanh(self.in_to_hidden(flat))
        scores = self.hidden_to_out(hidden)
        return self.softmax(scores)

    def train_single_epoch(self, data_loader, loss_fn, lr, device):
        """Run one pass of hand-written gradient descent over ``data_loader``.

        Args:
            data_loader: iterable yielding ``(inputs, targets)`` batches.
            loss_fn: callable ``loss_fn(predictions, targets)`` returning a
                scalar tensor.
            lr: step size applied to every parameter.
            device: device the batches are moved to before the forward pass.

        Prints the loss of the last batch only, once the pass is finished.
        """
        for inputs, targets in data_loader:
            inputs = inputs.to(device)
            targets = targets.to(device)
            # forward pass and loss of the current batch
            batch_loss = loss_fn(self(inputs), targets)
            # backpropagate, then take one descent step per parameter
            batch_loss.backward()
            with torch.no_grad():  # the update itself must not be tracked by autograd
                for param in self.parameters():
                    param.sub_(lr * param.grad)
                    # clear the gradient so the next batch starts from zero
                    param.grad.zero_()
        print(f"loss: {batch_loss.item()}")
# Two passes over the mini-batch loader with plain gradient descent (lr=0.001).
net = SimpleNN().to(device)
criterion = nn.CrossEntropyLoss()
for _ in range(2):
    net.train_single_epoch(train_loader2, criterion, 0.001, device)
My first observation is that you're printing the wrong loss value. You are printing the loss of the last batch at the end of each epoch, which doesn't represent the model's performance over that epoch.
The right way is to accumulate the batch losses and then compute their average across all batches at the end of the epoch.
I modified your train_single_epoch method to print the right epoch loss. I also added an attribute, self.list_losses, to record the losses so they can be plotted later:
class SimpleNN(nn.Module):
    """MLP for MNIST (784 -> 784 tanh -> 10) trained with hand-written gradient descent.

    ``forward`` returns raw, unnormalised logits. ``nn.CrossEntropyLoss``
    applies log-softmax to its input itself, so feeding it softmax
    probabilities squashes the gradients and makes the loss hover near
    ln(10) ~= 2.30. Use ``predict_proba`` when probabilities are needed.
    """

    def __init__(self):
        super().__init__()
        self.flatten = nn.Flatten()  # flatten the 28x28 images to 784 features vectors
        self.in_to_hidden = nn.Linear(784, 784)  # input to 784 neurons hidden layer
        self.hidden_to_out = nn.Linear(784, 10)  # 784 to 10 neuron output layer
        # Kept so logits can be turned into probabilities at inference time
        # (see predict_proba); deliberately NOT applied inside forward().
        self.softmax = nn.Softmax(dim=1)
        self.list_losses = []  # mean training loss of every epoch run so far

    def forward(self, x):
        """Return the 10 raw class scores (logits) for each image in ``x``."""
        x = self.flatten(x)
        x = torch.tanh(self.in_to_hidden(x))
        return self.hidden_to_out(x)

    def predict_proba(self, x):
        """Return per-class probabilities for ``x`` without tracking gradients."""
        with torch.no_grad():
            return self.softmax(self(x))

    def train_single_epoch(self, epoch, data_loader, loss_fn, lr, device):
        """Run one pass of manual gradient descent and record the mean loss.

        Args:
            epoch: zero-based epoch index, used only in the progress message.
            data_loader: iterable yielding ``(inputs, targets)`` batches.
            loss_fn: callable ``loss_fn(logits, targets)`` returning a scalar
                tensor; assumed to average over the batch (the default of
                ``nn.CrossEntropyLoss``) so that weighting by batch size
                gives a per-sample mean.
            lr: step size applied to every parameter.
            device: device the batches are moved to before the forward pass.

        Returns:
            The sample-weighted mean loss of the epoch, which is also
            appended to ``self.list_losses``.

        Raises:
            ValueError: if ``data_loader`` yields no samples.
        """
        total_loss = 0.0
        sample_counts = 0
        for x, y in data_loader:
            x, y = x.to(device), y.to(device)
            # calculate loss
            pred = self(x)
            loss = loss_fn(pred, y)
            # weight by batch size so a smaller final batch does not skew the mean
            batch_size = y.shape[0]
            total_loss += loss.item() * batch_size
            sample_counts += batch_size
            # backpropagate error and update weights
            loss.backward()
            with torch.no_grad():  # the gradient mustn't be calculated for the weight updates
                for weights in self.parameters():
                    if weights.grad is None:
                        # frozen or unused parameter: nothing to apply
                        continue
                    weights -= lr * weights.grad
                    weights.grad.zero_()
        if sample_counts == 0:
            raise ValueError("data_loader yielded no samples")
        epoch_loss = total_loss / sample_counts
        self.list_losses.append(epoch_loss)
        print(f"[Epoch]: {epoch+1} \t----\t [loss]: {epoch_loss}")
        return epoch_loss
Then I train for 100 epochs and plot how the loss evolves across epochs:
# Train a fresh model for 100 epochs at lr=0.001; one mean loss is recorded per epoch.
net = SimpleNN().to(device)
criterion = nn.CrossEntropyLoss()
for epoch in range(100):
    net.train_single_epoch(epoch, train_loader2, criterion, 0.001, device)
import matplotlib.pyplot as plt
# x axis: epoch index, y axis: mean training loss of that epoch
epoch_axis = range(len(net.list_losses))
plt.plot(epoch_axis, net.list_losses)
plt.show()
As you can see from the plot above, the loss is decreasing; quite slowly, but the model is learning.
The model you used here is a toy one (I guess for learning purposes), which could partly explain the slow learning we are observing. Note also that nn.CrossEntropyLoss expects raw, unnormalized logits and applies log-softmax itself, so passing it the output of nn.Softmax, as the forward method in your question does, flattens the gradients and slows training further.
There are lots of things you can do to improve the learning performance of a neural network (increase the model size in terms of depth and layer width, change the activation function from tanh to ReLU, use a convolutional network, change the learning rate, ...), but that is beyond the scope of a Stack Overflow post.
As an example of an improvement, I changed the learning rate from 0.001 to 0.01, and you can see that the model now learns faster:
# Same experiment with a ten times larger learning rate (0.01) on a fresh model.
net2 = SimpleNN().to(device)
criterion2 = nn.CrossEntropyLoss()
for epoch in range(100):
    net2.train_single_epoch(epoch, train_loader2, criterion2, 0.01, device)
# x axis: epoch index, y axis: mean training loss of that epoch
epoch_axis2 = range(len(net2.list_losses))
plt.plot(epoch_axis2, net2.list_losses)
plt.show()