One hidden layer MLP not training in pytorch

I tried to implement a simple gradient descent without using an optimizer but the following MLP doesn't train as the loss is always around 2.30. I can't figure out what's wrong with my code . Any advice on what I am doing wrong will be appreciated.(Sorry if the structure of the code is weird, it was implemented in a notebook)

import torch
import numpy as np
import torchvision.datasets as datasets
from torchvision.transforms import ToTensor

# download train and test set
train = datasets.MNIST(root='.data', train=True, download=True, transform=ToTensor())
test = datasets.MNIST(root='.data', train=False, download=True, transform=ToTensor())

# calculate mean and std
mean = train.data.double().mean()
std = train.data.double().std()
print(f"Mean: {mean}, std: {std}")

# standardize data
train.data = (train.data - mean) / std
test.data = (test.data - mean) / std

from torch.utils.data import DataLoader

train_loader1 = DataLoader(train, shuffle=True, batch_size=1)
train_loader2 = DataLoader(train, shuffle=True, batch_size=256)

device = "cuda" if torch.cuda.is_available() else "cpu"
print(f"Device: {device}")

import torch.nn as nn

class SimpleNN(nn.Module):
    def __init__(self):
        super().__init__()
        self.flatten = nn.Flatten() # flatten the 28x28 images to 784 features vectors
        self.in_to_hidden = nn.Linear(784, 784) # input to 784 neurons hidden layer
        self.hidden_to_out = nn.Linear(784, 10) # 784 to 10 neuron output layer
        self.softmax = nn.Softmax(dim=1) # make the output into a probability distribution
        
    def forward(self, x):
        activation_fn = nn.Tanh()
        x = self.flatten(x)
        x = activation_fn(self.in_to_hidden(x))
        x = self.softmax(self.hidden_to_out(x))
        return x
    
    def train_single_epoch(self, data_loader, loss_fn, lr, device):
        for x, y in data_loader:
            x, y = x.to(device), y.to(device)

            # calculate loss
            pred = self(x)
            loss = loss_fn(pred, y)

            # backpropagate error and update weights
            loss.backward()
            with torch.no_grad(): # the gradient mustn't be calculated for the weight updates
                for weights in self.parameters():
                    weights -= lr * weights.grad
                    weights.grad.zero_()

        print(f"loss: {loss.item()}")

net = SimpleNN().to(device)
for i in range(2):
    net.train_single_epoch(train_loader2, nn.CrossEntropyLoss(), 0.001, device)

Solution

My first observation is that you're printing the wrong loss value. You are printing the last batch loss at the end of each epoch, which doesn't represent the model's performance for the given epoch.

The right way would be to accumulate batches' losses and then compute the average accros all batches at the end of the epoch.

I modified your train_single_epoch method to print the right epoch loss. I also added an attribute self.list_losses to record the losses and plot them later :

class SimpleNN(nn.Module):
    def __init__(self):
        super().__init__()
        self.flatten = nn.Flatten() # flatten the 28x28 images to 784 features vectors
        self.in_to_hidden = nn.Linear(784, 784) # input to 784 neurons hidden layer
        self.hidden_to_out = nn.Linear(784, 10) # 784 to 10 neuron output layer
        self.softmax = nn.Softmax(dim=1) # make the output into a probability distribution
        self.list_losses = []
        
    def forward(self, x):
        activation_fn = nn.Tanh()
        x = self.flatten(x)
        x = activation_fn(self.in_to_hidden(x))
        x = self.softmax(self.hidden_to_out(x))
        return x
    
    def train_single_epoch(self, epoch, data_loader, loss_fn, lr, device):
        epoch_losses = []
        sample_counts = 0
        for x, y in data_loader:
            x, y = x.to(device), y.to(device)

            # calculate loss
            pred = self(x)
            loss = loss_fn(pred, y)
            epoch_losses.append(loss.item()*y.shape[0])
            sample_counts += y.shape[0]

            # backpropagate error and update weights
            loss.backward()
            with torch.no_grad(): # the gradient mustn't be calculated for the weight updates
                for weights in self.parameters():
                    weights -= lr * weights.grad
                    weights.grad.zero_()
        
        epoch_loss = sum(epoch_losses)/sample_counts
        self.list_losses.append(epoch_loss)
        print(f"[Epoch]: {epoch+1} \t----\t [loss]: {epoch_loss}")

And then I train for 100 epochs and and plot the losses progress across epochs :

net = SimpleNN().to(device)
for epoch in range(100):
    net.train_single_epoch(epoch, train_loader2, nn.CrossEntropyLoss(), 0.001, device)


import matplotlib.pyplot as plt
plt.plot(range(len(net.list_losses)), net.list_losses)
plt.show()

As you can see from the above plot the loss is decreasing, even though quite slowly, but the model is learning.

The model you used here is a toy one (I guess for learning purpose), and that could explain the slow learning we are observing.

There are lots of things you can do to improve learning performance of a neural network (increase model size in terms of depth and layer width, change activation function from tanh to relu, use convolutional network, change learning rate, ...), but that is out of scope of a stackoverflow post.

As an example of improvement I tried to change the learning rate from 0.001 to 0.01 and you can see the model is learning faster now :

net2 = SimpleNN().to(device)
for epoch in range(100):
    net2.train_single_epoch(epoch, train_loader2, nn.CrossEntropyLoss(), 0.01, device)

plt.plot(range(len(net2.list_losses)), net2.list_losses)
plt.show()