deep-learning, pytorch, loss-function, mse

MSE predictions not as expected


Here is a regression model where I attempt to predict the y values (outputs) from the x values (inputs). Each class is given a different mean, and the inputs are normalized with L2 normalization:

x_values = sklearn.preprocessing.normalize(x_values, norm="l2")
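
For reference, norm="l2" rescales each row to unit Euclidean length; a minimal check of that behaviour (separate from the model code) looks like:

import numpy as np
import sklearn.preprocessing

row = np.array([[3.0, 4.0]])                             # one two-dimensional sample
print(sklearn.preprocessing.normalize(row, norm="l2"))   # [[0.6 0.8]] -> unit L2 norm
print(row / np.linalg.norm(row))                         # same result computed by hand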

This may look like a classification problem being solved with regression. I'm trying to understand multiclass regression in PyTorch, as the PyTorch doc gives the following example, which suggests multiclass regression is possible:

>>> loss = nn.MSELoss()
>>> input = torch.randn(3, 5, requires_grad=True)
>>> target = torch.randn(3, 5)
>>> output = loss(input, target)
>>> output.backward()

src: https://pytorch.org/docs/master/generated/torch.nn.MSELoss.html

Entire code:

%reset -f

from datetime import datetime
from sklearn import metrics
import torch
import torch.nn as nn
import torchvision
import torchvision.transforms as transforms
import numpy as np
import matplotlib.pyplot as plt
import torch.utils.data as data_utils
import torch.nn as nn
import torch.nn.functional as F
import random
from torch.autograd import Variable
import pandas as pd
import unittest
import time
from collections import Counter
import sklearn

x_values = []
y_values = []
input_size = 17
lr = .1

# Class1
mu, sigma = 0, 0.1  # mean and standard deviation
x_values.append(np.random.normal(mu, sigma, input_size))
x_values.append(np.random.normal(mu, sigma, input_size))
x_values.append(np.random.normal(mu, sigma, input_size))

# Class2
mu, sigma = 5, 0.5  # mean and standard deviation
x_values.append(np.random.normal(mu, sigma, input_size))
x_values.append(np.random.normal(mu, sigma, input_size))
x_values.append(np.random.normal(mu, sigma, input_size))

# Class3
mu, sigma = 10, 1.0  # mean and standard deviation
x_values.append(np.random.normal(mu, sigma, input_size))
x_values.append(np.random.normal(mu, sigma, input_size))
x_values.append(np.random.normal(mu, sigma, input_size))

# Class4
mu, sigma = 15, 1.5  # mean and standard deviation
x_values.append(np.random.normal(mu, sigma, input_size))
x_values.append(np.random.normal(mu, sigma, input_size))
x_values.append(np.random.normal(mu, sigma, input_size))

# Class5
mu, sigma = 20, 2.0  # mean and standard deviation
x_values.append(np.random.normal(mu, sigma, input_size))
x_values.append(np.random.normal(mu, sigma, input_size))
x_values.append(np.random.normal(mu, sigma, input_size))

x_values = sklearn.preprocessing.normalize(x_values, norm="l2")

y_values.append(0)
y_values.append(0)
y_values.append(0)

y_values.append(1)
y_values.append(1)
y_values.append(1)

y_values.append(2)
y_values.append(2)
y_values.append(2)

y_values.append(3)
y_values.append(3)
y_values.append(3)

y_values.append(4)
y_values.append(4)
y_values.append(4)

num_classes = len(y_values)


class NeuralNet(nn.Module):
    def __init__(self):
        super(NeuralNet, self).__init__()
        self.criterion = torch.nn.MSELoss()
        self.model = torch.nn.Sequential(
            torch.nn.Linear(input_size, 100),
            torch.nn.ReLU(),
            torch.nn.Linear(100, 50),
            torch.nn.ReLU(),
            torch.nn.Linear(50, num_classes)
            #                         torch.nn.ReLU()
        )
        self.optimizer = torch.optim.Adam(self.model.parameters(), lr)

    def update(self, state, action):
        y_pred = self.model(torch.Tensor(state))
        loss = self.criterion(y_pred, Variable(torch.Tensor(action)))
        self.optimizer.zero_grad()
        loss.backward()
        self.optimizer.step()
        return loss

    def predict(self, s):
        with torch.no_grad():
            return self.model(torch.Tensor(s))


def weights_init(m):
    if type(m) == nn.Linear:
        m.weight.data.normal_(0.0, 1)


model = NeuralNet()
model.apply(weights_init)
print('len(states)', len(x_values))

i = 0

for s in range(7000):

    if i == 15:
        i = 0
    x = x_values[i]
    loss_value = model.update(x, y_values)

    if s % 1000 == 0:
        print('loss_value', loss_value)

    i = i + 1

Predicting on the x_values:

[torch.argmax(model.predict(s)) for s in x_values]

returns:

[tensor(14),
 tensor(14),
 tensor(14),
 tensor(14),
 tensor(14),
 tensor(14),
 tensor(14),
 tensor(14),
 tensor(14),
 tensor(14),
 tensor(14),
 tensor(14),
 tensor(14),
 tensor(14),
 tensor(14)]

As I have defined classes with different means and the final loss value is low (4.7370e-15), I expect the predicted values to be closer to:

[tensor(0)
 tensor(0),
 tensor(0),
 tensor(1),
 tensor(1),
 tensor(1),
 tensor(2),
 tensor(2),
 tensor(2),
 tensor(3),
 tensor(3),
 tensor(3),
 tensor(4),
 tensor(4),
 tensor(4)]

Why are the predicted outputs not close to my expectation?

Have I set up the model incorrectly?


Solution

  • Are you sure you have a regression problem? When the output is a specific class, the problem is usually treated as classification, regardless of what the inputs look like.

    As for why every prediction is tensor(14): num_classes = len(y_values) is 15 rather than the intended 5, and update() compares each 15-dimensional prediction against the whole y_values list with MSE. The network therefore learns to output the vector [0, 0, 0, 1, 1, 1, 2, 2, 2, 3, 3, 3, 4, 4, 4] for every input, which drives the loss toward zero while the argmax always lands on one of the trailing 4s (index 14 in your run).

    A second way to look at it is that you are trying to model an ordinal categorical variable, i.e. class labels that have a natural order.

    You can pose the problem in two ways:

    1 - Consider that you have a classification problem.

    class NeuralNet(nn.Module):
        class ExpActivation(nn.Module):
            def __init__(self):
                super().__init__()
    
            def forward(self, x):
                return torch.exp(x)
    
        class BoundedPositiveNumber(nn.Module):
            def __init__(self):
                super().__init__()
                self.max_value = 4
    
            def forward(self, x):
                return self.max_value * torch.sigmoid(x)
    
    
        def __init__(self):
            super(NeuralNet, self).__init__()
            self.criterion = torch.nn.CrossEntropyLoss()
            self.model = torch.nn.Sequential(
                torch.nn.Linear(input_size, 100),
                torch.nn.ReLU(),
                torch.nn.Linear(100, 50),
                torch.nn.ReLU(),
                torch.nn.Linear(50, num_classes)
                # no Softmax here: CrossEntropyLoss applies log-softmax internally,
                # so the model should output raw logits
            )
            self.optimizer = torch.optim.Adam(self.model.parameters(), lr)
    
        def update(self, state, action):
            y_pred = self.model(state)
            loss = self.criterion(y_pred, action)
            # print(torch.argmax(y_pred, axis=-1), action, loss)
            self.optimizer.zero_grad()
            loss.backward()
            self.optimizer.step()
            return loss
    
        def predict(self, s):
            with torch.no_grad():
                return self.model(torch.Tensor(s))
    
    
    def weights_init(m):
        if type(m) == nn.Linear:
            m.weight.data.normal_(0.0, 1)
    
    
    model = NeuralNet()
    #model.apply(weights_init)
    print('len(states)', len(x_values))
    
    i = 0
    
    x_values = torch.from_numpy(x_values).float()
    y_values = torch.from_numpy(np.array(y_values)).long()
    
    for s in range(700000):
    
        if i == 15:
            i = 0
        x = x_values[i:i+1]
        y = y_values[i:i+1]
        # full-batch update on every step; the single-sample slices above are only reused in the example below
        loss_value = model.update(x_values, y_values)
    
        if s % 1000 == 0:
            print('loss_value', loss_value)
    
        i = i + 1
    
    # Example
    f = model.model(x)
    proba = torch.softmax(f, dim=-1)  # obtain the probability distribution over classes
    np.argmax(proba.cpu().numpy())  # or np.argmax(f.cpu().numpy()); the two are equivalent here
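
    A quick way to check the trained classifier, mirroring the prediction step from the question (a minimal sketch, assuming the model above has finished training):

    predicted_labels = torch.argmax(model.predict(x_values), dim=-1)
    print(predicted_labels.tolist())  # should now approach [0, 0, 0, 1, 1, 1, 2, 2, 2, 3, 3, 3, 4, 4, 4]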
    

    2 - Consider that you want that "number" as a regression output rather than as a class. In that case you are not looking for a probability distribution but for the value directly. This is less common, but if you only want positive values it can be convenient to use the exponential as the final activation, since it maps (-inf, inf) onto (0, inf).

    class NeuralNet(nn.Module):
        class ExpActivation(nn.Module):
            def __init__(self):
                super().__init__()
    
            def forward(self, x):
                return torch.exp(x)
    
        class AcotatedPositiveNumber(nn.Module):
            def __init__(self):
                super().__init__()
                self.max_value = 4
    
            def forward(self, x):
                return self.max_value * torch.sigmoid(x)
    
    
        def __init__(self):
            super(NeuralNet, self).__init__()
            self.criterion = torch.nn.MSELoss()
            self.model = torch.nn.Sequential(
                torch.nn.Linear(input_size, 100),
                torch.nn.ReLU(),
                torch.nn.Linear(100, 50),
                torch.nn.ReLU(),
                torch.nn.Linear(50, 1),
                NeuralNet.ExpActivation()
            )
            self.optimizer = torch.optim.Adam(self.model.parameters(), lr)
    
        def update(self, state, action):
            y_pred = self.model(state)
            loss = self.criterion(y_pred, action.unsqueeze(-1))
            # print(torch.round(y_pred.squeeze()).long(), action, loss)
            self.optimizer.zero_grad()
            loss.backward()
            self.optimizer.step()
            return loss
    
        def predict(self, s):
            with torch.no_grad():
                return self.model(torch.Tensor(s))
    
    
    def weights_init(m):
        if type(m) == nn.Linear:
            m.weight.data.normal_(0.0, 1)
    
    
    model = NeuralNet()
    #model.apply(weights_init)
    print('len(states)', len(x_values))
    
    i = 0
    
    x_values = torch.from_numpy(x_values).float()
    y_values = torch.from_numpy(np.array(y_values)).float()
    
    for s in range(700000):
    
        if i == 15:
            i = 0
        x = x_values[i:i+1]
        y = y_values[i:i+1]
        # full-batch update on every step; the single-sample slice x is only reused in the example below
        loss_value = model.update(x_values, y_values)
    
        if s % 1000 == 0:
            print('loss_value', loss_value)
    
        i = i + 1
    
    # Example
    regression_value = model.model(x)
    regression_value.cpu().numpy()
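
    To map the regression output back to a class label, one option is to round the prediction to the nearest integer and clamp it to the valid range 0-4 (a minimal sketch along the lines of the commented torch.round in update above):

    pred = model.predict(x_values)                            # shape (15, 1), positive values
    labels = torch.clamp(torch.round(pred.squeeze(-1)), 0, 4).long()
    print(labels.tolist())                                    # ideally close to [0, 0, 0, 1, 1, 1, ..., 4, 4, 4]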