Tags: python, pytorch, neural-network, non-linear-regression, perceptron

How to fit a small dataset with a perceptron model using PyTorch


I'm trying to fit a small dataset (just 7x1 in size) with a 3-layer perceptron model, but the loss won't converge. I'm new to machine learning; can someone please give me a hint on how to adjust my code?

import torch
import torch.nn as nn
import torch.nn.functional as F

vec_shape = [7, 1]
x_0 = [500, 1000, 2000, 4000, 5000, 8000, 10000]
y_0 = [1.171467, 1.486507, 11.7738, 34.448421, 75.402871, 225.319848, 492.262426]

# x = torch.tensor(x_0).reshape(vec_shape).float()
x = torch.log(torch.tensor(x_0).reshape(vec_shape).float())
y = torch.tensor(y_0).reshape(vec_shape).float()

class Net(nn.Module):
    def __init__(self,n_input,n_hidden,n_output):
        super(Net,self).__init__()
        self.hidden1 = nn.Linear(n_input,n_hidden)
        self.hidden2 = nn.Linear(n_hidden,n_hidden)
        self.predict = nn.Linear(n_hidden,n_output)
    def forward(self,input):
        out = self.hidden1(input)
        out = F.relu(out)
        out = self.hidden2(out)
        out = torch.sigmoid(out)
        out = self.predict(out)
        return out
    def weight_init(self):
        for op in self.modules():
            if isinstance(op, nn.Linear):
                nn.init.normal_(op.weight.data)
                nn.init.normal_(op.bias.data)

net = Net(1,10,1)
net.weight_init()
# print(net)

optimizer = torch.optim.SGD(net.parameters(), lr=0.1)
loss_func = torch.nn.MSELoss()

for t in range(500):
    prediction = net(x)
    loss = loss_func(prediction, y)

    optimizer.zero_grad()
    loss.backward()
    optimizer.step()

    if t % 50 == 0:
        print('Loss = %.4f' % loss.item())

I tried expanding the model and shrinking it, but neither change worked.


Solution

  • Rescaling and normalization are key in machine learning. Your setup is pretty good and you already apply some rescaling, but it simply isn't enough: with the limited number of data points you have, the range of the targets is far too large. So, just as you do with x_0, apply torch.log to y_0. You can always scale the predictions back after training. Below you can find the adapted code; I changed the following:

    • torch.log applied to y_0
    • Learning rate lowered to 0.01
    • Number of iterations increased to 50,000
    • A print statement added to show the rescaling of the predictions
    import torch
    import torch.nn as nn
    import torch.nn.functional as F
    
    vec_shape = [7, 1]
    x_0 = [500, 1000, 2000, 4000, 5000, 8000, 10000]
    y_0 = [1.171467, 1.486507, 11.7738, 34.448421, 75.402871, 225.319848, 492.262426]
    
    # x = torch.tensor(x_0).reshape(vec_shape).float()
    x = torch.log(torch.tensor(x_0).reshape(vec_shape).float())
    y = torch.log(torch.tensor(y_0).reshape(vec_shape).float()) # modified
    
    class Net(nn.Module):
        def __init__(self,n_input,n_hidden,n_output):
            super(Net,self).__init__()
            self.hidden1 = nn.Linear(n_input,n_hidden)
            self.hidden2 = nn.Linear(n_hidden,n_hidden)
            self.predict = nn.Linear(n_hidden,n_output)
        def forward(self,input):
            out = self.hidden1(input)
            out = F.relu(out)
            out = self.hidden2(out)
            out = torch.sigmoid(out)
            out = self.predict(out)
            return out
        def weight_init(self):
            for op in self.modules():
                if isinstance(op, nn.Linear):
                    nn.init.normal_(op.weight.data)
                    nn.init.normal_(op.bias.data)
    
    net = Net(1,10,1)
    net.weight_init()
    # print(net)
    
    optimizer = torch.optim.SGD(net.parameters(), lr=0.01) # modified
    loss_func = torch.nn.MSELoss()
    
    for t in range(50000): # modified
        prediction = net(x)
        loss = loss_func(prediction, y)
    
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
    
        if t % 50 == 0:
            print('Loss = %.4f' % loss.item())
    
    print(torch.exp(net(x))) # added: invert the log to recover predictions on the original scale
    

    I also recommend normalizing your dataset after the logarithmic rescaling, for instance by subtracting the mean and then dividing by the standard deviation.
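
    A minimal sketch of that standardization step (the names y_log, y_mean, y_std, and y_norm are my own, not from the code above):

    import torch

    y_0 = [1.171467, 1.486507, 11.7738, 34.448421, 75.402871, 225.319848, 492.262426]
    y_log = torch.log(torch.tensor(y_0).reshape([7, 1]).float())

    # Standardize after the log transform: subtract the mean, then divide by the std.
    y_mean, y_std = y_log.mean(), y_log.std()
    y_norm = (y_log - y_mean) / y_std

    # Train the network on y_norm instead of y, then invert both transforms
    # to bring the predictions back to the original scale:
    # pred_original = torch.exp(net(x) * y_std + y_mean)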