I am working on to create a machine learning project with PyTorch, and I realize my model does not seem to learn - it always outputs a flat line with little changes, and the loss hardly reduces. In order to spot the issue, I reduced my program to another minimum program (the program in question), yet the problem remains.
In this reduced program, I aim to make a model to approach to the shape of y=x^2. I copied and pasted my model class ANN
from my original bigger project, which has 6 features. For simplicity, I set first 5 features to 0 and the last one to be the actual x
(evenly separated numbers on interval from -2 to 2), and based on x
generate corresponding y
values by my generate_targets()
.
from torch import tensor, float32
from torch import nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
import matplotlib.pyplot as plt
class ANN(nn.Module):
def __init__(self, feature_num: int):
super(ANN,self).__init__()
self.layers = nn.Sequential(
nn.Linear(feature_num, 300),
nn.Tanh(),
nn.Linear(300, 200),
nn.Tanh(),
nn.Linear(200, 150),
nn.Tanh(),
nn.Linear(150, 50),
nn.Tanh(),
nn.Linear(50, 1)
)
def forward(self, x):
predictions = self.layers(x)
return predictions
class TestDataset(Dataset):
def __init__(self, sample_num):
self.sample_num = sample_num
self.func_max = 2
self.func_min = -2
self.unit = (self.func_max - self.func_min) / self.sample_num
self.targets = generate_targets(self.sample_num)
def __getitem__(self, index):
x = self.func_min + index * self.unit
return tensor([0, 0, 0, 0, 0, x], dtype=float32), self.targets[index]
def __len__(self):
return self.sample_num
# Generate the list of y
def generate_targets(count):
func_max = 2
func_min = -2
unit = (func_max - func_min)/count
target_list = []
for i in range(count):
x = func_min + unit*i
y = x ** 2
target_list.append(y)
return target_list
# The main program
def start_train():
sample_num = 500
train_data = TestDataset(sample_num)
train_dataloader = DataLoader(train_data, batch_size=10, shuffle=True)
model = ANN(6)
model.train()
mae_loss = nn.L1Loss()
optimizer = optim.Adam(model.parameters())
loss_list = []
for i in range(sample_num):
train_feature, train_target = next(iter(train_dataloader))
prediction = model(train_feature)
loss = mae_loss(prediction, train_target.float().unsqueeze(1))
loss.backward()
optimizer.step()
optimizer.zero_grad()
loss_list += [loss.item()]
print(f"iteration [{i + 1}/{sample_num}] Loss = {loss.item():.3f}")
plt.plot(range(sample_num), loss_list, marker='o', label='Validation')
plt.xlabel('iterations')
plt.ylabel('MAE loss')
plt.title('Loss vs Iteration')
plt.legend(loc='upper right')
plt.savefig('Debugger_Loss_Iter.png')
plt.close()
comparison(model, sample_num)
# Evaluate the trained model by plugging in each x coord and see the generated comparative graph
def comparison(model: ANN, sample_num: int) -> None:
model.eval()
prediction_list = []
for i in range(sample_num):
train_feature = tensor([0, 0, 0, 0, 0, i / sample_num], dtype=float32)
prediction = model(train_feature)
prediction_list += [prediction.item()]
target_list = generate_targets(sample_num)
x_list = [(i - sample_num / 2) / sample_num for i in list(range(sample_num))]
plt.plot(x_list, target_list, marker='o', label='Target')
plt.plot(x_list, prediction_list, marker='o', label='Prediction')
plt.xlabel('x')
plt.ylabel('y')
plt.title('Prediction-target Comparison')
plt.legend(loc='upper right')
plt.savefig('Debugger_Comparison.png')
plt.close()
if __name__ == '__main__':
start_train()
After executing, the output is something like the attached image. (On the image, the scale of the x label is wrong, but it should not be part of the learning issue)
The correct output that I am expecting is a shape close to the target, similar to the attached image.
A correct output image, from a single-time success for unknown reason which I failed to recreate
I notice that usually tensor mismatch is a common reason for not learning, and I have deliberately checked the shapes and dtypes of the incoming and outputting tensors. Unfortunately, I found those tensors align with my expectations.
train:
train_feature: (10,6) float32
train_target.shape([10,1]) float32
prediction: ([10,1]) float32
comparison:
train_feature:([10,6]) float32
prediction([10,1]) float32
The training is fine, but the comparison code is wrong though: You are sampling x uniformly in range(0,1). Try to correct it and train again.
for i in range(sample_num):
train_feature = tensor([0, 0, 0, 0, 0, i / sample_num], dtype=float32)