I am trying to train a neural net model on a custom loss function, with a sigmoid activation with binary labels. When I'm trying to figure out the prediction output, for some reason all the outputs turn out to be negative ALWAYS and very close to a fixed value.
def train_net(X, y, c_reg, batch_size=16, lr=1e-2, p=80, d=20, test_frac=0.2, T=50000, rng_seed=42):
.
.
.
W = torch.as_tensor(rng.normal(0.0, 1.0, (p, d)) / np.sqrt(p), device = 'cuda')
W.requires_grad_(True)
# weights of the first layer a p x dim matrix whose each entry ~ 1/sqrt{p}*N(0,1)
a = torch.as_tensor(rng.normal(0.0, 1.0, (1, p)) / np.sqrt(p), device = 'cuda')
# weights of the second layer ~ (1/sqrt{m})*N(0,1)
a = torch.div(a, torch.linalg.vector_norm(a))
a.requires_grad_(False)
# Normalize a s.t. ||a|| = 1
X_test, X_train, y_test, y_train = train_test_split(X, y, test_size=test_frac, random_state=42)
dataloader = torch.utils.data.DataLoader(
torch.cat((X_train, y_train), dim=1), # p x (d+1) tensor
batch_size=batch_size,
shuffle=True
)
for i in range(T):
W0 = W.data
for idx, batch in enumerate(dataloader):
# batch_x is B x d
batch_x = batch[:, :-1]
# batch_y is B x 1
batch_y = batch[:, -1].reshape(-1, 1)
ipdb.set_trace()
train_output = torch.sigmoid(F.linear(input=batch_x, weight=W)).mm(a.t())
loss = torch.mean(torch.log(1+torch.exp(torch.mul(train_output.mul(batch_y), -1.0)))) + (c_reg / 2) * torch.linalg.matrix_norm(W) ** 2
ipdb.set_trace()
# running gradient descent
W.data = W.data - stepsize * grad(loss, W)[0]
with torch.no_grad():
epoch_train_pred = torch.sigmoid(F.linear(input=X_train, weight=W)).matmul(a.t())
epoch_test_pred = torch.sigmoid(F.linear(input=X_test, weight=W)).matmul(a.t())
val = epoch_train_pred.mul(y_train)
ipdb.set_trace()
epoch_train_loss = torch.mean(torch.log(1+torch.exp(torch.mul(epoch_train_pred.mul(y_train), -1.0)))) + (c_reg / 2) * (torch.linalg.matrix_norm(W) ** 2).float().item()
gated_epoch_test_pred = torch.sign(epoch_test_pred)
epoch_acc = 1 - torch.mean(torch.isclose(gated_epoch_test_pred, y_test).long())
ipdb.set_trace()
.
.
.
Here x
is generated from a randomly sampled unit norm distribution of norm 1, y
is randomly sampled from {+1, -1} and a
is drawn from a normal distribution with norm 1
tensor([[-0.5644],
[-0.5627],
[-0.5651],
[-0.5663],
[-0.5649],
[-0.5650],
[-0.5638],
[-0.5640],
[-0.5650],
[-0.5644],
[-0.5649],
[-0.5651],
[-0.5647],
[-0.5642],
[-0.5648],
[-0.5646],
[-0.5647],
[-0.5647],
[-0.5647],
[-0.5654],
[-0.5648],
[-0.5648],
[-0.5638],
[-0.5641],
[-0.5650],
[-0.5649],
[-0.5644],
[-0.5662],
[-0.5652],
[-0.5638],
[-0.5649],
[-0.5640]], device='cuda:0', grad_fn=<MmBackward0>)
This turns out to be the output for test_prediction va
I' ve tried debugging but have failed. I noticed that everywhere train_output always seems to be near -0.56xx
import numpy as np
import pandas as pd
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.autograd import grad
import matplotlib.pyplot as plt
import seaborn as sb
import math
from sklearn.model_selection import train_test_split
torch.set_default_dtype(torch.float64)
ef unit_norm_data(n,d):
rng = np.random.default_rng(seed=16)
x_arr = rng.random(size=(n, d))
x_arr = 2*x_arr - 1
x = torch.as_tensor(x_arr.reshape(len(x_arr),-1), device='cuda')
x = torch.div(x, torch.linalg.norm(x) )
x.requires_grad_(False)
# Normalized Data
y_arr = rng.choice([-1, 1], size=n)
y = torch.as_tensor(y_arr.reshape(len(y_arr),-1), device='cuda')
y.requires_grad_(False)
return x, y
def train_net(X, y, c_reg, batch_size=16, lr=1e-2, p=80, d=20, test_frac=0.2, T=50000, rng_seed=42):
'''
This function runs a simple one layer neural network on the training set (X,Y) where X and Y are vectors of n samples
m is the number of hidden units, T is the number of iterations
returns the predictors for X
'''
train_loss_reg = []
train_loss_unreg = []
test_loss_unreg = []
test_acc = []
rng = np.random.default_rng(seed=rng_seed)
Bx = 1
stepsize = lr
W = torch.as_tensor(rng.normal(0.0, 1.0, (p, d)) / np.sqrt(p), device = 'cuda')
W.requires_grad_(True)
# weights of the first layer a p x dim matrix whose each entry ~ 1/sqrt{p}*N(0,1)
a_var = torch.as_tensor(rng.uniform(-1.0, 1.0, (1, p)), device = 'cuda')
# weights of the second layer ~ (1/sqrt{m})*N(0,1)
a_var = torch.div(a_var, torch.linalg.vector_norm(a_var))
a_var.requires_grad_(False)
# Normalize a s.t. ||a|| = 1
X_test, X_train, y_test, y_train = train_test_split(X, y, test_size=test_frac, random_state=42)
dataloader = torch.utils.data.DataLoader(
torch.cat((X_train, y_train), dim=1), # p x (d+1) tensor
batch_size=batch_size,
shuffle=True
)
for i in range(T):
W0 = W.data
for idx, batch in enumerate(dataloader):
# batch_x is B x d
batch_x = batch[:, :-1]
# batch_y is B x 1
batch_y = batch[:, -1].reshape(-1, 1)
train_output = torch.sigmoid(F.linear(input=batch_x, weight=W)).mm(a_var.t())
loss = torch.mean(torch.log(1+torch.exp(torch.mul(train_output.mul(batch_y), -1.0)))) + (c_reg / 2) * torch.linalg.matrix_norm(W) ** 2
# running gradient descent
W.data = W.data - stepsize * grad(loss, W)[0]
with torch.no_grad():
epoch_train_pred = torch.sigmoid(F.linear(input=X_train, weight=W)).matmul(a_var.t())
epoch_test_pred = torch.sigmoid(F.linear(input=X_test, weight=W)).matmul(a_var.t())
val = epoch_train_pred.mul(y_train)
epoch_train_loss = torch.mean(torch.log(1+torch.exp(torch.mul(epoch_train_pred.mul(y_train), -1.0)))) + (c_reg / 2) * (torch.linalg.matrix_norm(W) ** 2).float().item()
gated_epoch_test_pred = torch.sign(epoch_test_pred)
epoch_acc = 1 - torch.mean(torch.where(gated_epoch_test_pred ==y_test, 1.0, 0))
if i % 1000 == 0 or i == T-1:
print(f"Epoch {i} -------- \nReg. train loss: {epoch_train_loss:.6f}")
print(f"Unreg. train loss: {torch.mean(torch.log(1+torch.exp(-1*(epoch_train_pred.mul(y_train))))).item():.6f}")
print(f"Test loss: {torch.mean(torch.log(1+torch.exp(torch.mul(epoch_test_pred.mul(y_test), -1.0)))).item():.6f}")
print(f"Test Accuracy: {epoch_acc:.6f}\n")
train_loss_reg.append(epoch_train_loss)
test_acc.append(epoch_acc)
train_loss_unreg.append(torch.mean(torch.log(1+torch.exp(torch.mul(epoch_train_pred.mul(y_train), -1.0)))).item())
test_loss_unreg.append(torch.mean(torch.log(1+torch.exp(torch.mul(epoch_test_pred.mul(y_test), -1.0)))).item() )
del W, a
return train_loss_reg, train_loss_unreg, test_loss_unreg, test_acc
def run_experiment(X, Y, n, m, T, lr, c_r, d, test_frac, batch_size=16):
'''
Generates inputs using the input generator and train a neural network to
estimate the input. m is the number of hidden units, T is the number of
iterations, n is the number of samples.
Note: x, y = input_generator(n) where input_generator returns n input
samples x, y where x is an indepedent RV, and y is the dependent one.
'''
rng = np.random.default_rng(seed=42)
By = 0.3
print(f"Using m={m}\n")
test_losses = []
test_acc_arr = []
# noise_frac = 1.0
# noise_mask = torch.from_numpy(rng.choice([0.0, 1.0], size=Y.shape, p=[1-noise_frac, noise_frac])).to('cuda')
# Y_noise = Y + noise_mask * torch.from_numpy(0.05 * rng.standard_cauchy(size=Y.shape)).to('cuda')
# Y_noise = Y + torch.from_numpy(0.1 * rng.standard_normal(size=Y.shape)).to('cuda')
for idx, lam in enumerate(c_r):
print(f"Using c_reg = {lam}")
for rng_seed in [42, 55, 23]:
print(f"Using seed: {rng_seed}")
test_losses_seed = []
test_acc_seed = []
train_loss_reg, train_loss_unreg, test_loss_unreg, test_acc = train_net(X, Y, lam, batch_size, lr[idx], m, d, test_frac, T, rng_seed)
test_losses_seed.append(min(test_loss_unreg))
test_acc_seed.append(max(test_acc))
test_losses.append(sum(test_losses_seed)/len(test_losses_seed))
test_acc_arr.append(sum(test_acc_seed)/len(test_acc_seed))
return test_losses,test_acc
n_samples = 1024
input_d = 20
x, y = unit_norm_data(n_samples, input_d)
c_r_list = [1e-5, 1e-3, 1e-2, 0.03,0.1]
lr_list = [0.1] * len(c_r_list)
test_losses_width = []
test_acc_width = []
# 800, 1000, 2000, lr=0.8
# [5, 10, 100, 500, 1000], lr=0.01
for width in [100, 500, 1000]:
l, m =run_experiment(X=x, Y=y, n=n_samples, m=width, T=5000, lr=lr_list, c_r=c_r_list, d=input_d, test_frac=0.6, batch_size=32)
test_losses_width.append(l)
test_acc_width.append(m)
print('-'*75 + '\n' + '-'*75)
When the network starts training, you will typically see it trying to answer everything with the same value. This is because a random sigmoid matrix is multiplied by a random a_var
matrix. Keep track of this over epochs and you will see that your train
and test
outputs aren't really changing much.
The main reason for this is that your custom loss function (probably low powered, didn't check) along with the noisy data (random input gives random output) makes the gradients very weak.
Another obstacle the model needs to combat is the randomised a_var
it is getting as a non-trainable parameter. I understand you are using it to coalesce the sigmoid outputs which are from 0 to 1, to -1 to 1 but since this part of model doesn't learn, W
has to understand it as well.
You can combat these problems by either increasing your lr
to high numbers to overfit on training set
lr_list = [29, 17, 5, 2]
Or give a feasible learning task where the weights would start moving towards a minima. Let's say positive or negative detector.
plus = rng.uniform(1,4,size=(n//2, d))
minus = rng.uniform(-3,1.5,size=(n-n//2, d))
x_arr = np.concatenate([plus,minus])
x_arr = 2*x_arr - 1
x = torch.as_tensor(x_arr.reshape(len(x_arr),-1))
x = torch.div(x, torch.linalg.norm(x) )
x.requires_grad_(False)
y_arr = [1]*(n//2) + [-1]*(n-n//2)
y_arr = np.array(y_arr)
y = torch.as_tensor(y_arr.reshape(len(y_arr),-1))
y.requires_grad_(False)
I found both cases to start creating variety in outputs.
P.S. you are using train_test_split
opposite to its intended usage, and hence need to give a high fraction for test_frac
.