Strange output on using torch.nn.functional.linear

I am trying to train a neural network with a custom loss function, a sigmoid activation, and binary labels. When I inspect the predictions, for some reason all the outputs are ALWAYS negative and very close to a single fixed value.

def train_net(X, y, c_reg, batch_size=16, lr=1e-2, p=80, d=20, test_frac=0.2, T=50000, rng_seed=42): 
.
.
.

 W = torch.as_tensor(rng.normal(0.0, 1.0, (p, d)) / np.sqrt(p), device = 'cuda')

  W.requires_grad_(True)
  # weights of the first layer a p x dim matrix whose each entry ~ 1/sqrt{p}*N(0,1)

  a = torch.as_tensor(rng.normal(0.0, 1.0, (1, p)) / np.sqrt(p), device = 'cuda')
  # weights of the second layer ~ (1/sqrt{m})*N(0,1)

  a = torch.div(a, torch.linalg.vector_norm(a))
  a.requires_grad_(False)
  # Normalize a s.t. ||a|| = 1

  X_test, X_train, y_test, y_train = train_test_split(X, y, test_size=test_frac, random_state=42) 

  dataloader = torch.utils.data.DataLoader(
    torch.cat((X_train, y_train), dim=1),      # p x (d+1) tensor 
    batch_size=batch_size,
    shuffle=True
  )

  for i in range(T):
    W0 = W.data
    for idx, batch in enumerate(dataloader):
      # batch_x is B x d
      batch_x = batch[:, :-1]
      # batch_y is B x 1
      batch_y = batch[:, -1].reshape(-1, 1)
      ipdb.set_trace()

      train_output = torch.sigmoid(F.linear(input=batch_x, weight=W)).mm(a.t())
      loss = torch.mean(torch.log(1+torch.exp(torch.mul(train_output.mul(batch_y), -1.0)))) + (c_reg / 2) * torch.linalg.matrix_norm(W) ** 2
      ipdb.set_trace()

      # running gradient descent 
      W.data = W.data - stepsize * grad(loss, W)[0]

    with torch.no_grad():
      epoch_train_pred = torch.sigmoid(F.linear(input=X_train, weight=W)).matmul(a.t())
      epoch_test_pred = torch.sigmoid(F.linear(input=X_test, weight=W)).matmul(a.t())
      val = epoch_train_pred.mul(y_train)
      ipdb.set_trace() 
      epoch_train_loss = torch.mean(torch.log(1+torch.exp(torch.mul(epoch_train_pred.mul(y_train), -1.0)))) + (c_reg / 2) * (torch.linalg.matrix_norm(W) ** 2).float().item()
      gated_epoch_test_pred = torch.sign(epoch_test_pred)
      epoch_acc = 1 - torch.mean(torch.isclose(gated_epoch_test_pred, y_test).long())
      ipdb.set_trace()

.
.
.

Here x is randomly sampled and rescaled to unit norm, y is randomly sampled from {+1, -1}, and a is drawn from a normal distribution and rescaled to have norm 1.

tensor([[-0.5644],
        [-0.5627],
        [-0.5651],
        [-0.5663],
        [-0.5649],
        [-0.5650],
        [-0.5638],
        [-0.5640],
        [-0.5650],
        [-0.5644],
        [-0.5649],
        [-0.5651],
        [-0.5647],
        [-0.5642],
        [-0.5648],
        [-0.5646],
        [-0.5647],
        [-0.5647],
        [-0.5647],
        [-0.5654],
        [-0.5648],
        [-0.5648],
        [-0.5638],
        [-0.5641],
        [-0.5650],
        [-0.5649],
        [-0.5644],
        [-0.5662],
        [-0.5652],
        [-0.5638],
        [-0.5649],
        [-0.5640]], device='cuda:0', grad_fn=<MmBackward0>)

This turns out to be the output for the test prediction values.

I've tried debugging but have failed. I noticed that train_output always seems to be near -0.56xx, everywhere I check.

Sample Code

import numpy as np
import pandas as pd
import torch
import torch.nn as nn
import torch.nn.functional as F

from torch.autograd import grad

import matplotlib.pyplot as plt
import seaborn as sb
import math

from sklearn.model_selection import train_test_split

torch.set_default_dtype(torch.float64)

def unit_norm_data(n, d, device='cuda'):
    '''
    Generate a random binary-classification data set.

    n      : number of samples (rows).
    d      : number of features per sample (columns).
    device : torch device the returned tensors are placed on
             (defaults to 'cuda').

    Returns (x, y) where x is an (n, d) tensor whose rows each have unit
    Euclidean norm and y is an (n, 1) tensor of labels drawn from {-1, +1}.
    The NumPy generator is seeded with 16, so repeated calls return the
    same data.
    '''
    rng = np.random.default_rng(seed=16)

    # Entries uniform on [-1, 1).
    x_arr = 2 * rng.random(size=(n, d)) - 1
    x = torch.as_tensor(x_arr, device=device)

    # Normalise every sample separately.  Dividing by the norm of the whole
    # matrix (no ``dim``) would leave each row with norm of roughly
    # 1/sqrt(n) instead of 1.
    x = x / torch.linalg.vector_norm(x, dim=1, keepdim=True)
    x.requires_grad_(False)

    y_arr = rng.choice([-1, 1], size=n)
    y = torch.as_tensor(y_arr.reshape(-1, 1), device=device)
    y.requires_grad_(False)

    return x, y


def _network_output(inputs, W, a_var):
    '''Return sigmoid(inputs @ W.T) @ a_var.T, i.e. one score per input row.'''
    return torch.sigmoid(F.linear(input=inputs, weight=W)).mm(a_var.t())


def _logistic_loss(scores, labels):
    '''Return mean(log(1 + exp(-labels * scores))).'''
    # softplus(z) = log(1 + exp(z)); the library routine avoids overflow in exp.
    return F.softplus(-(scores * labels)).mean()


def train_net(X, y, c_reg, batch_size=16, lr=1e-2, p=80, d=20, test_frac=0.2, T=50000, rng_seed=42):
    '''
    Train a two-layer network f(x) = a . sigmoid(W x) with mini-batch
    gradient descent on the L2-regularised logistic loss.

    Only the first-layer weights W (p x d) are trained; the second-layer
    vector a (1 x p) is drawn once, scaled to unit norm and kept fixed.

    X         : 2-D tensor of inputs, one sample per row with d columns.
    y         : 2-D tensor with one label column; labels are compared with
                sign(f(x)), so they are expected to be -1 or +1.
    c_reg     : weight of the (c_reg / 2) * ||W||_F^2 penalty.
    batch_size: mini-batch size.
    lr        : gradient-descent step size.
    p         : number of hidden units.
    d         : input dimension.
    test_frac : fraction of the samples held out for testing.
    T         : number of epochs (full passes over the training split).
    rng_seed  : seed for the weight initialisation.

    Returns four lists with one float per epoch:
    (regularised train loss, unregularised train loss,
     unregularised test loss, test accuracy).
    '''
    train_loss_reg = []
    train_loss_unreg = []
    test_loss_unreg = []
    test_acc = []

    rng = np.random.default_rng(seed=rng_seed)
    stepsize = lr
    # Create the weights on the same device as the data instead of assuming CUDA.
    device = X.device

    # First layer: p x d matrix with entries ~ N(0, 1) / sqrt(p).
    W = torch.as_tensor(rng.normal(0.0, 1.0, (p, d)) / np.sqrt(p), device=device)
    W.requires_grad_(True)

    # Second layer: entries ~ Uniform(-1, 1), then scaled so that ||a|| = 1.
    a_var = torch.as_tensor(rng.uniform(-1.0, 1.0, (1, p)), device=device)
    a_var = a_var / torch.linalg.vector_norm(a_var)
    a_var.requires_grad_(False)

    # train_test_split returns (train, test) pairs in this order; unpacking
    # them the other way round silently swaps the two splits.
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=test_frac, random_state=42
    )

    dataloader = torch.utils.data.DataLoader(
        torch.cat((X_train, y_train), dim=1),  # n_train x (d+1): features, then label
        batch_size=batch_size,
        shuffle=True,
    )

    for i in range(T):
        for batch in dataloader:
            batch_x = batch[:, :-1]                  # B x d
            batch_y = batch[:, -1].reshape(-1, 1)    # B x 1

            train_output = _network_output(batch_x, W, a_var)
            loss = (
                _logistic_loss(train_output, batch_y)
                + (c_reg / 2) * torch.linalg.matrix_norm(W) ** 2
            )

            # Plain gradient-descent step; done under no_grad so the update
            # itself is not recorded by autograd.
            (w_grad,) = grad(loss, W)
            with torch.no_grad():
                W -= stepsize * w_grad

        with torch.no_grad():
            epoch_train_pred = _network_output(X_train, W, a_var)
            epoch_test_pred = _network_output(X_test, W, a_var)

            epoch_train_unreg = _logistic_loss(epoch_train_pred, y_train).item()
            epoch_test_unreg = _logistic_loss(epoch_test_pred, y_test).item()
            epoch_train_loss = (
                epoch_train_unreg
                + (c_reg / 2) * (torch.linalg.matrix_norm(W) ** 2).item()
            )

            # Accuracy = fraction of test samples whose predicted sign
            # matches the label.
            hits = torch.sign(epoch_test_pred) == y_test
            epoch_acc = hits.to(epoch_test_pred.dtype).mean().item()

        if i % 1000 == 0 or i == T - 1:
            print(f"Epoch {i} -------- \nReg. train loss: {epoch_train_loss:.6f}")
            print(f"Unreg. train loss: {epoch_train_unreg:.6f}")
            print(f"Test loss: {epoch_test_unreg:.6f}")
            print(f"Test Accuracy: {epoch_acc:.6f}\n")

        train_loss_reg.append(epoch_train_loss)
        test_acc.append(epoch_acc)
        train_loss_unreg.append(epoch_train_unreg)
        test_loss_unreg.append(epoch_test_unreg)

    return train_loss_reg, train_loss_unreg, test_loss_unreg, test_acc



def run_experiment(X, Y, n, m, T, lr, c_r, d, test_frac, batch_size=16):
    '''
    Sweep over regularisation strengths and report seed-averaged test metrics.

    For every index k, train_net is run with c_reg = c_r[k] and step size
    lr[k], once for each seed in (42, 55, 23).  From each run the smallest
    test loss and the largest test accuracy over the epochs are kept, and
    these are averaged over the seeds.

    X, Y      : data set passed straight to train_net.
    n         : number of samples; not used here, kept so existing callers
                do not break.
    m         : number of hidden units.
    T         : number of epochs per run.
    lr        : sequence of step sizes, one per entry of c_r.
    c_r       : sequence of regularisation strengths.
    d         : input dimension.
    test_frac : fraction forwarded to train_net for the train/test split.
    batch_size: mini-batch size.

    Returns (test_losses, test_acc_arr): two lists with one averaged value
    per entry of c_r.

    Raises ValueError if lr has fewer entries than c_r.
    '''
    if len(lr) < len(c_r):
        # Fail before any training starts rather than with an IndexError
        # after the first few settings have already been trained.
        raise ValueError(
            f"lr has {len(lr)} entries but c_r has {len(c_r)}; "
            "one learning rate is needed per regularisation strength"
        )

    seeds = (42, 55, 23)

    print(f"Using m={m}\n")

    test_losses = []
    test_acc_arr = []

    for idx, lam in enumerate(c_r):
        print(f"Using c_reg = {lam}")

        # Collected across ALL seeds for this setting, so they must be
        # created outside the seed loop.
        test_losses_seed = []
        test_acc_seed = []

        for rng_seed in seeds:
            print(f"Using seed: {rng_seed}")
            _, _, test_loss_unreg, test_acc = train_net(
                X, Y, lam, batch_size, lr[idx], m, d, test_frac, T, rng_seed
            )
            test_losses_seed.append(min(test_loss_unreg))
            test_acc_seed.append(max(test_acc))

        test_losses.append(sum(test_losses_seed) / len(test_losses_seed))
        test_acc_arr.append(sum(test_acc_seed) / len(test_acc_seed))

    # Return the per-setting averages, not the per-epoch list of the last run.
    return test_losses, test_acc_arr

# Data set: 1024 samples with 20 features each.
n_samples, input_d = 1024, 20
x, y = unit_norm_data(n_samples, input_d)

# Regularisation strengths to sweep; every one uses the same step size.
c_r_list = [1e-5, 1e-3, 1e-2, 0.03, 0.1]
lr_list = [0.1 for _ in c_r_list]

# One entry per hidden-layer width, filled in by the loop below.
test_losses_width, test_acc_width = [], []

# Notes from earlier sweeps:
#   800, 1000, 2000, lr=0.8
#   [5, 10, 100, 500, 1000], lr=0.01
separator = '-' * 75
for width in (100, 500, 1000):
    width_losses, width_accs = run_experiment(
        X=x,
        Y=y,
        n=n_samples,
        m=width,
        T=5000,
        lr=lr_list,
        c_r=c_r_list,
        d=input_d,
        test_frac=0.6,
        batch_size=32,
    )
    test_losses_width.append(width_losses)
    test_acc_width.append(width_accs)

    print(separator + '\n' + separator)

Solution

When the network starts training, you will typically see it trying to answer everything with the same value. This is because a random sigmoid matrix is multiplied by a random a_var matrix. Keep track of this over epochs and you will see that your train and test outputs aren't really changing much.

The main reason for this is that your custom loss function (probably low powered, didn't check) along with the noisy data (random input gives random output) makes the gradients very weak.

Another obstacle the model needs to combat is the randomised a_var it is given as a non-trainable parameter. I understand you are using it to map the sigmoid outputs, which lie between 0 and 1, onto the range -1 to 1, but since this part of the model doesn't learn, W has to account for it as well.

You can combat these problems by either increasing your lr to high numbers to overfit on training set

# One (large) learning rate per regularisation strength.
# NOTE(review): run_experiment reads lr[idx] for every entry of c_r, and the
# question's c_r_list has five entries, so this list needs a fifth value.
lr_list = [29, 17, 5, 2]

Or give the model a feasible learning task, where the weights would start moving towards a minimum. For example, a detector that tells positive inputs from negative ones.

# Two groups of samples drawn from different ranges: the first half is
# labelled +1, the remainder -1.
n_pos = n // 2
n_neg = n - n_pos
positives = rng.uniform(1, 4, size=(n_pos, d))
negatives = rng.uniform(-3, 1.5, size=(n_neg, d))

# Stack the groups, apply the same 2*x - 1 rescaling, then divide by the
# norm of the whole matrix.
features = 2 * np.concatenate([positives, negatives]) - 1
x = torch.as_tensor(features.reshape(len(features), -1))
x = x / torch.linalg.norm(x)
x.requires_grad_(False)

# Labels as an (n, 1) column, in the same order as the rows of x.
labels = np.array([1] * n_pos + [-1] * n_neg)
y = torch.as_tensor(labels.reshape(len(labels), -1))
y.requires_grad_(False)

I found both cases to start creating variety in outputs.

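P.S. train_test_split returns its results in the order X_train, X_test, y_train, y_test, but your code unpacks them as X_test, X_train, y_test, y_train. You are therefore using it opposite to its intended usage, and hence need to give a high fraction for test_frac.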