Tags: python, numpy, neural-network, backpropagation, loss-function

Can somebody help me to correctly derive the loss function?


I'm trying to adapt the example from http://cs231n.github.io/neural-networks-case-study/#together to build a neural network for a numeric target variable, i.e. a neural network for regression. I must be doing something wrong in the derivation part, because my loss function grows insanely. Here is the code:

import numpy as np

# X (training inputs), y (targets) and the hidden-layer size `neurons` are defined earlier
h = neurons # size of hidden layer
D = X[0].size
K = 1
W = 0.01 * np.random.randn(D,h)
b = np.zeros((1,h))
W2 = 0.01 * np.random.randn(h,K)
b2 = np.zeros((1,K))

# some hyperparameters
step_size = 1 #learning rate
reg = 0.001 # regularization strength

loss_vec = []
# gradient descent loop
num_examples = X.shape[0]
for i in range(1000):

  # evaluate scores, [N x K]
  hidden_layer = np.maximum(0, np.dot(X, W) + b) # note, ReLU activation
  scores = np.dot(hidden_layer, W2) + b2

  loss = np.power(y - scores,2)
  #if i % 50 == 0:
  loss_vec.append(np.mean(np.abs(loss)))
  print "iteration %d: loss %f" % (i, np.mean(np.abs(loss)))

  # compute the gradient on scores
  dscores = 2*(y-scores) # here I am not sure if this is correct
    
  # backpropagate the gradient to the parameters
  # first backprop into parameters W2 and b2
  dW2 = np.dot(hidden_layer.T, dscores)
  db2 = np.sum(dscores, axis=0, keepdims=True)
  # next backprop into hidden layer
  dhidden = np.dot(dscores, W2.T)
  # backprop the ReLU non-linearity
  dhidden[hidden_layer <= 0] = 0
  # finally into W,b
  dW = np.dot(X.T, dhidden)
  db = np.sum(dhidden, axis=0, keepdims=True)

  # add regularization gradient contribution
  dW2 += reg * W2
  dW += reg * W

  # perform a parameter update
  W += -step_size * dW
  b += -step_size * db
  W2 += -step_size * dW2
  b2 += -step_size * db2

Code output:

iteration 0: loss 5786.021888
iteration 1: loss 24248543152533318464172949461134213120.000000
iteration 2: loss 388137710832824223006297769344993376570435619092


Solution

  • I've noticed several important mistakes:

    • the learning rate is too big; there is no chance to learn anything with it. I used 0.0005, but the right value depends on the data, the size of the hidden layer, etc.
    • the loss derivative dscores has the wrong sign: since the per-example loss is (y - scores)^2, its derivative with respect to scores is 2*(scores - y), so dscores should be scores - y (the constant factor can be absorbed into the learning rate); see the quick check after this list
    • the loss also ignores regularization (probably dropped for debugging purposes)
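
    A quick numerical check of that sign (a standalone sketch with arbitrary toy values, not part of the original code):

    import numpy as np

    y = np.array([[3.0]])        # target
    scores = np.array([[2.5]])   # prediction
    eps = 1e-6

    analytic = 2 * (scores - y)  # derivative of (y - scores)**2 w.r.t. scores
    numeric = (np.power(y - (scores + eps), 2) -
               np.power(y - (scores - eps), 2)) / (2 * eps)
    print(analytic, numeric)     # both are approximately -1.0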

    Complete code below:

    import numpy as np
    
    # Generate data: learn the sum x[0] + x[1]
    np.random.seed(0)
    N = 100
    D = 2
    X_train = np.zeros([N, D])
    y = np.zeros([N, 1])
    for i in range(N):
      X_train[i, :] = np.random.randint(0, 5, size=2)  # integers in [0, 4]
      y[i] = X_train[i, 0] + X_train[i, 1]
    
    # Network params
    H = 10
    W = 0.01 * np.random.randn(D, H)
    b = np.zeros([1, H])
    W2 = 0.01 * np.random.randn(H, 1)
    b2 = np.zeros([1, 1])
    
    # Hyper params
    step_size = 0.0005
    reg = 0.001
    
    for i in range(100):
      hidden_layer = np.maximum(0, np.dot(X_train, W) + b)
      scores = np.dot(hidden_layer, W2) + b2
    
      reg_loss = 0.5 * reg * np.sum(W * W) + 0.5 * reg * np.sum(W2 * W2)
      loss = np.mean(np.power(y - scores, 2)) + reg_loss
    
      print "iteration %d: loss %f" % (i, loss)
    
      dscores = (scores - y)  # gradient of the squared error; the 2/N factor is folded into the learning rate
    
      dW2 = np.dot(hidden_layer.T, dscores)
      db2 = np.sum(dscores, axis=0, keepdims=True)
    
      dhidden = np.dot(dscores, W2.T)
      dhidden[hidden_layer <= 0] = 0
    
      dW = np.dot(X_train.T, dhidden)
      db = np.sum(dhidden, axis=0, keepdims=True)
    
      dW2 += reg * W2
      dW += reg * W
    
      W += -step_size * dW
      b += -step_size * db
      W2 += -step_size * dW2
      b2 += -step_size * db2
    
    # Test
    X_test = np.array([[1, 0], [0, 1], [2, 3], [2, 2]]).reshape([-1, 2])
    y_test = np.array([1, 1, 5, 4]).reshape([-1, 1])
    hidden_layer = np.maximum(0, np.dot(X_test, W) + b)
    scores = np.dot(hidden_layer, W2) + b2
    print('Mean absolute test error = %f' % np.mean(np.abs(scores - y_test)))
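
    To eyeball the fit, it can also help to print each test prediction next to its target (an illustrative follow-up, not part of the original answer):

    for x_row, target, pred in zip(X_test, y_test, scores):
      print(x_row, target[0], pred[0])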