I'm trying to implement a multiclass logistic regression classifier that distinguishes between k
different classes.
This is my code.
import numpy as np
from scipy.special import expit
def cost(X,y,theta,regTerm):
(m,n) = X.shape
J = (np.dot(-(y.T),np.log(expit(np.dot(X,theta))))-np.dot((np.ones((m,1))-y).T,np.log(np.ones((m,1)) - (expit(np.dot(X,theta))).reshape((m,1))))) / m + (regTerm / (2 * m)) * np.linalg.norm(theta[1:])
return J
def gradient(X,y,theta,regTerm):
(m,n) = X.shape
grad = np.dot(((expit(np.dot(X,theta))).reshape(m,1) - y).T,X)/m + (np.concatenate(([0],theta[1:].T),axis=0)).reshape(1,n)
return np.asarray(grad)
def train(X,y,regTerm,learnRate,epsilon,k):
(m,n) = X.shape
theta = np.zeros((k,n))
for i in range(0,k):
previousCost = 0;
currentCost = cost(X,y,theta[i,:],regTerm)
while(np.abs(currentCost-previousCost) > epsilon):
print(theta[i,:])
theta[i,:] = theta[i,:] - learnRate*gradient(X,y,theta[i,:],regTerm)
print(theta[i,:])
previousCost = currentCost
currentCost = cost(X,y,theta[i,:],regTerm)
return theta
trX = np.load('trX.npy')
trY = np.load('trY.npy')
theta = train(trX,trY,2,0.1,0.1,4)
I can verify that cost and gradient are returning values that are in the right dimension (cost returns a scalar, and gradient returns a 1 by n row vector), but i get the error
RuntimeWarning: divide by zero encountered in log
J = (np.dot(-(y.T),np.log(expit(np.dot(X,theta))))-np.dot((np.ones((m,1))-y).T,np.log(np.ones((m,1)) - (expit(np.dot(X,theta))).reshape((m,1))))) / m + (regTerm / (2 * m)) * np.linalg.norm(theta[1:])
why is this happening and how can i avoid this?
You can clean up the formula by appropriately using broadcasting, the operator *
for dot products of vectors, and the operator @
for matrix multiplication — and breaking it up as suggested in the comments.
Here is your cost function:
def cost(X, y, theta, regTerm):
m = X.shape[0] # or y.shape, or even p.shape after the next line, number of training set
p = expit(X @ theta)
log_loss = -np.average(y*np.log(p) + (1-y)*np.log(1-p))
J = log_loss + regTerm * np.linalg.norm(theta[1:]) / (2*m)
return J
You can clean up your gradient function along the same lines.
By the way, are you sure you want np.linalg.norm(theta[1:])
. If you're trying to do L2-regularization, the term should be np.linalg.norm(theta[1:]) ** 2
.