I implemented a simple gradient descent algorithm for linear regression, using mxnet.autograd.
Everything works, but the performance is awful. I'm using vanilla (batch) gradient descent rather than SGD, but I doubt that's the issue: if I simply use the analytic expression for the gradient, the whole run of 1000 epochs takes about 2 s, whereas with autograd it goes up to 147 s.
Here is the implementation:
from mxnet import nd, autograd, gluon
import pandas as pd


def main():
    # learning algorithm parameters
    nr_epochs = 1000
    alpha = 0.01

    # read data
    data = pd.read_csv("dataset.txt", header=0, index_col=None, sep="\s+")

    # ---------------------------------
    # --   using gradient descent   ---
    # ---------------------------------
    data.insert(0, "x_0", 1, True)            # insert column of "1"s as x_0
    m = data.shape[0]                         # number of samples
    n = data.shape[1] - 1                     # number of features
    X = nd.array(data.iloc[:, 0:n].values)    # array with x values
    Y = nd.array(data.iloc[:, -1].values)     # array with y values

    theta = nd.zeros(n)                       # initial parameters array
    theta.attach_grad()                       # declare gradient with respect to theta is needed
    # ----------------------------------------------------------
    theta, Loss = GradientDescent(X, Y, theta, alpha, nr_epochs)
    # ----------------------------------------------------------

    print("Theta by gradient descent:")
    print(theta)

#--------------#
#   END MAIN   #
#--------------#

#-------------------#
#   loss function   #
#-------------------#
def LossFunction(X, Y, theta):
    m = X.shape[0]                            # number of training samples
    loss = 0
    for i in range(X.shape[0]):
        loss = loss + (1 / (2 * m)) * (H(X[i, :], theta) - Y[i]) ** 2
    return loss

#----------------#
#   hypothesis   #
#----------------#
def H(x, theta):
    return nd.dot(x, theta)

#----------------------#
#   gradient descent   #
#----------------------#
def GradientDescent(X, Y, theta, alpha, nr_epochs):
    Loss = nd.zeros(nr_epochs)                # values of the loss function over iterations
    for epoch in range(nr_epochs):
        with autograd.record():
            loss = LossFunction(X, Y, theta)
        loss.backward()
        Loss[epoch] = loss
        for j in range(len(theta)):
            theta[j] = theta[j] - alpha * theta.grad[j]
    return theta, Loss

if __name__ == "__main__":
    main()
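For reference, the 2 s analytic-gradient version is essentially the same loop with the gradient written in closed form. A rough sketch (not my exact code) of that baseline, using the same variable names as above:

# closed-form baseline: gradient of the squared loss computed by hand, no autograd
for epoch in range(nr_epochs):
    error = nd.dot(X, theta) - Y         # residuals, shape (m,)
    grad = nd.dot(X.T, error) / m        # analytic gradient: (1/m) * X^T (X*theta - Y)
    theta = theta - alpha * grad         # vanilla gradient descent step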
The bottleneck is the call to
theta, Loss = GradientDescent(X, Y, theta, alpha, nr_epochs)
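For what it's worth, I time that call roughly like this (a sketch; nd.waitall() is there because MXNet executes NDArray operations asynchronously):

import time
start = time.time()
theta, Loss = GradientDescent(X, Y, theta, alpha, nr_epochs)
nd.waitall()                             # block until MXNet's async engine has finished all work
print("elapsed: %.1f s" % (time.time() - start))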
Am I doing something wrong? I've seen other examples that run much faster than mine; is there anything I could change to reduce the running time? Thanks!
The issue is that you are looping over your arrays one element at a time: the per-sample loop in LossFunction records thousands of tiny scalar operations in the autograd graph, and the per-coordinate loop in the parameter update adds even more. You should use vectorized NDArray operations instead, so that each epoch is a handful of large array operations.
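Concretely, those two loops collapse into single NDArray expressions (a sketch, reusing the names from your code):

# loss over all samples in one matrix-vector product
loss = (1 / (2 * m)) * ((nd.dot(X, theta) - Y) ** 2).sum()

# parameter update in one array operation
theta = theta - alpha * theta.grad

Here is a complete, self-contained example (with random data in place of dataset.txt):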
from mxnet import nd, autograd, gluon
import pandas as pd


def main():
    # learning algorithm parameters
    nr_epochs = 1000
    alpha = 0.01

    m = 10000
    n = 50
    X = nd.random.uniform(shape=(m, n))
    Y = nd.random.uniform(shape=(m, 1))       # array with y values

    X.attach_grad()
    Y.attach_grad()
    theta = nd.zeros((n, 1))                  # initial parameters array
    theta.attach_grad()                       # declare gradient with respect to theta is needed
    # ----------------------------------------------------------
    theta, Loss = GradientDescent(X, Y, theta, alpha, nr_epochs)
    # ----------------------------------------------------------

    print("Theta by gradient descent:")
    print(theta)

#--------------#
#   END MAIN   #
#--------------#

#-------------------#
#   loss function   #
#-------------------#
def LossFunction(X, Y, theta):
    m = X.shape[0]                            # number of training samples
    loss = (1 / (2 * m)) * ((nd.dot(X, theta) - Y) ** 2).sum()
    return loss

#----------------------#
#   gradient descent   #
#----------------------#
def GradientDescent(X, Y, theta, alpha, nr_epochs):
    Loss = nd.zeros(nr_epochs)                # values of the loss function over iterations
    for epoch in range(nr_epochs):
        theta.attach_grad()
        with autograd.record():
            loss = LossFunction(X, Y, theta)
        loss.backward()
        Loss[epoch] = loss.asnumpy()
        theta = theta - alpha * theta.grad
    return theta, Loss

if __name__ == '__main__':
    main()
This example runs in about 1 second with 10,000 rows and 50 dimensions.
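One small design note: because theta = theta - alpha * theta.grad creates a brand-new array, the loop above has to call theta.attach_grad() again on every epoch (and attaching gradients to X and Y is not actually needed, since only theta is optimized). A common alternative, sketched below, is to update theta in place outside the recording scope, so attach_grad is called only once:

theta = nd.zeros((n, 1))
theta.attach_grad()                           # attach once, before the training loop
for epoch in range(nr_epochs):
    with autograd.record():
        loss = LossFunction(X, Y, theta)
    loss.backward()
    theta[:] = theta - alpha * theta.grad     # in-place update keeps the attached grad buffer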