I implemented a simple gradient descent algorithm for linear regression, using mxnet.autograd.
Everything works, but the performance is awful. I'm using vanilla (batch) gradient descent rather than SGD, but I doubt that's the issue: if I simply use the analytic expression for the gradient, the whole run of 1000 epochs takes about 2 s, whereas with autograd it goes up to 147 s.
Here is the implementation:
from mxnet import nd, autograd, gluon
import pandas as pd


def main():
    # learning algorithm parameters
    nr_epochs = 1000
    alpha = 0.01

    # read data
    data = pd.read_csv("dataset.txt", header=0, index_col=None, sep="\s+")

    # ---------------------------------
    # --   using gradient descent   ---
    # ---------------------------------
    data.insert(0, "x_0", 1, True)            # insert column of "1"s as x_0
    m = data.shape[0]                         # number of samples
    n = data.shape[1] - 1                     # number of features
    X = nd.array(data.iloc[:, 0:n].values)    # array with x values
    Y = nd.array(data.iloc[:, -1].values)     # array with y values

    theta = nd.zeros(n)                       # initial parameters array
    theta.attach_grad()                       # declare gradient with respect to theta is needed
    # ----------------------------------------------------------
    theta, Loss = GradientDescent(X, Y, theta, alpha, nr_epochs)
    # ----------------------------------------------------------

    print("Theta by gradient descent:")
    print(theta)

#--------------#
#   END MAIN   #
#--------------#

#-------------------#
#   loss function   #
#-------------------#
def LossFunction(X, Y, theta):
    m = X.shape[0]                            # number of training samples
    loss = 0
    for i in range(X.shape[0]):
        loss = loss + (1 / (2 * m)) * (H(X[i, :], theta) - Y[i]) ** 2
    return loss

#----------------#
#   hypothesis   #
#----------------#
def H(x, theta):
    return nd.dot(x, theta)

#----------------------#
#   gradient descent   #
#----------------------#
def GradientDescent(X, Y, theta, alpha, nr_epochs):
    Loss = nd.zeros(nr_epochs)                # values of the loss function over iterations
    for epoch in range(nr_epochs):
        with autograd.record():
            loss = LossFunction(X, Y, theta)
        loss.backward()
        Loss[epoch] = loss
        for j in range(len(theta)):
            theta[j] = theta[j] - alpha * theta.grad[j]
    return theta, Loss

if __name__ == "__main__":
    main()
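For reference, the 2 s analytic-gradient version is essentially the same loop with the gradient written in closed form. A rough sketch (not my exact code) of that baseline, using the same variable names as above:

# closed-form baseline: gradient of the squared loss computed by hand, no autograd
for epoch in range(nr_epochs):
    error = nd.dot(X, theta) - Y         # residuals, shape (m,)
    grad = nd.dot(X.T, error) / m        # analytic gradient: (1/m) * X^T (X*theta - Y)
    theta = theta - alpha * grad         # vanilla gradient descent step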
The bottleneck is the call to
theta, Loss = GradientDescent(X, Y, theta, alpha, nr_epochs)
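For what it's worth, I time that call roughly like this (a sketch; nd.waitall() is there because MXNet executes NDArray operations asynchronously):

import time
start = time.time()
theta, Loss = GradientDescent(X, Y, theta, alpha, nr_epochs)
nd.waitall()                             # block until MXNet's async engine has finished all work
print("elapsed: %.1f s" % (time.time() - start))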
Am I doing something wrong? I've seen other examples that run much faster than mine; is there anything I could change to reduce the running time? Thanks!
The issue is that you are looping over your arrays one element at a time: the per-sample loop in LossFunction records thousands of tiny scalar operations in the autograd graph, and the per-coordinate loop in the parameter update adds even more. You should use vectorized NDArray operations instead, so that each epoch is a handful of large array operations.
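Concretely, those two loops collapse into single NDArray expressions (a sketch, reusing the names from your code):

# loss over all samples in one matrix-vector product
loss = (1 / (2 * m)) * ((nd.dot(X, theta) - Y) ** 2).sum()

# parameter update in one array operation
theta = theta - alpha * theta.grad

Here is a complete, self-contained example (with random data in place of dataset.txt):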
from mxnet import nd, autograd, gluon
import pandas as pd


def main():
    # learning algorithm parameters
    nr_epochs = 1000
    alpha = 0.01

    m = 10000
    n = 50
    X = nd.random.uniform(shape=(m, n))
    Y = nd.random.uniform(shape=(m, 1))       # array with y values

    X.attach_grad()
    Y.attach_grad()
    theta = nd.zeros((n, 1))                  # initial parameters array
    theta.attach_grad()                       # declare gradient with respect to theta is needed
    # ----------------------------------------------------------
    theta, Loss = GradientDescent(X, Y, theta, alpha, nr_epochs)
    # ----------------------------------------------------------

    print("Theta by gradient descent:")
    print(theta)

#--------------#
#   END MAIN   #
#--------------#

#-------------------#
#   loss function   #
#-------------------#
def LossFunction(X, Y, theta):
    m = X.shape[0]                            # number of training samples
    loss = (1 / (2 * m)) * ((nd.dot(X, theta) - Y) ** 2).sum()
    return loss

#----------------------#
#   gradient descent   #
#----------------------#
def GradientDescent(X, Y, theta, alpha, nr_epochs):
    Loss = nd.zeros(nr_epochs)                # values of the loss function over iterations
    for epoch in range(nr_epochs):
        theta.attach_grad()
        with autograd.record():
            loss = LossFunction(X, Y, theta)
        loss.backward()
        Loss[epoch] = loss.asnumpy()
        theta = theta - alpha * theta.grad
    return theta, Loss

if __name__ == '__main__':
    main()
This example runs in about 1 second with 10,000 rows and 50 dimensions.
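One small design note: because theta = theta - alpha * theta.grad creates a brand-new array, the loop above has to call theta.attach_grad() again on every epoch (and attaching gradients to X and Y is not actually needed, since only theta is optimized). A common alternative, sketched below, is to update theta in place outside the recording scope, so attach_grad is called only once:

theta = nd.zeros((n, 1))
theta.attach_grad()                           # attach once, before the training loop
for epoch in range(nr_epochs):
    with autograd.record():
        loss = LossFunction(X, Y, theta)
    loss.backward()
    theta[:] = theta - alpha * theta.grad     # in-place update keeps the attached grad buffer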