I'm following Andrew Ng Coursera course on Machine Learning and I tried to implement the Gradient Descent Algorithm in Python. I'm having trouble with the y-intercept parameter because it doesn't look like to go to the best value. Here's my code:
# IMPORTS
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline
# Acquiring Data
# Source: https://github.com/mattnedrich/GradientDescentExample
data = pd.read_csv('data.csv')
def cost_function(a, b, x_values, y_values):
'''
Calculates the square mean error for a given dataset
with (x,y) pairs and the model y' = a + bx
a: y-intercept for the model
b: slope of the curve
x_values, y_values: points (x,y) of the dataset
'''
data_len = len(x_values)
total_error = sum([((a + b * x_values[i]) - y_values[i])**2
for i in range(data_len)])
return total_error / (2 * float(data_len))
def a_gradient(a, b, x_values, y_values):
'''
Partial derivative of the cost_function with respect to 'a'
a, b: values for 'a' and 'b'
x_values, y_values: points (x,y) of the dataset
'''
data_len = len(x_values)
a_gradient = sum([((a + b * x_values[i]) - y_values[i])
for i in range(data_len)])
return a_gradient / float(data_len)
def b_gradient(a, b, x_values, y_values):
'''
Partial derivative of the cost_function with respect to 'b'
a, b: values for 'a' and 'b'
x_values, y_values: points (x,y) of the dataset
'''
data_len = len(x_values)
b_gradient = sum([(((a + b * x_values[i]) - y_values[i]) * x_values[i])
for i in range(data_len)])
return b_gradient / float(data_len)
def gradient_descent_step(a_current, b_current, x_values, y_values, alpha):
'''
Give a step in direction of the minimum of the cost_function using
the 'a' and 'b' gradiants. Return new values for 'a' and 'b'.
a_current, b_current: the current values for 'a' and 'b'
x_values, y_values: points (x,y) of the dataset
'''
new_a = a_current - alpha * a_gradient(a_current, b_current, x_values, y_values)
new_b = b_current - alpha * b_gradient(a_current, b_current, x_values, y_values)
return (new_a, new_b)
def run_gradient_descent(a, b, x_values, y_values, alpha, precision, plot=False, verbose=False):
'''
Runs the gradient_descent_step function and updates (a,b) until
the value of the cost function varies less than 'precision'.
a, b: initial values for the point a and b in the cost_function
x_values, y_values: points (x,y) of the dataset
alpha: learning rate for the algorithm
precision: value for the algorithm to stop calculation
'''
iterations = 0
delta_cost = cost_function(a, b, x_values, y_values)
error_list = [delta_cost]
iteration_list = [0]
# The loop runs until the delta_cost reaches the precision defined
# When the variation in cost_function is small it means that the
# the function is near its minimum and the parameters 'a' and 'b'
# are a good guess for modeling the dataset.
while delta_cost > precision:
iterations += 1
iteration_list.append(iterations)
# Calculates the initial error with current a,b values
prev_cost = cost_function(a, b, x_values, y_values)
# Calculates new values for a and b
a, b = gradient_descent_step(a, b, x_values, y_values, alpha)
# Updates the value of the error
actual_cost = cost_function(a, b, x_values, y_values)
error_list.append(actual_cost)
# Calculates the difference between previous and actual error values.
delta_cost = prev_cost - actual_cost
# Plot the error in each iteration to see how it decreases
# and some information about our final results
if plot:
plt.plot(iteration_list, error_list, '-')
plt.title('Error Minimization')
plt.xlabel('Iteration',fontsize=12)
plt.ylabel('Error',fontsize=12)
plt.show()
if verbose:
print('Iterations = ' + str(iterations))
print('Cost Function Value = '+ str(cost_function(a, b, x_values, y_values)))
print('a = ' + str(a) + ' and b = ' + str(b))
return (actual_cost, a, b)
When I run the algorithm with:
run_gradient_descent(0, 0, data['x'], data['y'], 0.0001, 0.01)
I get (a = 0.0496688656535 and b = 1.47825808018)
But the best value for 'a' is around 7.9 (tried another resources for linear regression).
Also, if I change the initial guess for the parameter 'a' the algorithm simply try to adjust the parameter 'b'.
For example, if I set a = 200 and b = 0
run_gradient_descent(200, 0, data['x'], data['y'], 0.0001, 0.01)
I get (a = 199.933763331 and b = -2.44824996193)
I couldn't find anything wrong with the code and I realized that the problem is the initial guess for a
parameter. See my own answer above where I defined a helper function to get a range for search some values for initial a
guess.
The complete code for my Gradient Descent implementation could be found on my Github repository: Gradient Descent for Linear Regression
Thinking about what @relay said that the Gradient Descent algorithm does not guarantee to find the global minima I tried to come up with an helper function to limit guesses for the parameter a
in a certain search range, as follows:
def search_range(x, y, plot=False):
'''
Given a dataset with points (x, y) searches for a best guess for
initial values of 'a'.
'''
data_lenght = len(x) # Total size of of the dataset
q_lenght = int(data_lenght / 4) # Size of a quartile of the dataset
# Finding the max and min value for y in the first quartile
min_Q1 = (x[0], y[0])
max_Q1 = (x[0], y[0])
for i in range(q_lenght):
temp_point = (x[i], y[i])
if temp_point[1] < min_Q1[1]:
min_Q1 = temp_point
if temp_point[1] > max_Q1[1]:
max_Q1 = temp_point
# Finding the max and min value for y in the 4th quartile
min_Q4 = (x[data_lenght - 1], y[data_lenght - 1])
max_Q4 = (x[data_lenght - 1], y[data_lenght - 1])
for i in range(data_lenght - 1, data_lenght - q_lenght, -1):
temp_point = (x[i], y[i])
if temp_point[1] < min_Q4[1]:
min_Q4 = temp_point
if temp_point[1] > max_Q4[1]:
max_Q4 = temp_point
mean_Q4 = (((min_Q4[0] + max_Q4[0]) / 2), ((min_Q4[1] + max_Q4[1]) / 2))
# Finding max_y and min_y given the points found above
# Two lines need to be defined, L1 and L2.
# L1 will pass through min_Q1 and mean_Q4
# L2 will pass through max_Q1 and mean_Q4
# Calculatin slope for L1 and L2 given m = Delta(y) / Delta (x)
slope_L1 = (min_Q1[1] - mean_Q4[1]) / (min_Q1[0] - mean_Q4[0])
slope_L2 = (max_Q1[1] - mean_Q4[1]) / (max_Q1[0] -mean_Q4[0])
# Calculating y-intercepts for L1 and L2 given line equation in the form y = mx + b
# Float numbers are converted to int because they will be used as range for itaration
y_L1 = int(min_Q1[1] - min_Q1[0] * slope_L1)
y_L2 = int(max_Q1[1] - max_Q1[0] * slope_L2)
# Ploting L1 and L2
if plot:
L1 = [(y_L1 + slope_L1 * x) for x in data['x']]
L2 = [(y_L2 + slope_L2 * x) for x in data['x']]
plt.plot(data['x'], data['y'], '.')
plt.plot(data['x'], L1, '-', color='r')
plt.plot(data['x'], L2, '-', color='r')
plt.title('Scatterplot of Sample Data')
plt.xlabel('x',fontsize=12)
plt.ylabel('y',fontsize=12)
plt.show()
return y_L1, y_L2
The idea is to run the gradient descent with guesses for a
within the range given by search_range() function and get the minimum possible value for the cost_function(). The new way to run the gradient descente becomes:
def run_search_gradient_descent(x_values, y_values, alpha, precision, verbose=False):
'''
Runs the gradient_descent_step function and updates (a,b) until
the value of the cost function varies less than 'precision'.
x_values, y_values: points (x,y) of the dataset
alpha: learning rate for the algorithm
precision: value for the algorithm to stop calculation
'''
from math import inf
a1, a2 = search_range(x_values, y_values)
best_guess = [inf, 0, 0]
for a in range(a1, a2):
cost, linear_coef, slope = run_gradient_descent(a, 0, x_values, y_values, alpha, precision)
# Saving value for cost_function and parameters (a,b)
if cost < best_guess[0]:
best_guess = [cost, linear_coef, slope]
if verbose:
print('Cost Function = ' + str(best_guess[0]))
print('a = ' + str(best_guess[1]) + ' and b = ' + str(best_guess[2]))
return (best_guess[0], best_guess[1], best_guess[2])
Running the code
run_search_gradient_descent(data['x'], data['y'], 0.0001, 0.001, verbose=True)
I've got:
Cost Function = 55.1294483959
a = 8.02595996606 and b = 1.3209768383
For comparison, using the linear regression from scipy.stats it returned
a = 7.99102098227and b = 1.32243102276