So I was trying out manual gradient descent with large numbers and got "overflow encountered in scalar power".
I use this dataset from Kaggle to predict land price, with X = LT (land area) and Y = Harga (price): https://www.kaggle.com/datasets/wisnuanggara/daftar-harga-rumah
The code I used to read the data into NumPy arrays:
import os
import openpyxl
from openpyxl import Workbook
import numpy as np

wb = openpyxl.load_workbook('DATA RUMAH.xlsx')
ws = wb.active

y_train_data = np.array([])
x_train_data = np.array([])

def get_x_train():
    x_train = np.array([])  # Initialize x_train as a local variable
    for x in range(2, 1011):
        data = ws.cell(row=x, column=5).value
        x_train = np.append(x_train, data)
    return x_train

def get_y_train():
    y_train = np.array([])  # Initialize y_train as a local variable
    for y in range(2, 1011):
        data = ws.cell(row=y, column=3).value
        y_train = np.append(y_train, data)
    return y_train
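(I know np.append inside a loop re-allocates the array on every call; collecting the cell values in a list and converting once at the end is an equivalent, cleaner way to load the same columns. The get_column helper and the float64 cast below are just a sketch of that, not what I actually ran.)

def get_column(col):
    # Read rows 2..1010 of the given column into one float64 array.
    values = [ws.cell(row=r, column=col).value for r in range(2, 1011)]
    return np.array(values, dtype=np.float64)

x_train = get_column(5)  # LT (land area)
y_train = get_column(3)  # Harga (price)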
Full Code
import math, copy
import numpy as np
import matplotlib.pyplot as plt
plt.style.use('./deeplearning.mplstyle')
from lab_utils_uni import plt_house_x, plt_contour_wgrad, plt_divergence, plt_gradients
# Load our data set
x_train = get_x_train() #features
y_train = get_y_train() #target value
# Function to calculate the cost
def compute_cost(x, y, w, b):
    m = x.shape[0]
    cost = 0
    for i in range(m):
        f_wb = w * x[i] + b
        cost = cost + (f_wb - y[i])**2
    total_cost = 1 / (2 * m) * cost
    return total_cost
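
# For reference, the same cost can be computed without the Python loop,
# using NumPy broadcasting. This vectorized form is only a sketch of an
# equivalent computation and is not called anywhere below.
def compute_cost_vectorized(x, y, w, b):
    m = x.shape[0]
    errors = w * x + b - y              # element-wise prediction error
    return np.dot(errors, errors) / (2 * m)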
def compute_gradient(x, y, w, b):
    """
    Computes the gradient for linear regression
    Args:
      x (ndarray (m,)): Data, m examples
      y (ndarray (m,)): target values
      w,b (scalar)    : model parameters
    Returns:
      dj_dw (scalar): The gradient of the cost w.r.t. the parameter w
      dj_db (scalar): The gradient of the cost w.r.t. the parameter b
    """
    # Number of training examples
    m = x.shape[0]

    dj_dw = 0
    dj_db = 0
    for i in range(m):
        f_wb = w * x[i] + b
        dj_dw_i = (f_wb - y[i]) * x[i]
        dj_db_i = f_wb - y[i]
        dj_db += dj_db_i
        dj_dw += dj_dw_i
    dj_dw = dj_dw / m
    dj_db = dj_db / m

    return dj_dw, dj_db
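
# Likewise, a vectorized sketch of the same gradient computation
# (equivalent to the loop above; also not called anywhere below).
def compute_gradient_vectorized(x, y, w, b):
    m = x.shape[0]
    errors = w * x + b - y              # element-wise prediction error
    dj_dw = np.dot(errors, x) / m
    dj_db = np.sum(errors) / m
    return dj_dw, dj_db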
def gradient_descent(x, y, w_in, b_in, alpha, num_iters, cost_function, gradient_function):
    """
    Performs gradient descent to fit w,b. Updates w,b by taking
    num_iters gradient steps with learning rate alpha
    Args:
      x (ndarray (m,))  : Data, m examples
      y (ndarray (m,))  : target values
      w_in,b_in (scalar): initial values of model parameters
      alpha (float)     : Learning rate
      num_iters (int)   : number of iterations to run gradient descent
      cost_function     : function to call to produce cost
      gradient_function : function to call to produce gradient
    Returns:
      w (scalar)      : Updated value of parameter after running gradient descent
      b (scalar)      : Updated value of parameter after running gradient descent
      J_history (List): History of cost values
      p_history (list): History of parameters [w,b]
    """
    # Specify data type as np.float64 for w, b
    w = np.float64(w_in)
    b = np.float64(b_in)

    # Lists to store cost J and parameters at each iteration, primarily for graphing later
    J_history = []
    p_history = []

    for i in range(num_iters):
        # Calculate the gradient using gradient_function
        dj_dw, dj_db = gradient_function(x, y, w, b)

        # Update the parameters with the gradient descent update rule
        b = b - alpha * dj_db
        w = w - alpha * dj_dw

        # Save cost J and parameters at each iteration
        J_history.append(cost_function(x, y, w, b))
        p_history.append([w, b])

        # Print cost at 10 evenly spaced intervals (or every iteration if num_iters < 10)
        if i % math.ceil(num_iters / 10) == 0:
            print(f"Iteration {i:4}: Cost {J_history[-1]:0.2e} ",
                  f"dj_dw: {dj_dw: 0.3e}, dj_db: {dj_db: 0.3e} ",
                  f"w: {w: 0.3e}, b: {b: 0.5e}")

    return w, b, J_history, p_history  # Return w, b and the J, p histories for graphing
# Initialize parameters with np.float64 data type
w_init = np.float64(0)
b_init = np.float64(0)
# Some gradient descent settings
iterations = 100000
tmp_alpha = np.float64(1.0e-4)
# Run gradient descent
w_final, b_final, J_hist, p_hist = gradient_descent(x_train, y_train, w_init, b_init, tmp_alpha,
iterations, compute_cost, compute_gradient)
# Print the result
print(f"(w, b) found by gradient descent: ({w_final:8.4f}, {b_final:8.4f})")
Output:
RuntimeWarning: overflow encountered in scalar add
cost = np.float64(cost + (f_wb - y[i])**2)
RuntimeWarning: overflow encountered in scalar power
cost = np.float64(cost + (f_wb - y[i])**2)
RuntimeWarning: overflow encountered in scalar add
dj_dw += np.float64(dj_dw_i)
RuntimeWarning: invalid value encountered in scalar subtract
w = np.float64(w - alpha * dj_dw)
I tried to normalize, but I think it skews the data too much. How do I make it so that gradient descent can process such huge numbers?
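For reference, this is roughly the kind of scaling I mean (just a sketch: the zscore_normalize helper, the 1.0e-1 learning rate, and the 1000 iterations are placeholders, and the last two lines map the learned parameters back to the original scale so the result isn't skewed):

def zscore_normalize(x):
    # Scale a 1-D feature to zero mean and unit variance.
    mu = x.mean()
    sigma = x.std()
    return (x - mu) / sigma, mu, sigma

x_norm, mu, sigma = zscore_normalize(x_train)

# Run the same gradient descent on the scaled feature; a much larger
# learning rate is usually usable here (the values below are guesses).
w_norm, b_norm, J_hist_n, p_hist_n = gradient_descent(
    x_norm, y_train, 0.0, 0.0, 1.0e-1, 1000, compute_cost, compute_gradient)

# Undo the scaling on the parameters:
#   y = w_norm * (x - mu) / sigma + b_norm
#     = (w_norm / sigma) * x + (b_norm - w_norm * mu / sigma)
w_orig = w_norm / sigma
b_orig = b_norm - w_norm * mu / sigma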
I changed my learning rate and iterations to:
iterations = 1000000
tmp_alpha = np.float64(1.0e-10)
And it worked:
Iteration 0: Cost 5.60e+19 dj_dw: -2.878e+12, dj_db: -7.626e+09 w: 2.878e+02, b: 7.62614e-01
Iteration 100000: Cost 1.72e+19 dj_dw: -1.186e+12, dj_db: -3.097e+09 w: 1.909e+07, b: 5.03158e+04
Iteration 200000: Cost 1.06e+19 dj_dw: -4.890e+11, dj_db: -1.231e+09 w: 2.696e+07, b: 7.05947e+04
Iteration 300000: Cost 9.51e+18 dj_dw: -2.015e+11, dj_db: -4.613e+08 w: 3.020e+07, b: 7.84937e+04
Iteration 400000: Cost 9.32e+18 dj_dw: -8.307e+10, dj_db: -1.442e+08 w: 3.154e+07, b: 8.12901e+04
Iteration 500000: Cost 9.29e+18 dj_dw: -3.424e+10, dj_db: -1.351e+07 w: 3.209e+07, b: 8.19834e+04
Iteration 600000: Cost 9.29e+18 dj_dw: -1.411e+10, dj_db: 4.036e+07 w: 3.231e+07, b: 8.18099e+04
Iteration 700000: Cost 9.29e+18 dj_dw: -5.816e+09, dj_db: 6.256e+07 w: 3.241e+07, b: 8.12791e+04
Iteration 800000: Cost 9.29e+18 dj_dw: -2.397e+09, dj_db: 7.172e+07 w: 3.245e+07, b: 8.06010e+04
Iteration 900000: Cost 9.29e+18 dj_dw: -9.883e+08, dj_db: 7.549e+07 w: 3.246e+07, b: 7.98622e+04
(w, b) found by gradient descent: (32469417.2368, 79098.4748)
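To check whether a run like this has actually converged, plotting the recorded cost history is a quick sanity check (a small sketch using the matplotlib import already in the code above):

plt.plot(J_hist)                 # cost recorded at every iteration
plt.xlabel("Iteration")
plt.ylabel("Cost J(w,b)")
plt.title("Cost vs. iteration")
plt.show()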