So I was trying out manual gradient descent with large numbers and got "overflow encountered in scalar power".
I use this dataset from Kaggle to predict land price, with X = LT (land area) and Y = Harga (price): https://www.kaggle.com/datasets/wisnuanggara/daftar-harga-rumah
The code I used to read the data into NumPy arrays:
import os
import openpyxl
from openpyxl import Workbook
import numpy as np

wb = openpyxl.load_workbook('DATA RUMAH.xlsx')
ws = wb.active

y_train_data = np.array([])
x_train_data = np.array([])

def get_x_train():
    x_train = np.array([])  # Initialize x_train as a local variable
    for x in range(2, 1011):
        data = ws.cell(row=x, column=5).value
        x_train = np.append(x_train, data)
    return x_train

def get_y_train():
    y_train = np.array([])  # Initialize y_train as a local variable
    for y in range(2, 1011):
        data = ws.cell(row=y, column=3).value
        y_train = np.append(y_train, data)
    return y_train
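(I know np.append inside a loop re-allocates the array on every call; collecting the cell values in a list and converting once at the end is an equivalent, cleaner way to load the same columns. The get_column helper and the float64 cast below are just a sketch of that, not what I actually ran.)

def get_column(col):
    # Read rows 2..1010 of the given column into one float64 array.
    values = [ws.cell(row=r, column=col).value for r in range(2, 1011)]
    return np.array(values, dtype=np.float64)

x_train = get_column(5)  # LT (land area)
y_train = get_column(3)  # Harga (price)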
Full Code
import math, copy
import numpy as np
import matplotlib.pyplot as plt
plt.style.use('./deeplearning.mplstyle')
from lab_utils_uni import plt_house_x, plt_contour_wgrad, plt_divergence, plt_gradients
# Load our data set
x_train = get_x_train() #features
y_train = get_y_train() #target value
# Function to calculate the cost
def compute_cost(x, y, w, b):
    m = x.shape[0]
    cost = 0
    for i in range(m):
        f_wb = w * x[i] + b
        cost = cost + (f_wb - y[i])**2
    total_cost = 1 / (2 * m) * cost
    return total_cost
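
# For reference, the same cost can be computed without the Python loop,
# using NumPy broadcasting. This vectorized form is only a sketch of an
# equivalent computation and is not called anywhere below.
def compute_cost_vectorized(x, y, w, b):
    m = x.shape[0]
    errors = w * x + b - y              # element-wise prediction error
    return np.dot(errors, errors) / (2 * m)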
def compute_gradient(x, y, w, b):
    """
    Computes the gradient for linear regression
    Args:
      x (ndarray (m,)): Data, m examples
      y (ndarray (m,)): target values
      w,b (scalar)    : model parameters
    Returns:
      dj_dw (scalar): The gradient of the cost w.r.t. the parameter w
      dj_db (scalar): The gradient of the cost w.r.t. the parameter b
    """
    # Number of training examples
    m = x.shape[0]

    dj_dw = 0
    dj_db = 0
    for i in range(m):
        f_wb = w * x[i] + b
        dj_dw_i = (f_wb - y[i]) * x[i]
        dj_db_i = f_wb - y[i]
        dj_db += dj_db_i
        dj_dw += dj_dw_i
    dj_dw = dj_dw / m
    dj_db = dj_db / m

    return dj_dw, dj_db
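
# Likewise, a vectorized sketch of the same gradient computation
# (equivalent to the loop above; also not called anywhere below).
def compute_gradient_vectorized(x, y, w, b):
    m = x.shape[0]
    errors = w * x + b - y              # element-wise prediction error
    dj_dw = np.dot(errors, x) / m
    dj_db = np.sum(errors) / m
    return dj_dw, dj_db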
def gradient_descent(x, y, w_in, b_in, alpha, num_iters, cost_function, gradient_function):
    """
    Performs gradient descent to fit w,b. Updates w,b by taking
    num_iters gradient steps with learning rate alpha
    Args:
      x (ndarray (m,))  : Data, m examples
      y (ndarray (m,))  : target values
      w_in,b_in (scalar): initial values of model parameters
      alpha (float)     : Learning rate
      num_iters (int)   : number of iterations to run gradient descent
      cost_function     : function to call to produce cost
      gradient_function : function to call to produce gradient
    Returns:
      w (scalar)      : Updated value of parameter after running gradient descent
      b (scalar)      : Updated value of parameter after running gradient descent
      J_history (List): History of cost values
      p_history (list): History of parameters [w,b]
    """
    # Specify data type as np.float64 for w, b
    w = np.float64(w_in)
    b = np.float64(b_in)

    # Lists to store cost J and parameters at each iteration, primarily for graphing later
    J_history = []
    p_history = []

    for i in range(num_iters):
        # Calculate the gradient using gradient_function
        dj_dw, dj_db = gradient_function(x, y, w, b)

        # Update the parameters with the gradient descent update rule
        b = b - alpha * dj_db
        w = w - alpha * dj_dw

        # Save cost J and parameters at each iteration
        J_history.append(cost_function(x, y, w, b))
        p_history.append([w, b])

        # Print cost at 10 evenly spaced intervals (or every iteration if num_iters < 10)
        if i % math.ceil(num_iters / 10) == 0:
            print(f"Iteration {i:4}: Cost {J_history[-1]:0.2e} ",
                  f"dj_dw: {dj_dw: 0.3e}, dj_db: {dj_db: 0.3e} ",
                  f"w: {w: 0.3e}, b: {b: 0.5e}")

    return w, b, J_history, p_history  # Return w, b and the J, p histories for graphing
# Initialize parameters with np.float64 data type
w_init = np.float64(0)
b_init = np.float64(0)
# Some gradient descent settings
iterations = 100000
tmp_alpha = np.float64(1.0e-4)
# Run gradient descent
w_final, b_final, J_hist, p_hist = gradient_descent(x_train, y_train, w_init, b_init, tmp_alpha,
iterations, compute_cost, compute_gradient)
# Print the result
print(f"(w, b) found by gradient descent: ({w_final:8.4f}, {b_final:8.4f})")
Output:
RuntimeWarning: overflow encountered in scalar add
cost = np.float64(cost + (f_wb - y[i])**2)
RuntimeWarning: overflow encountered in scalar power
cost = np.float64(cost + (f_wb - y[i])**2)
RuntimeWarning: overflow encountered in scalar add
dj_dw += np.float64(dj_dw_i)
RuntimeWarning: invalid value encountered in scalar subtract
w = np.float64(w - alpha * dj_dw)
I tried to normalize, but I think it skews the data too much. How do I make it so that gradient descent can process such huge numbers?
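For reference, this is roughly the kind of scaling I mean (just a sketch: the zscore_normalize helper, the 1.0e-1 learning rate, and the 1000 iterations are placeholders, and the last two lines map the learned parameters back to the original scale so the result isn't skewed):

def zscore_normalize(x):
    # Scale a 1-D feature to zero mean and unit variance.
    mu = x.mean()
    sigma = x.std()
    return (x - mu) / sigma, mu, sigma

x_norm, mu, sigma = zscore_normalize(x_train)

# Run the same gradient descent on the scaled feature; a much larger
# learning rate is usually usable here (the values below are guesses).
w_norm, b_norm, J_hist_n, p_hist_n = gradient_descent(
    x_norm, y_train, 0.0, 0.0, 1.0e-1, 1000, compute_cost, compute_gradient)

# Undo the scaling on the parameters:
#   y = w_norm * (x - mu) / sigma + b_norm
#     = (w_norm / sigma) * x + (b_norm - w_norm * mu / sigma)
w_orig = w_norm / sigma
b_orig = b_norm - w_norm * mu / sigma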
I changed my learning rate and iterations to:
iterations = 1000000
tmp_alpha = np.float64(1.0e-10)
And it worked:
Iteration 0: Cost 5.60e+19 dj_dw: -2.878e+12, dj_db: -7.626e+09 w: 2.878e+02, b: 7.62614e-01
Iteration 100000: Cost 1.72e+19 dj_dw: -1.186e+12, dj_db: -3.097e+09 w: 1.909e+07, b: 5.03158e+04
Iteration 200000: Cost 1.06e+19 dj_dw: -4.890e+11, dj_db: -1.231e+09 w: 2.696e+07, b: 7.05947e+04
Iteration 300000: Cost 9.51e+18 dj_dw: -2.015e+11, dj_db: -4.613e+08 w: 3.020e+07, b: 7.84937e+04
Iteration 400000: Cost 9.32e+18 dj_dw: -8.307e+10, dj_db: -1.442e+08 w: 3.154e+07, b: 8.12901e+04
Iteration 500000: Cost 9.29e+18 dj_dw: -3.424e+10, dj_db: -1.351e+07 w: 3.209e+07, b: 8.19834e+04
Iteration 600000: Cost 9.29e+18 dj_dw: -1.411e+10, dj_db: 4.036e+07 w: 3.231e+07, b: 8.18099e+04
Iteration 700000: Cost 9.29e+18 dj_dw: -5.816e+09, dj_db: 6.256e+07 w: 3.241e+07, b: 8.12791e+04
Iteration 800000: Cost 9.29e+18 dj_dw: -2.397e+09, dj_db: 7.172e+07 w: 3.245e+07, b: 8.06010e+04
Iteration 900000: Cost 9.29e+18 dj_dw: -9.883e+08, dj_db: 7.549e+07 w: 3.246e+07, b: 7.98622e+04
(w, b) found by gradient descent: (32469417.2368, 79098.4748)
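To check whether a run like this has actually converged, plotting the recorded cost history is a quick sanity check (a small sketch using the matplotlib import already in the code above):

plt.plot(J_hist)                 # cost recorded at every iteration
plt.xlabel("Iteration")
plt.ylabel("Cost J(w,b)")
plt.title("Cost vs. iteration")
plt.show()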