I was trying to lower my mean squared error cost by scaling the target variable, mainly because it reaches values on the order of 1e10.
I use this Kaggle dataset to predict land price, with X = LT (land area) and Y = Harga (price): https://www.kaggle.com/datasets/wisnuanggara/daftar-harga-rumah
The code I used to load the data into NumPy arrays:
import os
import openpyxl
from openpyxl import Workbook
import numpy as np
wb = openpyxl.load_workbook('DATA RUMAH.xlsx')
ws = wb.active
y_train_data = np.array([])
x_train_data = np.array([])
def get_x_train():
    x_train = np.array([])  # initialize x_train as a local variable
    for x in range(2, 1011):
        data = ws.cell(row=x, column=5).value
        x_train = np.append(x_train, data)
    return x_train

def get_y_train():
    y_train = np.array([])  # initialize y_train as a local variable
    for y in range(2, 1011):
        data = ws.cell(row=y, column=3).value
        y_train = np.append(y_train, data)
    return y_train
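Not the cause of the problem, but worth noting: np.append copies the whole array on every call, so building the arrays this way is quadratic in the number of rows. Below is a minimal sketch of an equivalent loader that collects Python lists first and converts once at the end, assuming the same sheet layout as above (header in row 1, Harga in column 3, LT in column 5); load_columns is just an illustrative name:

import numpy as np
import openpyxl

def load_columns(path='DATA RUMAH.xlsx'):
    """Read LT (column 5) and Harga (column 3) for rows 2..1010 into float arrays."""
    ws = openpyxl.load_workbook(path).active
    x_vals, y_vals = [], []
    for row in ws.iter_rows(min_row=2, max_row=1010, values_only=True):
        x_vals.append(row[4])   # column 5 = LT
        y_vals.append(row[2])   # column 3 = Harga
    return np.asarray(x_vals, dtype=np.float64), np.asarray(y_vals, dtype=np.float64)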
Linear regression & Gradient Descent Code:
import math, copy
import numpy as np
import matplotlib.pyplot as plt
from excltool import *
import pandas as pd
import seaborn as sns
%matplotlib inline
# Load our data set
x_train = get_x_train()#features
y_train = get_y_train() #target value
mean = np.mean(y_train)
ymin = np.min(y_train)
ymax = np.max(y_train)
y_train = (y_train - ymin) / (ymax - ymin)  # min-max scale the target to [0, 1]
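# (Sanity check, illustrative only: scaling and then inverting with the same
#  ymin/ymax should reproduce the original prices up to floating-point error.)
y_raw = get_y_train()                         # re-read the unscaled target just for this check
y_back = ((y_raw - ymin) / (ymax - ymin)) * (ymax - ymin) + ymin
assert np.allclose(y_back, y_raw)             # round trip recovers the original values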
# Function to calculate the cost
def compute_cost(x, y, w, b):
    m = x.shape[0]
    cost = np.float64(0)
    for i in range(m):
        f_wb = w * x[i] + b
        cost = cost + (f_wb - y[i]) ** 2
    total_cost = np.float64(1 / (2 * m) * cost)
    return total_cost
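# (Side note, not part of the fix: the same cost can be computed without the
#  Python loop, which matters when running a million iterations. Equivalent sketch:)
def compute_cost_vectorized(x, y, w, b):
    # mean squared error with the 1/(2m) convention, same value as compute_cost
    errors = w * x + b - y
    return np.dot(errors, errors) / (2 * x.shape[0])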
def compute_gradient(x, y, w, b):
    """
    Computes the gradient for linear regression
    Args:
      x (ndarray (m,)): Data, m examples
      y (ndarray (m,)): target values
      w,b (scalar)    : model parameters
    Returns:
      dj_dw (scalar): The gradient of the cost w.r.t. the parameter w
      dj_db (scalar): The gradient of the cost w.r.t. the parameter b
    """
    # Number of training examples
    m = x.shape[0]
    dj_dw = np.float64(0)
    dj_db = np.float64(0)
    for i in range(m):
        f_wb = w * x[i] + b
        dj_dw_i = (f_wb - y[i]) * x[i]
        dj_db_i = f_wb - y[i]
        dj_db += dj_db_i
        dj_dw += dj_dw_i
    dj_dw = dj_dw / m
    dj_db = dj_db / m
    return dj_dw, dj_db
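# (Same idea for the gradient: an equivalent vectorized sketch, for reference only.)
def compute_gradient_vectorized(x, y, w, b):
    # identical gradients to compute_gradient, computed with array operations
    errors = w * x + b - y
    m = x.shape[0]
    return np.dot(errors, x) / m, np.sum(errors) / m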
def gradient_descent(x, y, w_in, b_in, alpha, num_iters, cost_function, gradient_function):
    """
    Performs gradient descent to fit w,b. Updates w,b by taking
    num_iters gradient steps with learning rate alpha
    Args:
      x (ndarray (m,))  : Data, m examples
      y (ndarray (m,))  : target values
      w_in,b_in (scalar): initial values of model parameters
      alpha (float)     : learning rate
      num_iters (int)   : number of iterations to run gradient descent
      cost_function     : function to call to produce cost
      gradient_function : function to call to produce gradient
    Returns:
      w (scalar)      : updated value of parameter after running gradient descent
      b (scalar)      : updated value of parameter after running gradient descent
      J_history (list): history of cost values
      p_history (list): history of parameters [w, b]
    """
    # Specify data type as np.float64 for w, b
    w = np.float64(w_in)
    b = np.float64(b_in)
    # Lists to store cost J and parameters at each iteration, primarily for graphing later
    J_history = []
    p_history = []
    for i in range(num_iters):
        # Calculate the gradient and update the parameters
        dj_dw, dj_db = gradient_function(x, y, w, b)
        b = b - alpha * dj_db
        w = np.float64(w - alpha * dj_dw)
        # Save cost J and parameters at each iteration
        J_history.append(cost_function(x, y, w, b))
        p_history.append([w, b])
        # Print the cost at roughly 100 evenly spaced intervals
        if i % math.ceil(num_iters / 100) == 0:
            print(f"Iteration {i:4}: Cost {J_history[-1]:0.2e} ",
                  f"dj_dw: {dj_dw: 0.3e}, dj_db: {dj_db: 0.3e} ",
                  f"w: {w: 0.3e}, b: {b: 0.5e}")
    return w, b, J_history, p_history  # return w, b and the histories for graphing
# Initialize parameters with np.float64 data type
w_init = np.float64(0)
b_init = np.float64(0)
# Some gradient descent settings
iterations = 1000000
tmp_alpha = np.float64(1.0e-10)
# Run gradient descent
w_final, b_final, J_hist, p_hist = gradient_descent(x_train, y_train, w_init, b_init, tmp_alpha,
                                                    iterations, compute_cost, compute_gradient)
# Print the result
print(f"(w, b) found by gradient descent: ({w_final}, {b_final})")
I got these results at the end:
Iteration 950000: Cost 2.24e-03 dj_dw: -9.486e-03, dj_db: 3.615e-03 w: 4.850e-04, b: 9.52354e-07
Iteration 960000: Cost 2.24e-03 dj_dw: -8.682e-03, dj_db: 3.617e-03 w: 4.850e-04, b: 9.48737e-07
Iteration 970000: Cost 2.24e-03 dj_dw: -7.946e-03, dj_db: 3.619e-03 w: 4.850e-04, b: 9.45119e-07
Iteration 980000: Cost 2.24e-03 dj_dw: -7.273e-03, dj_db: 3.621e-03 w: 4.850e-04, b: 9.41499e-07
Iteration 990000: Cost 2.24e-03 dj_dw: -6.657e-03, dj_db: 3.623e-03 w: 4.850e-04, b: 9.37877e-07
(w, b) found by gradient descent: (0.00048503387319465645, 9.34254408473887e-07)
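As a cross-check, a closed-form least-squares fit on the same data gives a baseline to compare the gradient-descent result against (np.polyfit with degree 1 returns the slope and intercept); this is just a diagnostic sketch:

w_ls, b_ls = np.polyfit(x_train, y_train, 1)   # closed-form least-squares slope and intercept
print(f"gradient descent: w = {w_final:.6e}, b = {b_final:.6e}")
print(f"least squares:    w = {w_ls:.6e}, b = {b_ls:.6e}")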
Then I descaled y_train:
import numpy as np
import matplotlib.pyplot as plt
from sklearn.linear_model import LinearRegression
# Assuming you have x_train, y_train (already scaled), w_final, and b_final
# Descale y_train using the min-max parameters (ymin and ymax) computed from the original, unscaled target
y_train_descaled = y_train * (ymax - ymin) + ymin
# Compute the predicted values based on the descaled y_train
predictions = w_final * x_train + b_final
# Descale the predictions using the same min-max parameters
predictions_descaled = predictions * (ymax - ymin) + ymin
# Plot the original x_train and descaled y_train
plt.scatter(x_train, y_train_descaled, label='Original Data')
# Plot the predicted values
plt.plot(x_train, predictions_descaled, color='red', label='Predicted Values')
plt.xlabel('x_train')
plt.ylabel('y_train')
plt.title('Descaled y_train vs. Predicted Values')
plt.legend()
plt.show()
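For reference, the inverse transform used above and for the single prediction below, pulled into a small helper so the same parameters are guaranteed to be reused (unscale_y is just an illustrative name; ymin and ymax must be the values computed from the unscaled target):

def unscale_y(y_scaled, ymin, ymax):
    # invert the min-max scaling: map [0, 1] values back to the original price units
    return y_scaled * (ymax - ymin) + ymin

# e.g. predictions_descaled = unscale_y(w_final * x_train + b_final, ymin, ymax)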
Then I made my own prediction:
prediction = w_final*1 + b_final
prediction_descaled = prediction * (ymax - ymin) + ymin
print(prediction_descaled)
This results in -0.11096116854066342, which shouldn't even be negative. If I don't scale at all, everything works fine, but my cost reaches 9e18; I wanted to make it lower so I can present it better.
I think I messed up the descaling process.
EDIT: I also tried scaling my x:
mean = np.mean(x_train)
xmin = np.min(x_train)
xmax = np.max(x_train)
x_train = np.array([(i - xmin) / (xmax - xmin) for i in x_train])
and then plotted the descaled versions of both:
import numpy as np
import matplotlib.pyplot as plt
from sklearn.linear_model import LinearRegression
# Assuming you have x_train, y_train (already scaled), w_final, and b_final
# Descale y_train and x_train using their min-max scaling parameters
y_train_descaled = y_train * (ymax - ymin) + ymin
x_train_descaled = x_train * (xmax - xmin) + xmin
# Compute the predicted values based on the descaled y_train
predictions = w_final * x_train + b_final
# Descale the predictions using the min-max scaling parameters (ymin and ymax)
predictions_descaled = predictions * (ymax - ymin) + ymin
# Plot the original x_train and descaled y_train
plt.scatter(x_train_descaled, y_train_descaled, label='Original Data')
# Plot the predicted values
plt.plot(x_train_descaled, predictions_descaled, color='red', label='Predicted Values')
plt.xlabel('x_train')
plt.ylabel('y_train')
plt.title('Descaled y_train vs. Predicted Values')
plt.legend()
plt.show()
and tried a prediction with my own input, x = 1:
scaled_input = (1 - xmin) / (xmax - xmin)
prediction = w_final*scaled_input + b_final
prediction_descaled = prediction * (ymax - ymin) + ymin
print(prediction_descaled)
but I got a result of 92720747, which is too big, since the non-scaled code output a more reasonable 33047038.
Looking at the plot you provided, it seems like any x value below ~220 will result in a y value below 0. The negative value you're getting matches what the red line shows.
There may be an issue with scaling. You're scaling y, but not x, and I think it's usually more important to scale x. Assuming x is a matrix of shape (samples, features), you can scale x as follows:
# Scaling the training features
# x_train.ptp() calculates x.max() - x.min()
x_train_scaled = (x_train - x_train.min(axis=0)) / x_train.ptp(axis=0)
# fit the model using x_train_scaled...
This will scale all the columns (features) in one go. Then, when you want to make a prediction with a new x, scale the new x using the training x values:
x_pred_scaled = (x_pred - x_train.min(axis=0)) / x_train.ptp(axis=0)
# y_pred = x_pred_scaled @ w + intercept
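For concreteness, here is a minimal sketch of those two commented steps for the single-feature case from the question (the reshape is only there because scikit-learn expects a 2-D X; any other linear fit works the same way):

from sklearn.linear_model import LinearRegression
import numpy as np

# Fit on the scaled feature
model = LinearRegression()
model.fit(x_train_scaled.reshape(-1, 1), y_train)
w, intercept = model.coef_, model.intercept_

# Scale a new input with the *training* min and ptp, then predict
x_pred = np.array([1.0])
x_pred_scaled = (x_pred - x_train.min(axis=0)) / x_train.ptp(axis=0)
y_pred = x_pred_scaled.reshape(-1, 1) @ w + intercept   # same result as model.predict(...)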
What I described above is just for x. In the code you provided, you were scaling y as well, and the way you scaled/unscaled y looks correct.
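If you scale both x and y, another option is to fold the scaling back into the fitted parameters once, so predictions work directly in the original units. A sketch for the single-feature case, where w and b are the parameters fitted on the scaled data and xmin, xmax, ymin, ymax come from the unscaled training data:

x_range = xmax - xmin
y_range = ymax - ymin
w_orig = w * y_range / x_range                 # slope in original units
b_orig = y_range * b - w_orig * xmin + ymin    # intercept in original units
# now y_pred = w_orig * x + b_orig for any unscaled x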
If your cost is reaching 9e18, something may be going wrong, like an (almost) division by zero. At the last iteration your cost was down to 2.24e-03, so please clarify where the 9e18 comes from.
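A couple of quick data checks are worth running before blaming the optimizer (illustrative only): look for NaN/inf values, and for a zero range, which would make the min-max division blow up:

print(np.isfinite(x_train).all(), np.isfinite(y_train).all())   # any NaN or inf?
print(x_train.ptp(axis=0), y_train.ptp())                       # neither should be (near) zero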