I've tried to implement a gradient descent algorithm in Python for a machine learning problem. The dataset I'm working with has been preprocessed, and I observed an unexpected behavior in the runtime when comparing two datasets one with the original features and another with reduced dimensions of the former set using Singular Value Decomposition (SVD).I'm consistently observing that the runtime for the gradient descent algorithm is lower for the larger original dataset compared to the reduced one, contrary to my expectations. Shouldn't the runtime be shorter for the reduced dataset given its smaller size? I'm trying to understand why this is happening. Here's the relevant code snippets:
import time
import numpy as np
import matplotlib.pyplot as plt
def h_theta(X1, theta1):
# Implementation of hypothesis function
return np.dot(X1, theta1)
def j_theta(X1, y1, theta1):
# Implementation of cost function
return np.sum((h_theta(X1, theta1) - y1) ** 2) / (2 * X1.size)
def grad(X1, y1, theta):
# Calculation of gradient
gradient = np.dot(X1.T, h_theta(X1, theta) - y1) / len(y1)
return gradient
def gradient_descent(X1, y1):
theta_initial = np.zeros(X1.shape[1]) # Initialize theta with zeros
num_iterations = 1000
learning_rates = [0.1, 0.01, 0.001]
cost_iterations = []
theta_values = []
start = time.time()
for alpha in learning_rates:
theta = theta_initial.copy()
cost_history = []
for i in range(num_iterations):
gradient = grad(X1, y1, theta)
theta = theta - np.dot(alpha, gradient)
cost = j_theta(X1, y1, theta)
cost_history.append(cost)
cost_iterations.append(cost_history)
theta_values.append(theta)
end = time.time()
print(f"Time taken: {end - start} seconds")
fig, axs = plt.subplots(len(learning_rates), figsize=(8, 15))
for i, alpha in enumerate(learning_rates):
axs[i].plot(range(num_iterations), cost_iterations[i], label=f'alpha = {alpha}')
axs[i].set_title(f'Learning Rate: {alpha}')
axs[i].set_ylabel('Cost J')
axs[i].set_xlabel('Number of Iterations')
axs[i].legend()
plt.tight_layout()
plt.show()
# code to reduce X to 3 features (columns) using SVD:
# Perform Singular Value decomposition on X and reduce it to 3 columns
U, S, Vt = np.linalg.svd(X_normalized)
# Reduce X to 3 columns
X_reduced = np.dot(X_normalized, Vt[:3].T)
# print the first 5 rows of X_reduced
print("First 5 rows of X_reduced:")
# Normalize X_reduced
X_reduced = (X_reduced - np.mean(X_reduced, axis=0)) / np.std(X_reduced, axis=0)
print("the means and stds of X after being reduced and normalized:\n" ,X_reduced.mean(axis=0), X_reduced.std(axis=0))
# Print the shape of the reduced X to confirm it has only 3 features
print("Shape of X_reduced:", X_reduced.shape)
# Adding the intercept column to X_reduced
X_reduced_with_intercept = np.hstack((intercept_column, X_reduced))
# Example usage
# X_normalized_with_intercept and y_normalized represent the original dataset
# X_reduced_with_intercept and y_normalized represent the reduced dataset
# Performing gradient descent on the original dataset
gradient_descent(X_normalized_with_intercept, y_normalized)
# Performing gradient descent on the reduced dataset
gradient_descent(X_reduced_with_intercept, y_normalized)
What could be causing the reduced dataset to consistently have a longer runtime compared to the full dataset in my gradient descent implementation? Any insights or suggestions for troubleshooting would be greatly appreciated.
I've tried rewriting and reviewing my implementations, but it seems that for most learning rates, and for an increased amount of iterations, the running time of the larger feature set is lower than it's SVD subset.
It seems that the runtime of your code is dominated by book-keeping and looping operations rather than actual gradient descent. I've modified your code to avoid the outer for loop and allocate a numpy array and write to it instead of relying on slower lists:
import time
import numpy as np
import matplotlib.pyplot as plt
def h_theta(X1, theta1):
# Implementation of hypothesis function
return np.dot(X1, theta1)
def j_theta(X1, y1, theta1):
# Implementation of cost function
return np.sum((h_theta(X1, theta1) - y1) ** 2) / (2 * X1.size)
def grad(X1, y1, theta):
# Calculation of gradient
h = h_theta(X1, theta)
gradient = np.dot(X1.T, h - y1) / y1.shape[0]
return gradient
def gradient_descent(X1, y1):
learning_rates = [0.1, 0.01, 0.001]
num_iterations = 1000
num_learning_rates = len(learning_rates)
learning_rates = np.array(learning_rates).reshape(1, -1)
theta_initial = np.zeros((num_learning_rates, X1.shape[1])).T # Initialize theta with zeros
cost_iterations = np.zeros((num_learning_rates, num_iterations))
theta_values = np.zeros((num_learning_rates, X1.shape[1], 1))
theta = theta_initial.copy()
start = time.time()
#for idx, alpha in enumerate(learning_rates):
for i in range(num_iterations):
gradient = grad(X1, y1, theta)
theta = theta - learning_rates * gradient
cost = j_theta(X1, y1, theta)
cost_iterations[:, i] = cost
end = time.time()
print(f"Time taken: {end - start} seconds")
# fig, axs = plt.subplots(len(learning_rates), figsize=(8, 15))
# for i, alpha in enumerate(learning_rates):
# axs[i].plot(range(num_iterations), cost_iterations[i], label=f'alpha = {alpha}')
# axs[i].set_title(f'Learning Rate: {alpha}')
# axs[i].set_ylabel('Cost J')
# axs[i].set_xlabel('Number of Iterations')
# axs[i].legend()
# plt.tight_layout()
# plt.show()
X_normalized = np.random.randn(3072, 9)
y_normalized = np.random.randn(3072, 1).repeat(3, 1).reshape((3072, 3))
intercept_column = np.random.randn(3072, 1)
# code to reduce X to 3 features (columns) using SVD:
# Perform Singular Value decomposition on X and reduce it to 3 columns
U, S, Vt = np.linalg.svd(X_normalized)
# Reduce X to 3 columns
X_reduced = np.dot(X_normalized, Vt[:3].T)
# print the first 5 rows of X_reduced
print("First 5 rows of X_reduced:")
# Normalize X_reduced
X_reduced = (X_reduced - np.mean(X_reduced, axis=0)) / np.std(X_reduced, axis=0)
print("the means and stds of X after being reduced and normalized:\n" ,X_reduced.mean(axis=0), X_reduced.std(axis=0))
# Print the shape of the reduced X to confirm it has only 3 features
print("Shape of X_reduced:", X_reduced.shape)
# Adding the intercept column to X_reduced
X_reduced_with_intercept = np.hstack((intercept_column, X_reduced))
X_normalized_with_intercept = np.hstack((intercept_column, X_normalized))
# Example usage
# X_normalized_with_intercept and y_normalized represent the original dataset
# X_reduced_with_intercept and y_normalized represent the reduced dataset
# Performing gradient descent on the original dataset
gradient_descent(X_normalized_with_intercept, y_normalized)
# Performing gradient descent on the reduced dataset
gradient_descent(X_reduced_with_intercept, y_normalized)
Now, the code runs faster on the reduced dataset. I've filled in the blanks that you haven't specified so that I would be able to run your code. Hopefully I didn't butcher the gradient descent code too badly as I've relied on the dimensions making sense instead of working through the problem with a pen and paper.