Tags: r, machine-learning, linear-regression, gradient-descent

Why is my gradient descent for regression in R failing?


I have adapted the following gradient descent algorithm to regress the y-variable stored in data[,4] on the x-variable stored in data[,1]. However, the gradient descent seems to diverge. I would appreciate some help identifying where I am going wrong.

# define the sum of squared residuals
ssquares <- function(x) 
  {
    t = 0
    for(i in 1:200)
      {
        t <- t + (data[i,4] - x[1] - x[2]*data[i,1])^2 
      }
    t/200
  }

# define the derivatives
derivative <- function(x) 
  {
    t1 = 0
    for(i in 1:200)
      {
        t1 <- t1 - 2*(data[i,4] - x[1] - x[2]*data[i,1]) 
      }
    t2 = 0
    for(i in 1:200)
      {
      t2 <- t2 - 2*data[i,1]*(data[i,4] - x[1] - x[2]*data[i,1]) 
      }
   c(t1/200,t2/200)
  }

# definition of the gradient descent method in 2D
gradient_descent <- function(func, derv, start, step=0.05, tol=1e-8) {
  pt1 <- start
  grdnt <- derv(pt1)
  pt2 <- c(pt1[1] - step*grdnt[1], pt1[2] - step*grdnt[2])
  while (abs(func(pt1)-func(pt2)) > tol) {
    pt1 <- pt2
    grdnt <- derv(pt1)
    pt2 <- c(pt1[1] - step*grdnt[1], pt1[2] - step*grdnt[2])
    print(func(pt2)) # print progress
  }
  pt2 # return the last point
}

# locate the minimum of the function using the Gradient Descent method
result <- gradient_descent(
  ssquares, # the function to optimize
  derivative, # the gradient of the function
  c(1,1), # start point of the search 
  0.05, # step size (alpha)
  1e-8) # relative tolerance for one step

# display a summary of the results
print(result) # coordinates of the function minimum (intercept and slope)
print(ssquares(result)) # value of the objective at the minimum

Solution

  • You can vectorize your objective and gradient functions for a faster implementation. As you can see below, gradient descent then actually converges on randomly generated data, and the coefficients are pretty close to the ones obtained with lm() in R:

    ssquares <- function(x) {
      n <- nrow(data) # 200
      sum((data[,4] - cbind(1, data[,1]) %*% x)^2) / n
    }
    
    # define the derivatives (gradient of the mean squared error)
    derivative <- function(x) {
      n <- nrow(data) # 200
      res <- data[,4] - cbind(1, data[,1]) %*% x # residuals
      c(sum(-2 * res), sum(-2 * data[,1] * res)) / n
    }
    
    set.seed(1)
    data <- matrix(rnorm(800), nrow=200) # 200 x 4 matrix of random data
    
    # locate the minimum of the function using the Gradient Descent method
    result <- gradient_descent(
      ssquares, # the function to optimize
      derivative, # the gradient of the function
      c(1,1), # start point of the search 
      0.05, # step size (alpha)
      1e-8) # relative tolerance for one step
    
    # [1] 2.511904
    # [1] 2.263448
    # [1] 2.061456
    # [1] 1.89721
    # [1] 1.763634
    # [1] 1.654984
    # [1] 1.566592
    # [1] 1.494668
    # ...
    
    # display a summary of the results
    print(result) # coefficients obtained with gradient descent
    #[1] -0.10248356  0.08068382
    
    lm(data[,4]~data[,1])$coef # coefficients from R lm()
    # (Intercept)   data[, 1] 
    # -0.10252181  0.08045722 
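    
    # as an extra sanity check (a sketch, assuming the vectorized ssquares above
    # is in scope), base R's optim() minimizes the same objective and should land
    # on essentially the same coefficients
    optim(c(1, 1), ssquares)$par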
    
    # now use the Advertising dataset; here the predictor spans a much larger
    # range, so the original step size of 0.05 overshoots and diverges; with a
    # much smaller learning rate it takes quite some time to converge, but the
    # values GD converges to are pretty accurate, as you can see below.
    data <- read.csv('Advertising.csv')[,-1] # advertising data, dropping the first row-name column
    
    # locate the minimum of the function using the Gradient Descent method
    result <- gradient_descent(
      ssquares, # the function to optimize
      derivative, # the gradient of the function
      c(1,1), # start point of the search 
      0.00001, # step size (alpha), decreasing the learning rate
      1e-8) # relative tolerance for one step
    
    # ...
    # [1] 10.51364
    # [1] 10.51364
    # [1] 10.51364
    
    print(result) # coefficients obtained with gradient descent
    # [1] 6.97016852 0.04785365
    
    lm(data[,4]~data[,1])$coef # coefficients from R lm()
    # (Intercept)   data[, 1] 
    #  7.03259355  0.04753664 
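
  • As a further option (a minimal sketch, assuming the vectorized ssquares/derivative above and the Advertising data already loaded): instead of shrinking the learning rate, you can standardize the predictor, which puts both gradient components on a comparable scale and lets the original step size of 0.05 converge here as well. The back-transformation at the end recovers the coefficients on the original scale of x.

    # standardize the predictor (z-score) so the problem is well conditioned
    x_raw <- data[,1]
    data[,1] <- (x_raw - mean(x_raw)) / sd(x_raw)
    
    result_scaled <- gradient_descent(ssquares, derivative, c(1,1), 0.05, 1e-8)
    
    # map the fitted coefficients back to the original scale of x
    slope <- result_scaled[2] / sd(x_raw)
    intercept <- result_scaled[1] - slope * mean(x_raw)
    print(c(intercept, slope)) # should be close to the lm() coefficients above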