Search code examples
crlinear-regressionmcmc

Metropolis Hastings for linear regression model


I am trying to implement the Metropolis-Hastings algorithm for a simple linear regression in C (without use of other libraries (boost, Eigen etc.) and without two-dimensional arrays)*. For better testing of the code/evaluation of the trace plots, I have rewritten the code for R (see below) by keeping as much of the C-code as possible.

Unfortunately, the chains don't converge. I am wondering if

  1. there is a mistake in the implementation itself?
  2. "just" a bad choice of proposal distributions?

Assuming the latter, I am thinking about how to find good parameters of proposal distributions (currently I have picked arbitrary values) so that the algorithm works. Even with three parameters as in this case, it is quite hard to find suitable parameters. How does one normally handle this problem if say Gibbs sampling is not an alternative?

*I want to use this code for Cuda

#### posterior distribution
logPostDensity <- function(x, y, a, b, s2, N)
{
sumSqError = 0.0
for(i in 1:N)
{
  sumSqError = sumSqError + (y[i] - (a + b*x[i]))^2
}
return(((-(N/2)+1) * log(s2)) + ((-0.5/s2) * sumSqError))

}

# x = x values
# y = actual datapoints
# N = sample size
# m = length of chain
# sigmaProp = uniform proposal for sigma squared
# paramAProp = uniform proposal for intercept
# paramBProp = uniform proposal for slope

mcmcSampling <- function(x,y,N,m,sigmaProp,paramAProp,paramBProp)
{

  paramsA = vector("numeric",length=m) # intercept
  paramsB = vector("numeric",length=m) # slope
  s2 = vector("numeric",length=m) # sigma squared

  paramsA[1] = 0
  paramsB[1] = 0
  s2[1] = 1

  for(i in 2:m)
  {

    paramsA[i] = paramsA[i-1] + runif(1,-paramAProp,paramAProp)

    if((logPostDensity(x,y,paramsA[i],paramsB[i],s2[i-1],N)
        - logPostDensity(x,y,paramsA[i-1],paramsB[i-1],s2[i-1],N))
       < log(runif(1)))
    {
      paramsA[i] = paramsA[i-1]
    }

    paramsB[i] = paramsB[i-1] + runif(1,-paramBProp,paramBProp)

    if((logPostDensity(x,y,paramsA[i],paramsB[i],s2[i-1],N)
        - logPostDensity(x,y,paramsA[i-1],paramsB[i-1],s2[i-1],N))
       < log(runif(1)))
    {
      paramsB[i] = paramsB[i-1]
    }

    s2[i] = s2[i-1] + runif(1,-sigmaProp,sigmaProp)

    if((s2[i] < 0) || (logPostDensity(x,y,paramsA[i],paramsB[i],s2[i],N)
                       - logPostDensity(x,y,paramsA[i],paramsB[i],s2[i-1],N))
       < log(runif(1)))
    {
      s2[i] = s2[i-1]
    }


  }

  res = data.frame(paramsA,paramsB,s2)
  return(res)
}


#########################################

set.seed(321)
x <- runif(100)
y <- 2 + 5*x + rnorm(100)

summary(lm(y~x))


df <- mcmcSampling(x,y,10,5000,0.05,0.05,0.05)


par(mfrow=c(3,1))
plot(df$paramsA[3000:5000],type="l",main="intercept")
plot(df$paramsB[3000:5000],type="l",main="slope")
plot(df$s2[3000:5000],type="l",main="sigma")

Solution

  • There was one mistake in the intercept section (paramsA). Everything else was fine. I've implemented what Alexey suggested in his comments. Here's the solution:

    pow <- function(x,y)
    {
      return(x^y)
    }
    
    
    #### posterior distribution
    posteriorDistribution <- function(x, y, a, b,s2,N)
    {
    sumSqError <- 0.0
    for(i in 1:N)
    {
      sumSqError <- sumSqError + pow(y[i] - (a + b*x[i]),2)
    }
    return((-((N/2)+1) * log(s2)) + ((-0.5/s2) * sumSqError))
    
    }
    
    # x <- x values
    # y <- actual datapoints
    # N <- sample size
    # m <- length of chain
    # sigmaProposalWidth <- width of uniform proposal dist for sigma squared
    # paramAProposalWidth <- width of uniform proposal dist for intercept
    # paramBProposalWidth <- width of uniform proposal dist for slope
    
    mcmcSampling <- function(x,y,N,m,sigmaProposalWidth,paramAProposalWidth,paramBProposalWidth)
    {
    
      desiredAcc <- 0.44
    
      paramsA <- vector("numeric",length=m) # intercept
      paramsB <- vector("numeric",length=m) # slope
      s2 <- vector("numeric",length=m) # sigma squared
    
      paramsA[1] <- 0
      paramsB[1] <- 0
      s2[1] <- 1
    
      accATot <- 0
      accBTot <- 0
      accS2Tot <- 0
    
      for(i in 2:m)
      {
        paramsA[i] <- paramsA[i-1] + runif(1,-paramAProposalWidth,paramAProposalWidth)
        accA <- 1
        if((posteriorDistribution(x,y,paramsA[i],paramsB[i-1],s2[i-1],N) - 
            posteriorDistribution(x,y,paramsA[i-1],paramsB[i-1],s2[i-1],N)) < log(runif(1)))
        {
          paramsA[i] <- paramsA[i-1]
          accA <- 0
        }
    
    
        accATot <- accATot + accA
    
        paramsB[i] <- paramsB[i-1] + runif(1,-paramBProposalWidth,paramBProposalWidth)
        accB <- 1
        if((posteriorDistribution(x,y,paramsA[i],paramsB[i],s2[i-1],N) - 
            posteriorDistribution(x,y,paramsA[i-1],paramsB[i-1],s2[i-1],N)) < log(runif(1)))
        {
          paramsB[i] <- paramsB[i-1]
          accB <- 0
        }
    
        accBTot <- accBTot + accB
    
        s2[i] <- s2[i-1] + runif(1,-sigmaProposalWidth,sigmaProposalWidth)
        accS2 <- 1
    
        if((s2[i] < 0) || (posteriorDistribution(x,y,paramsA[i],paramsB[i],s2[i],N) - 
                           posteriorDistribution(x,y,paramsA[i],paramsB[i],s2[i-1],N)) < log(runif(1)))
        {
          s2[i] <- s2[i-1]
          accS2 <- 0
        }
    
        accS2Tot <- accS2Tot + accS2
    
        if(i%%100==0)
        {
    
          paramAProposalWidth <- paramAProposalWidth * ((accATot/100)/desiredAcc)
          paramBProposalWidth <- paramBProposalWidth * ((accBTot/100)/desiredAcc)
          sigmaProposalWidth <- sigmaProposalWidth * ((accS2Tot/100)/desiredAcc)
    
          accATot <-  0
          accBTot <-  0 
          accS2Tot <-  0
    
        }
    
    
      }
        res <- data.frame(paramsA,paramsB,s2)
        return(res)
    
    }