Tags: r, mixed-models, statistics-bootstrap

R: using bootstrap prediction on mixed model


library(nlme)
library(bootstrap)
y = Loblolly$height
x = Loblolly
theta.fit = function(x, y){
  nlme(height ~ SSasymp(age, Asym, R0, lrc),
       data = x,
       fixed = Asym + R0 + lrc ~ 1,
       random = Asym ~ 1,
       start = c(Asym = 103, R0 = -8.5, lrc = -3.3))
}
theta.predict = function(fit, x){
  (fit$fitted)[,1]
}
sq.err <- function(y,yhat) { (y-yhat)^2}                 
results <- bootpred(x,y,20,theta.fit,theta.predict,
                    err.meas=sq.err)

I am using the bootpred function to obtain estimates of prediction error. However, when I run the last line, I get the following error:

 Error in model.frame.default(formula = ~height + age, data = c(" 4.51",  : 
  'data' must be a data.frame, not a matrix or an array 

I then tried x = data.frame(x) but that did not solve my problem.


Solution

  • The problem comes about because the example dataset is a groupedData object:

    library(nlme)
    library(bootstrap)
    y = Loblolly$height
    x = Loblolly
    
    class(x)
    [1] "nfnGroupedData" "nfGroupedData"  "groupedData"    "data.frame" 
    

    Inside the bootpred function, x is coerced back into a matrix. Converting back and forth gets messy, especially because the mixed model needs the grouping factor column (Seed).
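
    A quick way to see what that coercion does to a mixed-type data frame such as Loblolly (the as.matrix call below is my assumption about what happens internally, based on the error message quoting character values like " 4.51"):

    m <- as.matrix(Loblolly)
    class(m)            # "matrix" "array"  -- no longer a data.frame
    head(m[, "height"]) # heights are now formatted strings such as " 4.51"
    head(m[, "Seed"])   # Seed has been coerced to character; the factor is lost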

    What you can do is write theta.fit and theta.predict to take a data.frame instead:

    theta.fit = function(df){
      nlme(height ~ SSasymp(age, Asym, R0, lrc),
           data = df,
           fixed = Asym + R0 + lrc ~ 1,
           random = Asym ~ 1,
           start = c(Asym = 103, R0 = -8.5, lrc = -3.3))
    }
    theta.predict = function(fit, df){
      predict(fit, df)
    }
    
    sq.err <- function(y,yhat) { (y-yhat)^2}
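
    As a quick sanity check (the object names fit_all and yhat_all below are only illustrative), the refactored functions can be tried on the full data.frame before bootstrapping:

    fit_all  <- theta.fit(Loblolly)
    yhat_all <- theta.predict(fit_all, Loblolly)
    mean(sq.err(Loblolly$height, yhat_all))   # apparent (in-sample) mean squared error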
    

    Now alter the bootpred function to work with df. You can still supply y separately, or alternatively specify which column of the data.frame to use:

    bootpred_df = function(df, y, nboot, theta.fit, theta.predict, err.meas, ...)
    {
        call <- match.call()
        n <- length(y)
        saveii <- NULL
        # apparent error: fit on the full data and predict that same data
        fit0 <- theta.fit(df, ...)
        yhat0 <- theta.predict(fit0, df)
        app.err <- mean(err.meas(y, yhat0))
        err1 <- matrix(0, nrow = nboot, ncol = n)
        err2 <- rep(0, nboot)
        for (b in 1:nboot) {
            # resample rows of the data.frame rather than rows of a matrix
            ii <- sample(1:n, replace = TRUE)
            saveii <- cbind(saveii, ii)
            fit <- theta.fit(df[ii, ], ...)
            yhat1 <- theta.predict(fit, df[ii, ])
            yhat2 <- theta.predict(fit, df)
            err1[b, ] <- err.meas(y, yhat2)
            err2[b] <- mean(err.meas(y[ii], yhat1))
        }
        # optimism: average gap between the error on the original data and
        # the error on the bootstrap sample the model was fitted to
        optim <- mean(apply(err1, 1, mean, na.rm = TRUE) - err2)
        junk <- function(x, i) {
            sum(x == i)
        }
        # e0: leave-one-out bootstrap error, used for the .632 estimator
        e0 <- 0
        for (i in 1:n) {
            o <- apply(saveii, 2, junk, i)
            if (sum(o == 0) == 0)
                cat("increase nboot for computation of the .632 estimator",
                    fill = TRUE)
            e0 <- e0 + (1/n) * sum(err1[o == 0, i])/sum(o == 0)
        }
        err.632 <- 0.368 * app.err + 0.632 * e0
        # returns the apparent error, the optimism, and the .632 estimate
        return(list(app.err, optim, err.632, call = call))
    }
    

    We can run it now, but because of the nature of this data, some bootstrap resamples will give the grouping factor (Seed) an uneven distribution, which can make some of the parameters hard to estimate and the nlme fit fail. That problem is probably better addressed by refining the resampling code (see the sketch after the output below). In any case, if you are lucky it works like this:

    bootpred_df(Loblolly, Loblolly$height, 20, theta.fit, theta.predict, err.meas = sq.err)
        [[1]]
        [1] 0.4337236
        
        [[2]]
        [1] 0.1777644
        
        [[3]]
        [1] 0.6532417
        
        $call
        bootpred_df(df = Loblolly, y = Loblolly$height, nboot = 20, theta.fit = theta.fit, 
            theta.predict = theta.predict, err.meas = sq.err)
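
    Because the failures mentioned above come from nlme not converging when a resample leaves a Seed group sparse, one possible refinement (a sketch only; resample_until_fit is a hypothetical helper, not part of the bootstrap package) is to redraw the bootstrap indices until the model converges. Note that conditioning on convergence slightly changes the bootstrap distribution.

    # Hypothetical helper: keep redrawing bootstrap rows until nlme converges
    resample_until_fit <- function(df, theta.fit, max_tries = 25) {
      n <- nrow(df)
      for (k in seq_len(max_tries)) {
        ii  <- sample(1:n, replace = TRUE)
        fit <- tryCatch(theta.fit(df[ii, ]), error = function(e) NULL)
        if (!is.null(fit)) return(list(ii = ii, fit = fit))
      }
      stop("no converging bootstrap sample found in ", max_tries, " tries")
    }

    # Inside the for-loop of bootpred_df, the ii <- sample(...) and
    # fit <- theta.fit(df[ii, ], ...) lines could then be replaced with:
    #   res <- resample_until_fit(df, theta.fit)
    #   ii  <- res$ii
    #   fit <- res$fit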