Search code examples
rloopsrstudioglmmtmb

Force do not occur R Session Aborted in RStudio


I'd like to create a loop for several GLMM models, but I know that some factor is not possible to fit. I create some parameters to avoid errors like skip_to_next <- FALSE, tryCatch, and a minimum of points restriction (if(length(unique(NEW_DS_F_pred_sub$DATE))>=4)). Despite all of these steps I always have R Session Aborted and I don't find any way to just ignore the not-so-good-to-fit factors.

In my example:

library("glmmTMB")
library("dplyr")
library("ggeffects")
library("ggplot2")

NEW_DS_F_pred <- NULL
STAND <- c(rep("A",5),rep("B",3),rep("C",6),rep("D",4))
stands <- unique(STAND)
DATE <- c("2022-01-01","2022-02-12","2022-03-01","2022-04-05","2022-06-01",
"2022-01-01","2022-02-12","2022-03-01",
"2022-01-01","2022-02-12","2022-03-01","2022-04-05","2022-06-01","2022-06-20",
"2022-01-01","2022-02-12","2022-03-01","2022-04-05")
B2_MAX <- runif(n=length(DATE))
B3_MAX <- runif(n=length(DATE))
B4_MAX <- runif(n=length(DATE))
NEW_DS_F_pred <- cbind(STAND,DATE,B2_MAX,B3_MAX,B4_MAX) %>% as.data.frame()

for (i in 1:length(stands)){
        skip_to_next <- FALSE
        tryCatch(print(stands[i]), error = function(e) { skip_to_next <<- TRUE})

NEW_DS_F_pred_sub <- NEW_DS_F_pred%>%filter(STAND==stands[i])

if(length(unique(NEW_DS_F_pred_sub$DATE))>=4){

NEW_DS_F_pred_sub$DATE_TIME <- as.numeric(difftime(NEW_DS_F_pred_sub$DATE, as.Date("2022-06-30"), units = "days"))
NEW_DS_F_pred_sub$DATE_TIME <- as.numeric(NEW_DS_F_pred_sub$DATE_TIME)
NEW_DS_F_pred_sub$B2_MAX <- as.numeric(NEW_DS_F_pred_sub$B2_MAX)
NEW_DS_F_pred_sub$B3_MAX <- as.numeric(NEW_DS_F_pred_sub$B3_MAX)
NEW_DS_F_pred_sub$B4_MAX <- as.numeric(NEW_DS_F_pred_sub$B4_MAX)
NEW_DS_F_pred_sub<-as.data.frame(NEW_DS_F_pred_sub)

# Fit the model B2_MAX
glmm_fit_B2_MAX <- glmmTMB(B2_MAX ~ poly(DATE_TIME,3) + (1|DATE_TIME), data=NEW_DS_F_pred_sub, family=tweedie(link = "log"))
ggeffects::ggpredict(glmm_fit_B2_MAX, terms = "DATE_TIME [all]") %>% plot(add.data = TRUE) + 
  xlab('Time in days') +
  ylab('VI 1')

# Predict the values
glmm_fit_B2_MAX_new <- NULL
glmm_fit_B2_MAX_new$DATE_TIME <- seq(-180,1)
glmm_fit_B2_MAX_new$B2_MAX <- predict(
  glmm_fit_B2_MAX,
  newdata = glmm_fit_B2_MAX_new,
  type = c("response"))
glmm_fit_B2_MAX_new$STAND <- rep(stands[i], length(glmm_fit_B2_MAX_new$DATE_TIME)) 
glmm_fit_B2_MAX_new <- as.data.frame(glmm_fit_B2_MAX_new)
glmm_fit_B2_MAX_new <- glmm_fit_B2_MAX_new%>%dplyr::select(STAND,DATE_TIME,B2_MAX)

# Fit the model B3_MAX
glmm_fit_B3_MAX <- glmmTMB(B3_MAX ~ poly(DATE_TIME,3) + (1|DATE_TIME), data=NEW_DS_F_pred_sub, family=tweedie(link = "log"))

# Predict the values
glmm_fit_B3_MAX_new <- NULL
glmm_fit_B3_MAX_new$DATE_TIME <- seq(-180,1)
glmm_fit_B3_MAX_new$B3_MAX <- predict(
  glmm_fit_B3_MAX,
  newdata = glmm_fit_B3_MAX_new,
  type = c("response"))
glmm_fit_B3_MAX_new$STAND <- rep(stands[i], length(glmm_fit_B3_MAX_new$DATE_TIME)) 
glmm_fit_B3_MAX_new <- as.data.frame(glmm_fit_B3_MAX_new)

# Fit the model B4_MAX
glmm_fit_B4_MAX <- glmmTMB(B4_MAX ~ poly(DATE_TIME,3) + (1|DATE_TIME), data=NEW_DS_F_pred_sub, family=tweedie(link = "log"))

# Predict the values
glmm_fit_B4_MAX_new <- NULL
glmm_fit_B4_MAX_new$DATE_TIME <- seq(-180,1)
glmm_fit_B4_MAX_new$B4_MAX <- predict(
  glmm_fit_B4_MAX,
  newdata = glmm_fit_B4_MAX_new,
  type = c("response"))
glmm_fit_B4_MAX_new$STAND <- rep(stands[i], length(glmm_fit_B4_MAX_new$DATE_TIME)) 
glmm_fit_B4_MAX_new <- as.data.frame(glmm_fit_B4_MAX_new)

if(skip_to_next) { next }     
}
}
#
#

boomRstudio

Is there any way to force the loop to continuos without R Session Aborted?

Thanks in advance!


Solution

  • If you can install the most recent development version of TMB via

    remotes::install_github("kaskr/adcomp/TMB")
    

    (you will need development tools installed), that will apply a bug fix and stop R from crashing.


    I made a function with your code inside, of the following form:

    do <- function(seed=NULL) {
        if (!is.null(seed)) {   
              cat("seed ", seed, "\n")
              set.seed(seed)
        }
        ## ... all of your code
    }
    

    And then ran

    for (i in 1:200) { 
       do(i+100)
    }
    

    at i==11 it terminated my R session with

    terminate called after throwing an instance of 'std::length_error'; what(): cannot create std::vector larger than max_size() Process R aborted (core dumped) at Fri Nov 3 20:17:28 2023

    Now I can run do(111) and get it to crash immediately.

    Now I can/will step through the code and/or run debug() on the function to see what aspects of the data for this random number seed are making the problem explode.


    After stepping through (with debug()), I found where the crash happened (on the fourth step (stand D), at the first model B2). I used

    saveRDS(NEW_DS_F_pred_sub, file = "SO77422084_bad.rds")
    

    to save the "bad" data frame (I've tried dput(), but it doesn't seem to work ...)

    Now this minimal code will crash R:

    library(glmmTMB)
    NEW_DS_F_pred_sub <- readRDS("SO77422084_bad.rds")
    ## trim data set to make it even more minimal
    bad <- NEW_DS_F_pred_sub[c("DATE_TIME", "B2_MAX")]
    glmm_fit_B2_MAX <- glmmTMB(B2_MAX ~ poly(DATE_TIME,3) + 
                                       (1|DATE_TIME), data=bad,
                                   family=tweedie(link = "log"))
    

    will crash R.

    A minimal example with a little bit more code, but that runs from scratch (i.e. not requiring us to run a bunch of code and save the results to an external file):

    set.seed(111)
    dd <- data.frame( STAND = rep(LETTERS[1:4], c(5,3,6,4)),
       DATE = c("2022-01-01","2022-02-12","2022-03-01","2022-04-05",
        "2022-06-01","2022-01-01","2022-02-12","2022-03-01",
        "2022-01-01","2022-02-12","2022-03-01","2022-04-05",
        "2022-06-01","2022-06-20","2022-01-01","2022-02-12",
                                  "2022-03-01","2022-04-05"))
        dd$B2_MAX <- runif(n=nrow(dd))
        dd_sub <- subset(dd, STAND == "D")
        dd_sub$DATE_TIME <-as.numeric(difftime(dd_sub$DATE, as.Date("2022-06-30"), units = "days"))
    glmmTMB(B2_MAX ~ poly(DATE_TIME,3) + 
                    (1|DATE_TIME), data=dd_sub,
                family=tweedie(link = "log"))
    

    Weirdly, running dput() on bad and redefining bad to that value:

    bad <- structure(list(DATE_TIME = c(-179.791666666667, -137.791666666667, 
                     -120.791666666667, -85.8333333333333),
             B2_MAX = c(0.156202515820041, 0.446427763439715, 
        0.171443687053397, 0.9665342932567)),
                         row.names = c(NA, -4L), class = "data.frame")
    

    will not crash R — there must some very subtle difference between the data set as stored in binary format and what happens when the ASCII representation is created ...

    I then did debug(glmmTMB) and stepped my way through glmmTMB. The crash happens while R is trying to run the optimization, at this point in the code.

    Debugging further, into nlminb, I find that the crash occurs here:

    .Call(C_port_nlminb, obj, grad, hess, rho, low, upp, d = rep_len(as.double(scale), 
        length(par)), iv, v)
    

    For what it's worth this also crashes if I switch optimizers by using

    control=glmmTMBControl(optimizer=optim, optArgs=list(method="BFGS"))
    

    so it must not be the optimizer's fault, but rather that some set of parameters encountered during optimization that triggers the crash.

    The next thing I'll do (not tonight!) is to figure out an appropriate way of tracing the parameters that are being tried by the optimizer so we can figure out the exact parameters that are causing the crash (and, hopefully, fix what looks like a bug in glmmTMB)

    See here for much more fussing over this bug.