R caret: train() failed for repeatedcv with factor predictors

The following function shall be used with Caret's train() function. Without any factor variables or without cross-validation it works fine. The problems appear when using factors as predictors and repeatedcv, because in the folds not all the factors are present but still appear within the factor levels:

Consider the following adapted cforest model (from the package partykit):

cforest_partykit <- list(label = "Conditional Inference Random Forest with partykit",
          library = c("partykit", "party"),
          loop = NULL,
          type = c("Classification", "Regression"),
          parameters = data.frame(parameter = 'mtry',
                                  class = 'numeric',
                                  label = "#Randomly Selected Predictors"),
          grid = function(x, y, len = NULL, search = "grid"){
            if(search == "grid") {
              out <- data.frame(mtry = caret::var_seq(p = ncol(x), 
                                                      classification = is.factor(y), 
                                                      len = len))
            } else {
              out <- data.frame(mtry = unique(sample(1:ncol(x), replace = TRUE, size = len)))
            }
            out
          },
          fit = function(x, y, wts, param, lev, last, classProbs, ...) {
            
             # make consistent factor levels
                if(any(sapply(x, is.factor))){                      
                  fac_col_names <- names(grep("factor", sapply(x, class), value=TRUE))
                  # assign present levels to each subset
                  for (i in 1:length(fac_col_names)) {                        
                    x[, which(names(x) == fac_col_names[i])] <- factor(x[, which(names(x) == fac_col_names[i])], 
                                                                       levels = as.character(unique(x[, which(names(x) == fac_col_names[i])])))                       
                  }              
                }
                 

            dat <- if(is.data.frame(x)) x else as.data.frame(x, stringsAsFactors = TRUE)
            dat$.outcome <- y
            theDots <- list(...)
            
            if(any(names(theDots) == "mtry")) # # change controls to mtry?
            {
              theDots$mtry <- as.integer(param$mtry) # remove gtcrl 
              theDots$mtry
              theDots$mtry <- NULL
              
            } else mtry <- min(param$mtry, ncol(x))
            
            ## pass in any model weights
            if(!is.null(wts)) theDots$weights <- wts
            
            modelArgs <- c(list(formula = as.formula(.outcome ~ .),
                                data = dat,
                                mtry = mtry), # change controls to mtry?
                           theDots)
            
            out <- do.call(partykit::cforest, modelArgs)
            out
          },
          predict = function(modelFit, newdata = NULL, submodels = NULL) {
            if(!is.null(newdata) && !is.data.frame(newdata)) newdata <- as.data.frame(newdata, stringsAsFactors = TRUE)

            # make consistent factor levels
                if(any(sapply(newdata, is.factor))){                      
                  fac_col_names <- names(grep("factor", sapply(newdata, class), value=TRUE))
                  # assign present levels to each subset
                  for (i in 1:length(fac_col_names)) {                       
                    newdata[, which(names(newdata) == fac_col_names[i])] <- factor(newdata[, which(names(newdata) == fac_col_names[i])], 
                                                                       levels = as.character(unique(newdata[, which(names(newdata) == fac_col_names[i])])))                      
                  }                     
                }
                

            ## party builds the levels into the model object, so I'm
            ## going to assume that all the levels will be passed to
            ## the output
            out <- partykit:::predict.cforest(modelFit, newdata = newdata, OOB = TRUE) # predict_party, id?
            if(is.matrix(out)) out <- out[,1]
            if(!is.null(modelFit$'(response)')) out <- as.character(out) #  if(!is.null(modelFit@responses@levels$.outcome)) out <- as.character(out)
            
            out
          },
          prob = function(modelFit, newdata = NULL, submodels = NULL) { # submodels ?
            if(!is.null(newdata) && !is.data.frame(newdata)) newdata <- as.data.frame(newdata, stringsAsFactors = TRUE)
            obsLevels <- levels(modelFit$'(response)')
            rawProbs <- partykit::predict.cforest(modelFit, newdata = newdata, OOB = TRUE) # predict(, type="prob) ? id?
            probMatrix <- matrix(unlist(rawProbs), ncol = length(obsLevels), byrow = TRUE)
            out <- data.frame(probMatrix)
            colnames(out) <- obsLevels
            rownames(out) <- NULL
            out
          },
          predictors = function(x, ...) {
            vi <- partykit::varimp(x, ...)
            names(vi)[vi != 0]
          },
          varImp = function(object, ...) {
            variableImp <- partykit::varimp(object, ...)
            out <- data.frame(Overall = variableImp)
            out
          },
          tags = c("Random Forest", "Ensemble Model", "Bagging", "Implicit Feature Selection", "Accepts Case Weights"),
          levels = function(x) levels(x@data@get("response")[,1]),
          sort = function(x) x[order(x[,1]),],
          oob = function(x) {
            obs <- x@data@get("response")[,1]
            pred <- partykit:::predict.cforest(x, OOB = TRUE, newdata = NULL)
            postResample(pred, obs)
          })

When applying it within train and repeatedcv using a data frame with a factor predictor variable, an error occurs:

library(caret)
library(party)
library(partykit)

dat <- as.data.frame(ChickWeight)[1:20,]
dat$class <- as.factor(rep(letters[seq( from = 1, to = 20)], each=1))

# specifiy folds with CreateMultiFolds
set.seed(43, kind = "Mersenne-Twister", normal.kind = "Inversion")
folds_train <- caret::createMultiFolds(y = dat$weight,
                                   k = 3,   
                                   times = 2)

# specifiy trainControl for tuning mtry and with specified folds
finalcontrol <- caret::trainControl(search = "grid", method = "repeatedcv", number = 3, repeats = 2, 
                                    index = folds_train, 
                                    savePred = T)

preds <- dat[,2:5]
response <- dat[,1]

# tune hyperparameter mtry and build final model
tunegrid <- expand.grid(mtry=c(1,2,3,4)) 
#set.seed(42, kind = "Mersenne-Twister", normal.kind = "Inversion")
model <- caret::train(x = preds, # predictors
                      y = response, # response
                      method = cforest_partykit,
                      metric = "RMSE", 
                      tuneGrid = tunegrid, 
                      trControl = finalcontrol,
                      ntree = 150)

warnings()

1: predictions failed for Fold1.Rep1: mtry=1 Error in model.frame.default(object$predictf, data = newdata, na.action = na.pass, : factor class has new levels a, c, g, k, m, p, s, t

The aim is to identify the levels of each fold.rep and assign only those, which are present in the respective fold:

for (i in 1:length(folds_train)) {

  preds_temp <- preds[folds_train[[i]],]
  # check levels 
  levels(preds_temp$class)
  # which are actually present
  unique(preds_temp$class)
  # assign present levels to each subset
  preds_temp$class <- factor(preds_temp$class, levels = as.character(unique(preds_temp$class)))

}

I tried to include the assignment of the right factor levels within the cforest_partykit function (# make consistent factor levels), but it seems to have no effect.

How could I implement this in the caret train() or trainControl() or createDataPartition() function?

Solution

To make sure cforest_partykit treats categorical variables appropriately, it is best to create the design matrix explicitly through the model.matrix command.

For example

# Create a formula for the model
model_formula <- as.formula("y_column ~ . -1")

# Then create the design matrix
model_train.design.matrix <- model.matrix(model_formula, data = dat)

# Add in the y-variable
model_train.design.data <- cbind(y_column = data$y_column, model_train.design.matrix)