Search code examples
r tidymodels

tidymodels Novel levels found in column


I am using tidymodels to create a random forest prediction. I have test data that contains a new factor level not present in the training data, which results in the error:

1: Novel levels found in column 'Siblings': '4'. The levels have been removed, and values have been coerced to 'NA'. 
2: There are new levels in a factor: NA 
> test_predict
Fehler: Objekt 'test_predict' nicht gefunden

I tried to include step_novel() and step_dummy() on the "Siblings" column, but this does not resolve the error. How should I deal with new factor levels that are not present in the training data?

library(tidyverse)
library(tidymodels)

# Training data: four factor columns; Siblings only has levels 0, 1 and 3.
data <- data.frame(
  Survived = factor(c(0, 1, 1, 1, 0, 0, 0, 0, 0, 1, 1, 1, 0, 0, 0, 0)),
  Siblings = factor(c(1, 1, 0, 1, 0, 0, 0, 3, 1, 1, 0, 1, 0, 0, 0, 3)),
  Class = factor(c(0, 1, 0, 1, 0, 1, 0, 0, 0, 1, 0, 1, 0, 1, 0, 0)),
  Embarked = factor(c(
    "s", "c", "m", "m", "s", "c", "s", "m",
    "m", "s", "s", "s", "s", "s", "s", "s"
  ))
)

# Test data: predictors only (no Survived column). The last Siblings value, 4,
# is a factor level that never occurs in the training data.
test <- data.frame(
  Siblings = factor(c(1, 1, 0, 1, 0, 0, 0, 3, 1, 1, 0, 1, 0, 0, 0, 4)),
  Class = factor(c(0, 1, 0, 1, 0, 1, 0, 0, 0, 1, 0, 1, 0, 1, 0, 0)),
  Embarked = factor(c(
    "s", "c", "m", "m", "s", "c", "s", "m",
    "m", "s", "s", "s", "s", "s", "s", "s"
  ))
)

# Model: random forest classifier fitted with the ranger engine. The tuning
# values go straight into the constructor; `importance` is an engine argument.
rf_model <-
  rand_forest(mtry = 3, trees = 1000, min_n = 15) %>%
  set_engine("ranger", importance = "impurity") %>%
  set_mode("classification")

# Recipe: model Survived from the three predictors, applying step_novel() and
# then step_dummy() to the Siblings column.
data_recipe <-
  recipe(Survived ~ Siblings + Class + Embarked, data = data) %>%
  step_novel(Siblings) %>%
  step_dummy(Siblings)
# Workflow: attach the recipe and the model specification one step at a time.
rf_workflow <- workflow()
rf_workflow <- add_recipe(rf_workflow, data_recipe)
rf_workflow <- add_model(rf_workflow, rf_model)

# Fit on the training data and print the trained workflow.
final_model <- fit(rf_workflow, data = data)
final_model

# Predict on the test data, whose Siblings column holds the unseen level "4".
test_predict <- predict(final_model, new_data = test)
test_predict

Solution

  • The documentation for step_novel() says:

    When fitting a model that can deal with new factor levels, consider using workflows::add_recipe() with allow_novel_levels = TRUE set in hardhat::default_recipe_blueprint(). This will allow your model to handle new levels at prediction time, instead of throwing warnings or errors.

    So pass such a blueprint to add_recipe() when you build the workflow:

    library(tidyverse)
    library(tidymodels)
    #> Registered S3 method overwritten by 'tune':
    #>   method                   from   
    #>   required_pkgs.model_spec parsnip
    
    # Training data: four factor columns; Siblings only has levels 0, 1 and 3.
    data <- data.frame(
      Survived = factor(c(0, 1, 1, 1, 0, 0, 0, 0, 0, 1, 1, 1, 0, 0, 0, 0)),
      Siblings = factor(c(1, 1, 0, 1, 0, 0, 0, 3, 1, 1, 0, 1, 0, 0, 0, 3)),
      Class = factor(c(0, 1, 0, 1, 0, 1, 0, 0, 0, 1, 0, 1, 0, 1, 0, 0)),
      Embarked = factor(c(
        "s", "c", "m", "m", "s", "c", "s", "m",
        "m", "s", "s", "s", "s", "s", "s", "s"
      ))
    )
    
    # Test data: predictors only (no Survived column). The last Siblings
    # value, 4, is a factor level that never occurs in the training data.
    test <- data.frame(
      Siblings = factor(c(1, 1, 0, 1, 0, 0, 0, 3, 1, 1, 0, 1, 0, 0, 0, 4)),
      Class = factor(c(0, 1, 0, 1, 0, 1, 0, 0, 0, 1, 0, 1, 0, 1, 0, 0)),
      Embarked = factor(c(
        "s", "c", "m", "m", "s", "c", "s", "m",
        "m", "s", "s", "s", "s", "s", "s", "s"
      ))
    )
    
    # Model: random forest classifier fitted with the ranger engine. The
    # tuning values go straight into the constructor; `importance` is an
    # engine argument.
    rf_model <-
      rand_forest(mtry = 3, trees = 1000, min_n = 15) %>%
      set_engine("ranger", importance = "impurity") %>%
      set_mode("classification")
    
    # Recipe: model Survived from the three predictors, applying step_novel()
    # and then step_dummy() to the Siblings column.
    data_recipe <-
      recipe(Survived ~ Siblings + Class + Embarked, data = data) %>%
      step_novel(Siblings) %>%
      step_dummy(Siblings)
    
    # Workflow: add the recipe with a blueprint that sets
    # allow_novel_levels = TRUE, so new factor levels are handled at
    # prediction time, then add the model specification.
    rf_workflow <- workflow()
    rf_workflow <- add_recipe(
      rf_workflow,
      data_recipe,
      blueprint = hardhat::default_recipe_blueprint(allow_novel_levels = TRUE)
    )
    rf_workflow <- add_model(rf_workflow, rf_model)
    
    # Fit on the training data and print the trained workflow.
    final_model <- fit(rf_workflow, data = data)
    final_model
    #> ══ Workflow [trained] ══════════════════════════════════════════════════════════
    #> Preprocessor: Recipe
    #> Model: rand_forest()
    #> 
    #> ── Preprocessor ────────────────────────────────────────────────────────────────
    #> 2 Recipe Steps
    #> 
    #> • step_novel()
    #> • step_dummy()
    #> 
    #> ── Model ───────────────────────────────────────────────────────────────────────
    #> Ranger result
    #> 
    #> Call:
    #>  ranger::ranger(x = maybe_data_frame(x), y = y, mtry = min_cols(~3,      x), num.trees = ~1000, min.node.size = min_rows(~15, x),      importance = ~"impurity", num.threads = 1, verbose = FALSE,      seed = sample.int(10^5, 1), probability = TRUE) 
    #> 
    #> Type:                             Probability estimation 
    #> Number of trees:                  1000 
    #> Sample size:                      16 
    #> Number of independent variables:  5 
    #> Mtry:                             3 
    #> Target node size:                 15 
    #> Variable importance mode:         impurity 
    #> Splitrule:                        gini 
    #> OOB prediction error (Brier s.):  0.254242
    
    # Predict on the test data, including the row with the novel Siblings level.
    test_predict <- predict(final_model, new_data = test)
    test_predict
    #> # A tibble: 16 x 1
    #>    .pred_class
    #>    <fct>      
    #>  1 0          
    #>  2 1          
    #>  3 0          
    #>  4 1          
    #>  5 0          
    #>  6 0          
    #>  7 0          
    #>  8 0          
    #>  9 0          
    #> 10 1          
    #> 11 0          
    #> 12 1          
    #> 13 0          
    #> 14 0          
    #> 15 0          
    #> 16 0
    

    Created on 2021-07-09 by the reprex package (v2.0.0)

    The workflows functions are very strict about factor levels and other aspects of the new data, ensuring that they match up with the training data.