Search code examples
rtidymodelsr-recipes

passing a list of variables to recipe in tidymodels causes model error


I have a simple recipe to train a model. My categorical variables change over time, and sometimes I want a numeric variable to be treated as categorical (e.g. postal code), so I define a vector containing their names before the recipe. (The vector shown here is shortened for the sake of the example; the real one is much longer.)

The recipe itself works fine, but when I then train my model (3 folds), an error is raised.

 xgboost_tuned$.notes
[[1]]
# A tibble: 1 x 1
  .notes                                                     
  <chr>                                                      
1 preprocessor 1/1: Error: object 'my_categorical' not found 

[[2]]
# A tibble: 1 x 1
  .notes                                                     
  <chr>                                                      
1 preprocessor 1/1: Error: object 'my_categorical' not found 

[[3]]
# A tibble: 1 x 1
  .notes                                                     
  <chr>                                                      
1 preprocessor 1/1: Error: object 'my_categorical' not found 

Is there a proper way of passing a list of variables to a recipe without crashing the model?

REPREX

    library(recipes)
    library(magrittr)
    library(tidyverse)
    library(xgboost)
    library(tidymodels)
    
    # Reprex: add three random integer columns to mtcars, convert them to
    # factors inside a recipe, and tune an xgboost model with 3-fold CV.
    mtcars1 <- mtcars
    
    
    # Three columns of 32 random integer draws each; they stand in for numeric
    # codes (such as postal codes) that should be treated as categories.
    mtcars1 %<>% dplyr::mutate(new1 = sample.int(200, 32, replace = TRUE),
                              new2 = sample.int(100, 32, replace = TRUE),
                              new3 = sample.int(50, 32, replace = TRUE))
    
    # Names of the columns to convert, defined at the top level of the script
    # and referenced by name inside the recipe steps below.
    my_categorical <- c("new1", "new2", "new3")
    
    # Train/test split stratified on the outcome `drat`.
    mtcars_split <- initial_split(mtcars1, strata = drat)
    train <- training(mtcars_split)
    test  <- testing(mtcars_split)
    
    # Preprocessing: cast the selected columns to character, then to factor.
    # NOTE(review): the recipe is already prep()ed here and is then handed to
    # add_recipe() below -- presumably the workflow should receive the
    # unprepped recipe; confirm.
    recipe <-
      recipes::recipe(drat ~ ., data = train) %>%
      recipes::step_mutate_at(all_of(my_categorical), fn = ~as.character(.)) %>%
      recipes::step_string2factor(all_of(my_categorical)) %>% 
      prep()
    
    
    # 3-fold cross-validation on the training set, stratified on `drat`.
    cv_folds <-
      vfold_cv(train, 
               v = 3, 
               strata = drat)
    
    
    # Boosted-tree specification with five hyperparameters marked for tuning.
    # NOTE(review): the mode is set to "classification" twice (the `mode`
    # argument and set_mode()), yet the formula's outcome is `drat` --
    # presumably a numeric outcome needs "regression"; confirm.
    xgboost_model <-
      parsnip::boost_tree(
        mode = "classification",
        trees = 100,
        min_n = tune(),
        tree_depth = tune(),
        learn_rate = tune(),
        loss_reduction = tune(),
        mtry = tune()
      ) %>%
      set_engine("xgboost") %>% 
      set_mode("classification")
    
    
    # Bundle the preprocessing recipe and the model specification.
    xgboost_workflow <-
      workflows::workflow() %>%
      add_recipe(recipe) %>% 
      add_model(xgboost_model) 
    
    
    # Candidate grid of size 100; finalize() is given the training predictors
    # (everything except `drat`), presumably to resolve the data-dependent
    # range of `mtry` -- TODO confirm.
    xgboost_grid <-
      parameters(xgboost_model) %>%
      finalize(select(training(mtcars_split), -drat)) %>%
      grid_max_entropy(size = 100)
    
    
    # NOTE(review): gain_capture and roc_auc look like class-probability
    # metrics; presumably they must change if the model becomes a regression.
    model_metrics <- yardstick::metric_set(gain_capture,roc_auc)
    
    
    # Tune the workflow over the folds and grid, keeping the predictions and
    # the workflow. The "object 'my_categorical' not found" notes are reported
    # by the preprocessor stage of this call.
    xgboost_tuned <-
      tune::tune_grid(
        object = xgboost_workflow,
        resamples = cv_folds,
        grid = xgboost_grid,
        metrics = model_metrics,
        control = tune::control_grid(save_pred = TRUE, save_workflow = TRUE)
      )

 xgboost_tuned$.notes
[[1]]
# A tibble: 1 x 1
  .notes                                                        
  <chr>                                                         
1 preprocessor 1/1: Error: object 'my_categorical' not found

[[2]]
# A tibble: 1 x 1
  .notes                                                        
  <chr>                                                         
1 preprocessor 1/1: Error: object 'my_categorical' not found

[[3]]
# A tibble: 1 x 1
  .notes                                                        
  <chr>                                                         
1 preprocessor 1/1: Error: object 'my_categorical' not found


sessioninfo::session_info()
- Session info -------------------------------------------------------------------------------
 setting  value                       
 version  R version 4.0.5 (2021-03-31)
 os       Windows 10 x64              
 system   x86_64, mingw32             
 ui       RStudio                     
 language (EN)                        
 collate  Spanish_Spain.1252          
 ctype    Spanish_Spain.1252          
 tz       Europe/Paris                
 date     2021-06-25                  

- Packages -----------------------------------------------------------------------------------
 package      * version    date       lib source                               
 askpass        1.1        2019-01-13 [1] CRAN (R 4.0.5)                       
 assertthat     0.2.1      2019-03-21 [1] CRAN (R 4.0.5)                       
 backports      1.2.1      2020-12-09 [1] CRAN (R 4.0.3)                       
 base64enc      0.1-3      2015-07-28 [1] CRAN (R 4.0.3)                       
 BBmisc         1.11       2017-03-10 [1] CRAN (R 4.0.5)                       
 broom        * 0.7.6      2021-04-05 [1] CRAN (R 4.0.5)                       
 butcher        0.1.4      2021-03-19 [1] CRAN (R 4.0.5)                       
 cachem         1.0.4      2021-02-13 [1] CRAN (R 4.0.5)                       
 cellranger     1.1.0      2016-07-27 [1] CRAN (R 4.0.5)                       
 checkmate      2.0.0      2020-02-06 [1] CRAN (R 4.0.5)                       
 class          7.3-18     2021-01-24 [2] CRAN (R 4.0.5)                       
 cli            2.5.0      2021-04-26 [1] CRAN (R 4.0.5)                       
 cluster        2.1.1      2021-02-14 [2] CRAN (R 4.0.5)                       
 codetools      0.2-18     2020-11-04 [2] CRAN (R 4.0.5)                       
 colorspace     2.0-1      2021-05-04 [1] CRAN (R 4.0.5)                       
 crayon         1.4.1      2021-02-08 [1] CRAN (R 4.0.5)                       
 credentials    1.3.0      2020-07-21 [1] CRAN (R 4.0.5)                       
 curl           4.3.1      2021-04-30 [1] CRAN (R 4.0.5)                       
 data.table     1.14.0     2021-02-21 [1] CRAN (R 4.0.5)                       
 DBI            1.1.1      2021-01-15 [1] CRAN (R 4.0.5)                       
 dbplyr         2.1.1      2021-04-06 [1] CRAN (R 4.0.5)                       
 dials        * 0.0.9      2020-09-16 [1] CRAN (R 4.0.5)                       
 DiceDesign     1.9        2021-02-13 [1] CRAN (R 4.0.5)                       
 digest         0.6.27     2020-10-24 [1] CRAN (R 4.0.5)                       
 doParallel     1.0.16     2020-10-16 [1] CRAN (R 4.0.5)                       
 dplyr        * 1.0.6      2021-05-05 [1] CRAN (R 4.0.3)                       
 ellipsis       0.3.2      2021-04-29 [1] CRAN (R 4.0.5)                       
 fansi          0.4.2      2021-01-15 [1] CRAN (R 4.0.5)                       
 fastmap        1.1.0      2021-01-25 [1] CRAN (R 4.0.5)                       
 fastmatch      1.1-0      2017-01-28 [1] CRAN (R 4.0.3)                       
 FNN            1.1.3      2019-02-15 [1] CRAN (R 4.0.5)                       
 forcats      * 0.5.1      2021-01-27 [1] CRAN (R 4.0.5)                       
 foreach        1.5.1      2020-10-15 [1] CRAN (R 4.0.5)                       
 foreign        0.8-81     2020-12-22 [2] CRAN (R 4.0.5)                       
 Formula        1.2-4      2020-10-16 [1] CRAN (R 4.0.3)                       
 fs             1.5.0      2020-07-31 [1] CRAN (R 4.0.5)                       
 furrr          0.2.2      2021-01-29 [1] CRAN (R 4.0.5)                       
 future         1.21.0     2020-12-10 [1] CRAN (R 4.0.5)                       
 generics       0.1.0      2020-10-31 [1] CRAN (R 4.0.5)                       
 gert           1.3.0      2021-03-29 [1] CRAN (R 4.0.5)                       
 ggplot2      * 3.3.3      2020-12-30 [1] CRAN (R 4.0.5)                       
 globals        0.14.0     2020-11-22 [1] CRAN (R 4.0.3)                       
 glue           1.4.2      2020-08-27 [1] CRAN (R 4.0.5)                       
 gower          0.2.2      2020-06-23 [1] CRAN (R 4.0.3)                       
 GPfit          1.0-8      2019-02-08 [1] CRAN (R 4.0.5)                       
 gridExtra      2.3        2017-09-09 [1] CRAN (R 4.0.5)                       
 gtable         0.3.0      2019-03-25 [1] CRAN (R 4.0.5)                       
 hardhat        0.1.5      2020-11-09 [1] CRAN (R 4.0.5)                       
 haven          2.4.1      2021-04-23 [1] CRAN (R 4.0.5)                       
 Hmisc          4.5-0      2021-02-28 [1] CRAN (R 4.0.5)                       
 hms            1.1.0      2021-05-17 [1] CRAN (R 4.0.5)                       
 htmlTable      2.1.0      2020-09-16 [1] CRAN (R 4.0.5)                       
 htmltools      0.5.1.1    2021-01-22 [1] CRAN (R 4.0.5)                       
 htmlwidgets    1.5.3      2020-12-10 [1] CRAN (R 4.0.5)                       
 httr           1.4.2      2020-07-20 [1] CRAN (R 4.0.5)                       
 infer        * 0.5.4      2021-01-13 [1] CRAN (R 4.0.5)                       
 ipred          0.9-11     2021-03-12 [1] CRAN (R 4.0.5)                       
 iterators      1.0.13     2020-10-15 [1] CRAN (R 4.0.5)                       
 jpeg           0.1-8.1    2019-10-24 [1] CRAN (R 4.0.3)                       
 jsonlite       1.7.2      2020-12-09 [1] CRAN (R 4.0.5)                       
 knitr          1.33       2021-04-24 [1] CRAN (R 4.0.5)                       
 lattice        0.20-41    2020-04-02 [2] CRAN (R 4.0.5)                       
 latticeExtra   0.6-29     2019-12-19 [1] CRAN (R 4.0.5)                       
 lava           1.6.9      2021-03-11 [1] CRAN (R 4.0.5)                       
 lhs            1.1.1      2020-10-05 [1] CRAN (R 4.0.5)                       
 lifecycle      1.0.0      2021-02-15 [1] CRAN (R 4.0.5)                       
 listenv        0.8.0      2019-12-05 [1] CRAN (R 4.0.5)                       
 lubridate      1.7.10     2021-02-26 [1] CRAN (R 4.0.5)                       
 magrittr     * 2.0.1      2020-11-17 [1] CRAN (R 4.0.5)                       
 MASS           7.3-53.1   2021-02-12 [2] CRAN (R 4.0.5)                       
 Matrix         1.3-2      2021-01-06 [2] CRAN (R 4.0.5)                       
 memoise        2.0.0      2021-01-26 [1] CRAN (R 4.0.5)                       
 memuse         4.1-0      2020-02-17 [1] CRAN (R 4.0.3)                       
 mlr            2.19.0     2021-02-22 [1] CRAN (R 4.0.5)                       
 modeldata    * 0.1.0      2020-10-22 [1] CRAN (R 4.0.5)                       
 modelr         0.1.8      2020-05-19 [1] CRAN (R 4.0.5)                       
 munsell        0.5.0      2018-06-12 [1] CRAN (R 4.0.5)                       
 nnet           7.3-15     2021-01-24 [2] CRAN (R 4.0.5)                       
 openssl        1.4.4      2021-04-30 [1] CRAN (R 4.0.5)                       
 openxlsx       4.2.3      2020-10-27 [1] CRAN (R 4.0.5)                       
 pacman       * 0.5.1      2019-03-11 [1] CRAN (R 4.0.5)                       
 parallelly     1.25.0     2021-04-30 [1] CRAN (R 4.0.5)                       
 parallelMap    1.5.0      2020-03-26 [1] CRAN (R 4.0.5)                       
 ParamHelpers   1.14       2020-03-24 [1] CRAN (R 4.0.5)                       
 parsnip      * 0.1.5      2021-01-19 [1] CRAN (R 4.0.5)                       
 pillar         1.6.1      2021-05-16 [1] CRAN (R 4.0.5)                       
 pkgconfig      2.0.3      2019-09-22 [1] CRAN (R 4.0.5)                       
 plyr           1.8.6      2020-03-03 [1] CRAN (R 4.0.5)                       
 png            0.1-7      2013-12-03 [1] CRAN (R 4.0.3)                       
 prettycode     1.1.0      2019-12-16 [1] CRAN (R 4.0.5)                       
 pROC           1.17.0.1   2021-01-13 [1] CRAN (R 4.0.5)                       
 prodlim        2019.11.13 2019-11-17 [1] CRAN (R 4.0.5)                       
 prompt         1.0.1      2021-03-12 [1] CRAN (R 4.0.5)                       
 purrr        * 0.3.4      2020-04-17 [1] CRAN (R 4.0.5)                       
 R6             2.5.0      2020-10-28 [1] CRAN (R 4.0.5)                       
 RANN           2.6.1      2019-01-08 [1] CRAN (R 4.0.5)                       
 rappdirs       0.3.3      2021-01-31 [1] CRAN (R 4.0.5)                       
 RColorBrewer   1.1-2      2014-12-07 [1] CRAN (R 4.0.3)                       
 Rcpp           1.0.6      2021-01-15 [1] CRAN (R 4.0.5)                       
 readr        * 1.4.0      2020-10-05 [1] CRAN (R 4.0.5)                       
 readxl         1.3.1      2019-03-13 [1] CRAN (R 4.0.5)                       
 recipes      * 0.1.16     2021-04-16 [1] CRAN (R 4.0.5)                       
 remotes        2.3.0      2021-04-01 [1] CRAN (R 4.0.5)                       
 reprex         2.0.0      2021-04-02 [1] CRAN (R 4.0.5)                       
 rio            0.5.26     2021-03-01 [1] CRAN (R 4.0.5)                       
 rlang        * 0.4.11     2021-04-30 [1] CRAN (R 4.0.5)                       
 ROSE           0.0-3      2014-07-15 [1] CRAN (R 4.0.5)                       
 rpart          4.1-15     2019-04-12 [2] CRAN (R 4.0.5)                       
 rprofile       0.1.7      2021-05-10 [1] Github (csgillespie/rprofile@61dca21)
 rsample      * 0.1.0      2021-05-08 [1] CRAN (R 4.0.3)                       
 rsthemes       0.2.1.9000 2021-05-13 [1] Github (gadenbuie/rsthemes@19299e5)  
 rstudioapi     0.13       2020-11-12 [1] CRAN (R 4.0.5)                       
 rvest          1.0.0      2021-03-09 [1] CRAN (R 4.0.5)                       
 scales       * 1.1.1      2020-05-11 [1] CRAN (R 4.0.5)                       
 sessioninfo    1.1.1      2018-11-05 [1] CRAN (R 4.0.5)                       
 stringi        1.5.3      2020-09-09 [1] CRAN (R 4.0.3)                       
 stringr      * 1.4.0      2019-02-10 [1] CRAN (R 4.0.5)                       
 survival       3.2-10     2021-03-16 [2] CRAN (R 4.0.5)                       
 sys            3.4        2020-07-23 [1] CRAN (R 4.0.5)                       
 themis         0.1.3      2020-11-12 [1] CRAN (R 4.0.5)                       
 tibble       * 3.1.1      2021-04-18 [1] CRAN (R 4.0.5)                       
 tidymodels   * 0.1.3      2021-04-19 [1] CRAN (R 4.0.5)                       
 tidyr        * 1.1.3      2021-03-03 [1] CRAN (R 4.0.5)                       
 tidyselect     1.1.1      2021-04-30 [1] CRAN (R 4.0.5)                       
 tidyverse    * 1.3.1      2021-04-15 [1] CRAN (R 4.0.5)                       
 timeDate       3043.102   2018-02-21 [1] CRAN (R 4.0.5)                       
 tune         * 0.1.5      2021-04-23 [1] CRAN (R 4.0.5)                       
 unbalanced     2.0        2015-06-26 [1] CRAN (R 4.0.5)                       
 usethis        2.0.1      2021-02-10 [1] CRAN (R 4.0.5)                       
 utf8           1.2.1      2021-03-12 [1] CRAN (R 4.0.5)                       
 vctrs        * 0.3.8      2021-04-29 [1] CRAN (R 4.0.5)                       
 withr          2.4.2      2021-04-18 [1] CRAN (R 4.0.5)                       
 workflows    * 0.2.2      2021-03-10 [1] CRAN (R 4.0.5)                       
 workflowsets * 0.0.2      2021-04-16 [1] CRAN (R 4.0.5)                       
 xaringan       0.20       2021-03-04 [1] CRAN (R 4.0.5)                       
 xfun           0.22       2021-03-11 [1] CRAN (R 4.0.5)                       
 xgboost      * 1.4.1.1    2021-04-22 [1] CRAN (R 4.0.5)                       
 xml2           1.3.2      2020-04-23 [1] CRAN (R 4.0.5)                       
 yardstick    * 0.0.8      2021-03-28 [1] CRAN (R 4.0.5)                       
 zip            2.1.1      2020-08-27 [1] CRAN (R 4.0.5)                       

[1] C:/Users/Joe/R/win-library/4.0
[2] C:/Program Files/R/R-4.0.5/library

Solution

  • You definitely were passing the vector of variables correctly to the recipe -- no problem there!

    You were running into other problems with your model fitting. An xgboost model requires all predictors to be numeric, so if you convert something like zip code to factors, you need to then use step_dummy(). If you have something of high cardinality like zip codes, you probably will need to handle new levels or unknown levels as well.

    library(magrittr)
    library(tidyverse)
    library(tidymodels)
    #> Registered S3 method overwritten by 'tune':
    #>   method                   from   
    #>   required_pkgs.model_spec parsnip
    
    # Toy data: mtcars plus two random integer columns that play the role of
    # numeric codes which should be modelled as categories.
    mtcars1 <- mtcars %>%
      dplyr::mutate(
        new1 = sample.int(10, 32, replace = TRUE),
        new2 = sample.int(5, 32, replace = TRUE)
      )
    
    # Names of the columns to convert to factors inside the recipe.
    my_categorical <- c("new1", "new2")
    
    # Train/test split, then 3-fold cross-validation on the training portion.
    mtcars_split <- initial_split(mtcars1)
    train <- training(mtcars_split)
    test <- testing(mtcars_split)
    cv_folds <- vfold_cv(train, v = 3)
    
    
    # Preprocessing, left unprepped:
    #   1. cast the selected columns to character, then to factor;
    #   2. add handling for novel and unknown factor levels;
    #   3. dummy-encode every nominal predictor, since xgboost requires all
    #      predictors to be numeric.
    rec <- recipe(drat ~ ., data = train) %>%
      step_mutate_at(all_of(my_categorical), fn = ~as.character(.)) %>%
      step_string2factor(all_of(my_categorical)) %>%
      step_novel(all_nominal_predictors()) %>%
      step_unknown(all_nominal_predictors()) %>%
      step_dummy(all_nominal_predictors())
    
    # Boosted-tree specification fitted with the xgboost engine; the number of
    # trees is marked for tuning. The outcome `drat` is numeric, so the mode is
    # declared once, as "regression", rather than being created as
    # "classification" and then overridden.
    xgboost_model <-
      boost_tree(
        mode = "regression",
        trees = tune()
      ) %>%
      set_engine("xgboost")
    
    
    # Bundle the recipe and the model specification into a single workflow.
    xgboost_workflow <- add_model(add_recipe(workflow(), rec), xgboost_model)
    
    # Tune over the cross-validation folds; `grid = 5` asks for a grid of
    # five candidates instead of supplying an explicit grid.
    tune_grid(xgboost_workflow, resamples = cv_folds, grid = 5)
    #> # Tuning results
    #> # 3-fold cross-validation 
    #> # A tibble: 3 x 4
    #>   splits         id    .metrics          .notes          
    #>   <list>         <chr> <list>            <list>          
    #> 1 <split [16/8]> Fold1 <tibble [10 × 5]> <tibble [0 × 1]>
    #> 2 <split [16/8]> Fold2 <tibble [10 × 5]> <tibble [0 × 1]>
    #> 3 <split [16/8]> Fold3 <tibble [10 × 5]> <tibble [0 × 1]>
    

    Created on 2021-06-25 by the reprex package (v2.0.0)

    I had to change a few other things in your example to get this to run, like using "regression" since drat is numeric, etc. I recommend checking out the reprex package so you can run an example like this in a fresh R session and get help more effectively.