Missing data in columns error during tidymodels classification with cvfolds

So I set up my initial splitting of the data and create a recipe. All the columns except for the role and outcome columns are numeric.

# Seed the RNG before ANY random step. initial_split() and vfold_cv() both
# draw random numbers, so seeding only after the split would leave the
# train/test partition different on every run.
set.seed(100)

# Train/test split, stratified on the outcome.
data_split <- initial_split(my_table_of_stuff, strata = predict_path)

train_data <- training(data_split)
test_data <- testing(data_split)

# Five-fold cross-validation on the training set, stratified on the outcome.
cv_folds <- vfold_cv(train_data, v = 5, strata = predict_path)

# predict_path is the outcome and every other column starts as a predictor.
# The five annotation columns are re-labelled "ID" in a single update_role()
# call so they are carried along but not used by the model. The numeric
# non-outcome columns are then zero-variance filtered and normalized.
my_recip <- recipe(predict_path ~ ., data = train_data) %>%
    update_role(
        sample_id, pathology, disease, tissue_clean, mutations,
        new_role = "ID"
    ) %>%
    step_zv(all_numeric(), -all_outcomes()) %>%
    step_normalize(all_numeric(), -all_outcomes())

My specification for classification and the workflow

# Random forest classifier fitted through the ranger engine; impurity-based
# variable importance is requested from the engine.
rf_spec <- rand_forest(mode = "classification") %>%
    set_engine("ranger", importance = "impurity")

# Bundle the preprocessing recipe and the model specification together.
rf_wflow <- workflow() %>%
    add_recipe(my_recip) %>%
    add_model(rf_spec)

And then finally running the model on the cross-validation folds gets me

# Fit the workflow on every resample, score it with the listed yardstick
# metrics, and keep the out-of-sample predictions.
rf_res <- fit_resamples(
    rf_wflow,
    resamples = cv_folds,
    metrics = metric_set(
        recall, precision, f_meas,
        accuracy, kap,
        roc_auc, sens, spec
    ),
    control = control_resamples(save_pred = TRUE)
)

Error in `estimate_tune_results()`:
! All of the models failed. See the .notes column.
Run `rlang::last_error()` to see where the error occurred.
Warning message:
All models failed. Run `show_notes(.Last.tune.result)` for more information.

with a missing data error across the models

unique notes:
──────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────
Error: Missing data in columns: LRP8_1, DNAJC12_2, DNAJC12_1, KCNIP2_1, KNDC1_2, ...

I'm not coding for any categorical values here, but I am removing any zero variance variables, so, maybe that's it?

Solution

You are getting an error because the variables LRP8_1, DNAJC12_2, DNAJC12_1, KCNIP2_1, KNDC1_2, ... contain one or more missing values, which the ranger model doesn't support.

I like to use the skimr package to find what variables contain missing values.

Without being able to see what kind of data you have I can't advise how to deal with it, but {recipes} does have several steps that perform imputation.

Below is your reprex with fake data. Notice how it errors about num4 and num5 since they have missing values in them.

library(tidymodels)

# Simulated stand-in for the real data: a two-level factor outcome, five
# character columns of random letters, and five numeric columns. The last row
# of num4 and num5 is missing on purpose.
my_table_of_stuff <- tibble(
  predict_path = factor(sample(1:2, size = 100, replace = TRUE)),
  sample_id    = sample(letters, size = 100, replace = TRUE),
  pathology    = sample(letters, size = 100, replace = TRUE),
  disease      = sample(letters, size = 100, replace = TRUE),
  tissue_clean = sample(letters, size = 100, replace = TRUE),
  mutations    = sample(letters, size = 100, replace = TRUE),
  num1 = rnorm(n = 100),
  num2 = rnorm(n = 100),
  num3 = rnorm(n = 100),
  num4 = c(rnorm(n = 99), NA_real_),
  num5 = c(rnorm(n = 99), NA_real_)
)

# Train/test split, stratified on the outcome.
data_split <- initial_split(my_table_of_stuff, strata = predict_path)

train_data <- training(data_split)
test_data <- testing(data_split)

set.seed(100)

# Five-fold cross-validation on the training set, stratified on the outcome.
cv_folds <- vfold_cv(train_data, v = 5, strata = predict_path)

# predict_path is the outcome; the five letter columns are re-labelled "ID"
# so they are not used as predictors. The numeric non-outcome columns are
# then zero-variance filtered and normalized.
my_recip <- recipe(predict_path ~ ., data = train_data) %>%
  update_role(
    sample_id, pathology, disease, tissue_clean, mutations,
    new_role = "ID"
  ) %>%
  step_zv(all_numeric(), -all_outcomes()) %>%
  step_normalize(all_numeric(), -all_outcomes())

# Random forest classifier fitted through the ranger engine; impurity-based
# variable importance is requested from the engine.
rf_spec <- rand_forest(mode = "classification") %>%
  set_engine("ranger", importance = "impurity")

# Bundle the preprocessing recipe and the model specification together.
rf_wflow <- workflow() %>%
  add_recipe(my_recip) %>%
  add_model(rf_spec)

# Fit the workflow on every resample, score it with the listed yardstick
# metrics, and keep the out-of-sample predictions.
rf_res <- fit_resamples(
  rf_wflow,
  resamples = cv_folds,
  metrics = metric_set(
    recall, precision, f_meas,
    accuracy, kap,
    roc_auc, sens, spec
  ),
  control = control_resamples(save_pred = TRUE)
)
#> x Fold1: preprocessor 1/1, model 1/1: Error: Missing data in columns: num4, num5.
#> x Fold2: preprocessor 1/1, model 1/1: Error: Missing data in columns: num4, num5.
#> x Fold3: preprocessor 1/1, model 1/1: Error: Missing data in columns: num4, num5.
#> x Fold4: preprocessor 1/1, model 1/1 (predictions): Error: Missing data in columns: num4, num5.
#> x Fold5: preprocessor 1/1, model 1/1: Error: Missing data in columns: num4, num5.
#> Warning: All models failed. Run `show_notes(.Last.tune.result)` for more
#> information.

Below I have taken the same code and data and added step_impute_mean() at the end of the recipe, and it now runs cleanly. Notice how I am able to use all_numeric_predictors() instead of all_numeric(), -all_outcomes().

library(tidymodels)

# Simulated stand-in for the real data: a two-level factor outcome, five
# character columns of random letters, and five numeric columns. The last row
# of num4 and num5 is missing on purpose.
my_table_of_stuff <- tibble(
  predict_path = factor(sample(1:2, size = 100, replace = TRUE)),
  sample_id    = sample(letters, size = 100, replace = TRUE),
  pathology    = sample(letters, size = 100, replace = TRUE),
  disease      = sample(letters, size = 100, replace = TRUE),
  tissue_clean = sample(letters, size = 100, replace = TRUE),
  mutations    = sample(letters, size = 100, replace = TRUE),
  num1 = rnorm(n = 100),
  num2 = rnorm(n = 100),
  num3 = rnorm(n = 100),
  num4 = c(rnorm(n = 99), NA_real_),
  num5 = c(rnorm(n = 99), NA_real_)
)

# Train/test split, stratified on the outcome.
data_split <- initial_split(my_table_of_stuff, strata = predict_path)

train_data <- training(data_split)
test_data <- testing(data_split)

set.seed(100)

# Five-fold cross-validation on the training set, stratified on the outcome.
cv_folds <- vfold_cv(train_data, v = 5, strata = predict_path)

# predict_path is the outcome; the five letter columns are re-labelled "ID"
# so they are not used as predictors. Numeric predictors are zero-variance
# filtered and normalized, and the final step replaces their remaining
# missing values with the column mean so the model never sees an NA.
my_recip <- recipe(predict_path ~ ., data = train_data) %>%
  update_role(
    sample_id, pathology, disease, tissue_clean, mutations,
    new_role = "ID"
  ) %>%
  step_zv(all_numeric_predictors()) %>%
  step_normalize(all_numeric_predictors()) %>%
  step_impute_mean(all_numeric_predictors())

# Random forest classifier fitted through the ranger engine; impurity-based
# variable importance is requested from the engine.
rf_spec <- rand_forest(mode = "classification") %>%
  set_engine("ranger", importance = "impurity")

# Bundle the preprocessing recipe and the model specification together.
rf_wflow <- workflow() %>%
  add_recipe(my_recip) %>%
  add_model(rf_spec)

# Fit the workflow on every resample, score it with the listed yardstick
# metrics, and keep the out-of-sample predictions.
rf_res <- fit_resamples(
  rf_wflow,
  resamples = cv_folds,
  metrics = metric_set(
    recall, precision, f_meas,
    accuracy, kap,
    roc_auc, sens, spec
  ),
  control = control_resamples(save_pred = TRUE)
)

# Print the resampling results.
rf_res
#> # Resampling results
#> # 5-fold cross-validation using stratification 
#> # A tibble: 5 × 5
#>   splits          id    .metrics         .notes           .predictions     
#>   <list>          <chr> <list>           <list>           <list>           
#> 1 <split [59/15]> Fold1 <tibble [8 × 4]> <tibble [0 × 3]> <tibble [15 × 6]>
#> 2 <split [59/15]> Fold2 <tibble [8 × 4]> <tibble [0 × 3]> <tibble [15 × 6]>
#> 3 <split [59/15]> Fold3 <tibble [8 × 4]> <tibble [0 × 3]> <tibble [15 × 6]>
#> 4 <split [59/15]> Fold4 <tibble [8 × 4]> <tibble [0 × 3]> <tibble [15 × 6]>
#> 5 <split [60/14]> Fold5 <tibble [8 × 4]> <tibble [0 × 3]> <tibble [14 × 6]>

Created on 2022-09-19 by the reprex package (v2.0.1)