So I set up my initial splitting of the data and create a recipe. All the columns except for the role and outcome columns are numeric.
# Split off a test set, stratifying the sampling on the outcome column.
# NOTE(review): set.seed() is only called further down, after initial_split()
# has already drawn its random rows, so this train/test partition is not
# reproducible between runs -- only the CV folds are.
data_split <- initial_split(data = my_table_of_stuff, strata = predict_path)
train_data <- training(data_split)
test_data <- testing(data_split)

# 5-fold cross-validation on the training portion, stratified the same way.
set.seed(100)
cv_folds <- vfold_cv(data = train_data, v = 5, strata = predict_path)
# Recipe: predict_path is the outcome and every other column starts out as a
# predictor. The five annotation columns are then re-assigned to the "ID" role
# in a single update_role() call.
# NOTE(review): there is no imputation step, so any NA in the numeric columns
# is passed through to the model untouched.
my_recip <-
  recipe(predict_path ~ ., data = train_data) %>%
  update_role(
    sample_id, pathology, disease, tissue_clean, mutations,
    new_role = "ID"
  ) %>%
  # Drop zero-variance numeric columns, then centre and scale what is left.
  step_zv(all_numeric(), -all_outcomes()) %>%
  step_normalize(all_numeric(), -all_outcomes())
My specification for classification and the workflow
# Random forest classification spec using the ranger engine;
# importance = "impurity" is handed to the engine via set_engine().
rf_spec <- rand_forest(mode = "classification") %>%
  set_engine("ranger", importance = "impurity")

# Workflow that bundles the recipe with the model spec.
rf_wflow <- workflow() %>%
  add_recipe(my_recip) %>%
  add_model(rf_spec)
And then finally running the model on the cross-validation folds gets me
# Fit the workflow on each cross-validation fold, compute the listed
# classification metrics, and keep the predictions (save_pred = TRUE).
rf_res <- fit_resamples(
  rf_wflow,
  resamples = cv_folds,
  metrics = metric_set(
    recall, precision, f_meas, accuracy, kap, roc_auc, sens, spec
  ),
  control = control_resamples(save_pred = TRUE)
)
Error in `estimate_tune_results()`:
! All of the models failed. See the .notes column.
Run `rlang::last_error()` to see where the error occurred.
Warning message:
All models failed. Run `show_notes(.Last.tune.result)` for more information.
with a missing data error across the models
unique notes:
──────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────
Error: Missing data in columns: LRP8_1, DNAJC12_2, DNAJC12_1, KCNIP2_1, KNDC1_2, ...
I'm not coding for any categorical values here, but I am removing any zero variance variables, so, maybe that's it?
You are getting an error because the variables LRP8_1, DNAJC12_2, DNAJC12_1, KCNIP2_1, KNDC1_2, ...
contain one or more missing values, which the ranger
model doesn't support.
I like to use the skimr package to find what variables contain missing values.
Without being able to see what kind of data you have, I can't advise how to deal with it, but {recipes} does have several steps that perform imputation.
Below is your reprex with fake data. Notice how it errors about num4
and num5
since they have missing values in them.
library(tidymodels)
# Fake stand-in data with 100 rows: a factor outcome drawn from 1:2, five
# character annotation columns, and five numeric columns. The last value of
# num4 and of num5 is NA so that the missing-data error can be triggered.
# NOTE(review): no seed is set before these random draws, so the table differs
# on every run.
my_table_of_stuff <- tibble(
  predict_path = factor(sample(1:2, size = 100, replace = TRUE)),
  sample_id = sample(letters, size = 100, replace = TRUE),
  pathology = sample(letters, size = 100, replace = TRUE),
  disease = sample(letters, size = 100, replace = TRUE),
  tissue_clean = sample(letters, size = 100, replace = TRUE),
  mutations = sample(letters, size = 100, replace = TRUE),
  num1 = rnorm(n = 100),
  num2 = rnorm(n = 100),
  num3 = rnorm(n = 100),
  num4 = c(rnorm(n = 99), NA),
  num5 = c(rnorm(n = 99), NA)
)
# Split off a test set, stratifying the sampling on the outcome column.
# NOTE(review): set.seed() is only called further down, after initial_split()
# has already drawn its random rows, so this train/test partition is not
# reproducible between runs -- only the CV folds are.
data_split <- initial_split(data = my_table_of_stuff, strata = predict_path)
train_data <- training(data_split)
test_data <- testing(data_split)

# 5-fold cross-validation on the training portion, stratified the same way.
set.seed(100)
cv_folds <- vfold_cv(data = train_data, v = 5, strata = predict_path)
# Recipe: predict_path is the outcome and every other column starts out as a
# predictor. The five annotation columns are then re-assigned to the "ID" role
# in a single update_role() call.
# NOTE(review): there is no imputation step, so the NA values in num4 and num5
# are passed through to the model untouched.
my_recip <-
  recipe(predict_path ~ ., data = train_data) %>%
  update_role(
    sample_id, pathology, disease, tissue_clean, mutations,
    new_role = "ID"
  ) %>%
  # Drop zero-variance numeric columns, then centre and scale what is left.
  step_zv(all_numeric(), -all_outcomes()) %>%
  step_normalize(all_numeric(), -all_outcomes())
# Random forest classification spec using the ranger engine;
# importance = "impurity" is handed to the engine via set_engine().
rf_spec <- rand_forest(mode = "classification") %>%
  set_engine("ranger", importance = "impurity")

# Workflow that bundles the recipe with the model spec.
rf_wflow <- workflow() %>%
  add_recipe(my_recip) %>%
  add_model(rf_spec)
# Fit the workflow on each cross-validation fold, compute the listed
# classification metrics, and keep the predictions (save_pred = TRUE).
rf_res <- fit_resamples(
  rf_wflow,
  resamples = cv_folds,
  metrics = metric_set(
    recall, precision, f_meas, accuracy, kap, roc_auc, sens, spec
  ),
  control = control_resamples(save_pred = TRUE)
)
#> x Fold1: preprocessor 1/1, model 1/1: Error: Missing data in columns: num4, num5.
#> x Fold2: preprocessor 1/1, model 1/1: Error: Missing data in columns: num4, num5.
#> x Fold3: preprocessor 1/1, model 1/1: Error: Missing data in columns: num4, num5.
#> x Fold4: preprocessor 1/1, model 1/1 (predictions): Error: Missing data in columns: num4, num5.
#> x Fold5: preprocessor 1/1, model 1/1: Error: Missing data in columns: num4, num5.
#> Warning: All models failed. Run `show_notes(.Last.tune.result)` for more
#> information.
Below I have taken the same code and data, and added step_impute_mean()
at the end of the recipe. And it now runs clean. Notice how I am able to use all_numeric_predictors()
instead of all_numeric(), -all_outcomes()
.
library(tidymodels)
# Fake stand-in data with 100 rows: a factor outcome drawn from 1:2, five
# character annotation columns, and five numeric columns. The last value of
# num4 and of num5 is NA, so the recipe has missing values to deal with.
# NOTE(review): no seed is set before these random draws, so the table differs
# on every run.
my_table_of_stuff <- tibble(
  predict_path = factor(sample(1:2, size = 100, replace = TRUE)),
  sample_id = sample(letters, size = 100, replace = TRUE),
  pathology = sample(letters, size = 100, replace = TRUE),
  disease = sample(letters, size = 100, replace = TRUE),
  tissue_clean = sample(letters, size = 100, replace = TRUE),
  mutations = sample(letters, size = 100, replace = TRUE),
  num1 = rnorm(n = 100),
  num2 = rnorm(n = 100),
  num3 = rnorm(n = 100),
  num4 = c(rnorm(n = 99), NA),
  num5 = c(rnorm(n = 99), NA)
)
# Split off a test set, stratifying the sampling on the outcome column.
# NOTE(review): set.seed() is only called further down, after initial_split()
# has already drawn its random rows, so this train/test partition is not
# reproducible between runs -- only the CV folds are.
data_split <- initial_split(data = my_table_of_stuff, strata = predict_path)
train_data <- training(data_split)
test_data <- testing(data_split)

# 5-fold cross-validation on the training portion, stratified the same way.
set.seed(100)
cv_folds <- vfold_cv(data = train_data, v = 5, strata = predict_path)
# Recipe: predict_path is the outcome and every other column starts out as a
# predictor. The five annotation columns are then re-assigned to the "ID" role
# in a single update_role() call, which lets the steps below select with
# all_numeric_predictors().
my_recip <-
  recipe(predict_path ~ ., data = train_data) %>%
  update_role(
    sample_id, pathology, disease, tissue_clean, mutations,
    new_role = "ID"
  ) %>%
  # Drop zero-variance numeric predictors, then centre and scale the rest.
  step_zv(all_numeric_predictors()) %>%
  step_normalize(all_numeric_predictors()) %>%
  # Fill missing numeric predictor values with the column mean so that no NA
  # reaches the ranger model.
  step_impute_mean(all_numeric_predictors())
# Random forest classification spec using the ranger engine;
# importance = "impurity" is handed to the engine via set_engine().
rf_spec <- rand_forest(mode = "classification") %>%
  set_engine("ranger", importance = "impurity")

# Workflow that bundles the recipe with the model spec.
rf_wflow <- workflow() %>%
  add_recipe(my_recip) %>%
  add_model(rf_spec)
# Fit the workflow on each cross-validation fold, compute the listed
# classification metrics, and keep the predictions (save_pred = TRUE).
rf_res <- fit_resamples(
  rf_wflow,
  resamples = cv_folds,
  metrics = metric_set(
    recall, precision, f_meas, accuracy, kap, roc_auc, sens, spec
  ),
  control = control_resamples(save_pred = TRUE)
)

# Print the resampling results.
rf_res
#> # Resampling results
#> # 5-fold cross-validation using stratification
#> # A tibble: 5 × 5
#> splits id .metrics .notes .predictions
#> <list> <chr> <list> <list> <list>
#> 1 <split [59/15]> Fold1 <tibble [8 × 4]> <tibble [0 × 3]> <tibble [15 × 6]>
#> 2 <split [59/15]> Fold2 <tibble [8 × 4]> <tibble [0 × 3]> <tibble [15 × 6]>
#> 3 <split [59/15]> Fold3 <tibble [8 × 4]> <tibble [0 × 3]> <tibble [15 × 6]>
#> 4 <split [59/15]> Fold4 <tibble [8 × 4]> <tibble [0 × 3]> <tibble [15 × 6]>
#> 5 <split [60/14]> Fold5 <tibble [8 × 4]> <tibble [0 × 3]> <tibble [14 × 6]>
Created on 2022-09-19 by the reprex package (v2.0.1)