r machine-learning logistic-regression tidymodels

Removing strata variable from predictions results

I built a prediction model using logistic regression which works well. But when I analyze the estimates calculated on the test dataset, I can see the variable I used to stratify the split comes up when I want it to be excluded of the model as a predictor. update_role() doesn't do that...

data_split <- initial_split(mldata, prop = 3/4, strata = strata_var)
# Create training and testing datasets:
train_data <- training(data_split)
test_data  <- testing(data_split)
# Build model
mldata_recipe <-
  recipe(vital ~ ., data = train_data)  %>% 
  update_role(ids, new_role = "ID") %>%
  update_role(strata_var, new_role = "strata") %>% 
  step_zv(all_predictors()) %>%
  step_unknown(all_nominal_predictors()) %>% 
  step_dummy(all_nominal(), -all_outcomes()) %>%
  step_smote(vital)

set.seed(456)
# 10 fold cross validation
mldata_folds <- vfold_cv(train_data, strata = strata_var)

glmnet_spec <- 
  logistic_reg(penalty = tune(), mixture = tune()) %>% 
  set_mode("classification") %>% 
  set_engine("glmnet") 

glmnet_workflow <- 
  workflow() %>% 
  add_recipe(mldata_recipe) %>% 
  add_model(glmnet_spec) 

glmnet_grid <- tidyr::crossing(penalty = 10^seq(-6, -1, length.out = 20), mixture = c(0, 0.05, 
                                                                                      0.2, 0.4, 0.6, 0.8, 1)) 
set.seed(789)
glmnet_tune <- 
  tune_grid(glmnet_workflow, resamples = mldata_folds, grid = glmnet_grid) 
final_glmnet <- glmnet_workflow %>% 
  finalize_workflow(select_best(glmnet_tune, "roc_auc")) 

glmnet_results <- final_glmnet %>% 
  fit_resamples(
    resamples = mldata_folds,
    metrics = metric_set(roc_auc, accuracy, sensitivity, specificity),
    control = control_resamples(save_pred = TRUE)
  )

set.seed(789)
final_fit <- final_glmnet %>% 
  last_fit(data_split)

final_fit %>% 
  pull(.workflow) %>% 
  pluck(1) %>% 
  tidy() %>% 
  filter(term != "(Intercept)") %>% 
  arrange(desc(abs(estimate))) %>% 
  filter(abs(estimate) >0) %>% 
  ggplot(aes(estimate, fct_reorder(term, desc(estimate)), color = estimate > 0))+
  geom_vline(xintercept = 0, color = "lightgrey", lty = 2, size = 1.2) +
  geom_point() + 
  scale_color_discrete(name = "Variable Effect \non outcome", labels = c("Deleterious", "Beneficial")) +
  theme_minimal()+
  ggtitle("Meaningful Parameter Estimate Coefficients using logistic regression model")

In the last plot I can see the strata variable coming up.

Solution

You got this result because of the combination of role selection functions you used in step_dummy(). (full reprex at the end of post)

You used the following selections. Which selects all nominal, but not any outcomes. This selected the strata variables because it is both a nominal variable and not an outcome.

all_nominal(), -all_outcomes()

A better option would be to use all_nominal_predictors() which won't select id/strata variables.

library(tidymodels)
data("penguins")

rec_spec1 <- recipe(species ~ island + body_mass_g, data = penguins) %>% 
  update_role(island, new_role = "strata") %>% 
  step_dummy(all_nominal(), -all_outcomes())

rec_spec1 %>%
  prep() %>%
  bake(new_data = NULL)
#> # A tibble: 344 × 4
#>    body_mass_g species island_Dream island_Torgersen
#>          <int> <fct>          <dbl>            <dbl>
#>  1        3750 Adelie             0                1
#>  2        3800 Adelie             0                1
#>  3        3250 Adelie             0                1
#>  4          NA Adelie             0                1
#>  5        3450 Adelie             0                1
#>  6        3650 Adelie             0                1
#>  7        3625 Adelie             0                1
#>  8        4675 Adelie             0                1
#>  9        3475 Adelie             0                1
#> 10        4250 Adelie             0                1
#> # … with 334 more rows

rec_spec2 <- recipe(species ~ island + body_mass_g, data = penguins) %>% 
  update_role(island, new_role = "strata") %>% 
  step_dummy(all_nominal_predictors())

rec_spec2 %>%
  prep() %>%
  bake(new_data = NULL)
#> # A tibble: 344 × 3
#>    island    body_mass_g species
#>    <fct>           <int> <fct>  
#>  1 Torgersen        3750 Adelie 
#>  2 Torgersen        3800 Adelie 
#>  3 Torgersen        3250 Adelie 
#>  4 Torgersen          NA Adelie 
#>  5 Torgersen        3450 Adelie 
#>  6 Torgersen        3650 Adelie 
#>  7 Torgersen        3625 Adelie 
#>  8 Torgersen        4675 Adelie 
#>  9 Torgersen        3475 Adelie 
#> 10 Torgersen        4250 Adelie 
#> # … with 334 more rows

Full reprex

library(tidymodels)
library(themis)
library(forcats)
data("penguins")

penguins0 <- penguins %>%
  mutate(ids = row_number(),
         species = factor(species == "Adelie")) %>%
  drop_na()

data_split <- initial_split(penguins0, prop = 3/4, strata = island)
# Create training and testing datasets:
train_data <- training(data_split)
test_data  <- testing(data_split)
# Build model
mldata_recipe <-
  recipe(species ~ ., data = train_data)  %>% 
  update_role(ids, new_role = "ID") %>%
  update_role(island, new_role = "strata") %>% 
  step_zv(all_predictors()) %>%
  step_unknown(all_nominal_predictors()) %>% 
  step_dummy(all_nominal_predictors()) %>%
  step_smote(species)

set.seed(456)
# 10 fold cross validation
mldata_folds <- vfold_cv(train_data, strata = island)

glmnet_spec <- 
  logistic_reg(penalty = tune(), mixture = tune()) %>% 
  set_mode("classification") %>% 
  set_engine("glmnet") 

glmnet_workflow <- 
  workflow() %>% 
  add_recipe(mldata_recipe) %>% 
  add_model(glmnet_spec) 

glmnet_grid <- tidyr::crossing(penalty = 10^seq(-6, -1, length.out = 20), 
                               mixture = c(0, 0.05, 0.2, 0.4, 0.6, 0.8, 1)) 
set.seed(789)
glmnet_tune <- 
  tune_grid(glmnet_workflow, resamples = mldata_folds, grid = glmnet_grid) 
final_glmnet <- glmnet_workflow %>% 
  finalize_workflow(select_best(glmnet_tune, "roc_auc")) 

glmnet_results <- final_glmnet %>% 
  fit_resamples(
    resamples = mldata_folds,
    metrics = metric_set(roc_auc, accuracy, sensitivity, specificity),
    control = control_resamples(save_pred = TRUE)
  )

set.seed(789)
final_fit <- final_glmnet %>% 
  last_fit(data_split)

final_fit %>% 
  pull(.workflow) %>% 
  pluck(1) %>% 
  tidy() %>% 
  filter(term != "(Intercept)") %>% 
  arrange(desc(abs(estimate))) %>% 
  filter(abs(estimate) >0) %>% 
  ggplot(aes(estimate, fct_reorder(term, desc(estimate)), color = estimate > 0))+
  geom_vline(xintercept = 0, color = "lightgrey", lty = 2, size = 1.2) +
  geom_point() + 
  scale_color_discrete(name = "Variable Effect \non outcome", labels = c("Deleterious", "Beneficial")) +
  theme_minimal()+
  ggtitle("Meaningful Parameter Estimate Coefficients using logistic regression model")

^{Created on 2021-08-20 by the reprex package (v2.0.1)}