Search code examples
rmachine-learninglogistic-regressiontidymodels

Removing strata variable from predictions results


I built a prediction model using logistic regression which works well. But when I analyze the estimates calculated on the test dataset, I can see the variable I used to stratify the split comes up when I want it to be excluded of the model as a predictor. update_role() doesn't do that...

data_split <- initial_split(mldata, prop = 3/4, strata = strata_var)
# Create training and testing datasets:
train_data <- training(data_split)
test_data  <- testing(data_split)
# Build model
mldata_recipe <-
  recipe(vital ~ ., data = train_data)  %>% 
  update_role(ids, new_role = "ID") %>%
  update_role(strata_var, new_role = "strata") %>% 
  step_zv(all_predictors()) %>%
  step_unknown(all_nominal_predictors()) %>% 
  step_dummy(all_nominal(), -all_outcomes()) %>%
  step_smote(vital)

set.seed(456)
# 10 fold cross validation
mldata_folds <- vfold_cv(train_data, strata = strata_var)

glmnet_spec <- 
  logistic_reg(penalty = tune(), mixture = tune()) %>% 
  set_mode("classification") %>% 
  set_engine("glmnet") 

glmnet_workflow <- 
  workflow() %>% 
  add_recipe(mldata_recipe) %>% 
  add_model(glmnet_spec) 

glmnet_grid <- tidyr::crossing(penalty = 10^seq(-6, -1, length.out = 20), mixture = c(0, 0.05, 
                                                                                      0.2, 0.4, 0.6, 0.8, 1)) 
set.seed(789)
glmnet_tune <- 
  tune_grid(glmnet_workflow, resamples = mldata_folds, grid = glmnet_grid) 
final_glmnet <- glmnet_workflow %>% 
  finalize_workflow(select_best(glmnet_tune, "roc_auc")) 

glmnet_results <- final_glmnet %>% 
  fit_resamples(
    resamples = mldata_folds,
    metrics = metric_set(roc_auc, accuracy, sensitivity, specificity),
    control = control_resamples(save_pred = TRUE)
  )

set.seed(789)
final_fit <- final_glmnet %>% 
  last_fit(data_split)

final_fit %>% 
  pull(.workflow) %>% 
  pluck(1) %>% 
  tidy() %>% 
  filter(term != "(Intercept)") %>% 
  arrange(desc(abs(estimate))) %>% 
  filter(abs(estimate) >0) %>% 
  ggplot(aes(estimate, fct_reorder(term, desc(estimate)), color = estimate > 0))+
  geom_vline(xintercept = 0, color = "lightgrey", lty = 2, size = 1.2) +
  geom_point() + 
  scale_color_discrete(name = "Variable Effect \non outcome", labels = c("Deleterious", "Beneficial")) +
  theme_minimal()+
  ggtitle("Meaningful Parameter Estimate Coefficients using logistic regression model")

In the last plot I can see the strata variable coming up.


Solution

  • You got this result because of the combination of role selection functions you used in step_dummy(). (full reprex at the end of post)

    You used the following selections. Which selects all nominal, but not any outcomes. This selected the strata variables because it is both a nominal variable and not an outcome.

    all_nominal(), -all_outcomes()
    

    A better option would be to use all_nominal_predictors() which won't select id/strata variables.

    library(tidymodels)
    data("penguins")
    
    rec_spec1 <- recipe(species ~ island + body_mass_g, data = penguins) %>% 
      update_role(island, new_role = "strata") %>% 
      step_dummy(all_nominal(), -all_outcomes())
    
    rec_spec1 %>%
      prep() %>%
      bake(new_data = NULL)
    #> # A tibble: 344 × 4
    #>    body_mass_g species island_Dream island_Torgersen
    #>          <int> <fct>          <dbl>            <dbl>
    #>  1        3750 Adelie             0                1
    #>  2        3800 Adelie             0                1
    #>  3        3250 Adelie             0                1
    #>  4          NA Adelie             0                1
    #>  5        3450 Adelie             0                1
    #>  6        3650 Adelie             0                1
    #>  7        3625 Adelie             0                1
    #>  8        4675 Adelie             0                1
    #>  9        3475 Adelie             0                1
    #> 10        4250 Adelie             0                1
    #> # … with 334 more rows
    
    rec_spec2 <- recipe(species ~ island + body_mass_g, data = penguins) %>% 
      update_role(island, new_role = "strata") %>% 
      step_dummy(all_nominal_predictors())
    
    rec_spec2 %>%
      prep() %>%
      bake(new_data = NULL)
    #> # A tibble: 344 × 3
    #>    island    body_mass_g species
    #>    <fct>           <int> <fct>  
    #>  1 Torgersen        3750 Adelie 
    #>  2 Torgersen        3800 Adelie 
    #>  3 Torgersen        3250 Adelie 
    #>  4 Torgersen          NA Adelie 
    #>  5 Torgersen        3450 Adelie 
    #>  6 Torgersen        3650 Adelie 
    #>  7 Torgersen        3625 Adelie 
    #>  8 Torgersen        4675 Adelie 
    #>  9 Torgersen        3475 Adelie 
    #> 10 Torgersen        4250 Adelie 
    #> # … with 334 more rows
    

    Full reprex

    library(tidymodels)
    library(themis)
    library(forcats)
    data("penguins")
    
    penguins0 <- penguins %>%
      mutate(ids = row_number(),
             species = factor(species == "Adelie")) %>%
      drop_na()
    
    data_split <- initial_split(penguins0, prop = 3/4, strata = island)
    # Create training and testing datasets:
    train_data <- training(data_split)
    test_data  <- testing(data_split)
    # Build model
    mldata_recipe <-
      recipe(species ~ ., data = train_data)  %>% 
      update_role(ids, new_role = "ID") %>%
      update_role(island, new_role = "strata") %>% 
      step_zv(all_predictors()) %>%
      step_unknown(all_nominal_predictors()) %>% 
      step_dummy(all_nominal_predictors()) %>%
      step_smote(species)
    
    set.seed(456)
    # 10 fold cross validation
    mldata_folds <- vfold_cv(train_data, strata = island)
    
    glmnet_spec <- 
      logistic_reg(penalty = tune(), mixture = tune()) %>% 
      set_mode("classification") %>% 
      set_engine("glmnet") 
    
    glmnet_workflow <- 
      workflow() %>% 
      add_recipe(mldata_recipe) %>% 
      add_model(glmnet_spec) 
    
    glmnet_grid <- tidyr::crossing(penalty = 10^seq(-6, -1, length.out = 20), 
                                   mixture = c(0, 0.05, 0.2, 0.4, 0.6, 0.8, 1)) 
    set.seed(789)
    glmnet_tune <- 
      tune_grid(glmnet_workflow, resamples = mldata_folds, grid = glmnet_grid) 
    final_glmnet <- glmnet_workflow %>% 
      finalize_workflow(select_best(glmnet_tune, "roc_auc")) 
    
    glmnet_results <- final_glmnet %>% 
      fit_resamples(
        resamples = mldata_folds,
        metrics = metric_set(roc_auc, accuracy, sensitivity, specificity),
        control = control_resamples(save_pred = TRUE)
      )
    
    set.seed(789)
    final_fit <- final_glmnet %>% 
      last_fit(data_split)
    
    final_fit %>% 
      pull(.workflow) %>% 
      pluck(1) %>% 
      tidy() %>% 
      filter(term != "(Intercept)") %>% 
      arrange(desc(abs(estimate))) %>% 
      filter(abs(estimate) >0) %>% 
      ggplot(aes(estimate, fct_reorder(term, desc(estimate)), color = estimate > 0))+
      geom_vline(xintercept = 0, color = "lightgrey", lty = 2, size = 1.2) +
      geom_point() + 
      scale_color_discrete(name = "Variable Effect \non outcome", labels = c("Deleterious", "Beneficial")) +
      theme_minimal()+
      ggtitle("Meaningful Parameter Estimate Coefficients using logistic regression model")
    

    Created on 2021-08-20 by the reprex package (v2.0.1)