Tags: r, xgboost, tidymodels, shapley

Getting SHAP values for XGB model with workflowsets in Tidymodels


I'm trying to pass my model and the feature matrix to SHAPforxgboost, but I'm running into issues because I'm using a tunable recipe and a tunable model.

Most examples, including this one showing how tidymodels interfaces with SHAPforxgboost, have a step that requires prep() and bake(), but this is not possible with a tunable recipe, which is what I'm using.

Any guidance that can be shared would be greatly appreciated.
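For reference, here is a minimal sketch (a toy recipe on mtcars, not my actual data) of what I mean: a recipe that still contains a tune() placeholder can't be prepped, so the usual prep()/bake() route to a feature matrix isn't available until the parameter is finalized.

library(tidymodels)

# Toy example: deg_free is still a tune() placeholder
toy_rec <- recipe(mpg ~ ., data = mtcars) %>%
  step_ns(disp, deg_free = tune())

# prep(toy_rec) errors at this point because the spline degrees of freedom
# are not yet a concrete value, so bake() is also unavailable.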

# LOAD PACKAGES
if (!require("pacman")) install.packages("pacman")

pacman::p_load(
  tidymodels,
  tidyverse,
  doParallel,
  janitor,
  AmesHousing,
  vip,
  randomForest,
  finetune,
  lightgbm,
  pdp
)

# load the housing data and clean names
ames_data <- make_ames() %>%
  janitor::clean_names()

# split into training and testing datasets. Stratify by Sale price
ames_split <- rsample::initial_split(
  ames_data,
  prop = 0.8,
  strata = sale_price
)

# CREATE TRAINING AND TESTING OBJECTS FROM THE SPLIT OBJECT
ames_train <- training(ames_split)
ames_test <- testing(ames_split)

# CREATE RESAMPLES TO CHOOSE AND COMPARE MODELS
set.seed(234)
ames_folds <- vfold_cv(ames_train, strata = sale_price, v = 5)

# BASE RECIPE
base_rec <-
  recipe(sale_price ~ ., data = ames_train) %>%
  step_log(sale_price, base = 10) %>%
  step_YeoJohnson(lot_area, gr_liv_area) %>%
  step_other(neighborhood, threshold = .1) %>%
  step_dummy(all_nominal()) %>%
  step_zv(all_predictors()) %>%
  step_ns(latitude, longitude, deg_free = tune())

# REDUCED RECIPE WITH LOG TRANSFORMATIONS
bt_rec <-
  recipe(sale_price ~ overall_qual + gr_liv_area + bsmt_qual + garage_cars + garage_area + year_built + total_bsmt_sf + exter_qual + first_flr_sf + kitchen_qual, data = ames_train) %>%
  step_log(sale_price, base = 10) %>%
  step_log(gr_liv_area, base = 10) %>%
  step_dummy(all_nominal_predictors())

# DEFINE A BAGGED RANDOM FOREST MODEL
bagged_spec <- bag_tree(
  tree_depth = tune(),
  min_n = tune(),
  cost_complexity = tune()
) %>%
  set_mode("regression") %>%
  set_engine("rpart", times = 25L)

# DEFINE A RANGER RANDOM FOREST MODEL
rf_spec <-
  rand_forest(
    mtry = tune(),
    min_n = tune(),
    trees = 500
  ) %>%
  set_engine("ranger") %>%
  set_mode("regression")

# DEFINE AN XGBOOST MODEL
xgb_spec <- boost_tree(
  trees = 500,
  tree_depth = tune(),
  min_n = tune(),
  loss_reduction = tune(),
  sample_size = tune(),
  mtry = tune(),
  learn_rate = tune()
) %>%
  set_engine("xgboost", importance = TRUE) %>%
  set_mode("regression")

# DEFINE A BOOSTED TREE ENSEMBLE MODEL
bt_spec <-
  boost_tree(
    learn_rate = tune(),
    stop_iter = tune(),
    trees = 500
  ) %>%
  set_engine("lightgbm", num_leaves = tune()) %>%
  set_mode("regression")

wflw_set <-
  workflow_set(
    preproc = list(base = base_rec, bt = bt_rec),
    models = list(xgb = xgb_spec, bagged = bagged_spec, rf = rf_spec, bt = bt_spec)
  )

# UPDATE MTRY PARAMETER FOR THE BASE XGBOOST
base_xgb_param <- wflw_set %>%
  extract_workflow(
    id = "base_xgb"
  ) %>%
  hardhat::extract_parameter_set_dials() %>%
  update(mtry = mtry(c(1, 277)))

# UPDATE MTRY PARAMETER FOR THE BASE RF MODEL
base_rf_param <- wflw_set %>%
  extract_workflow(
    id = "base_rf"
  ) %>%
  hardhat::extract_parameter_set_dials() %>%
  update(mtry = mtry(c(1, 277)))

# UPDATE MTRY PARAMETER FOR THE BT XGBOOST MODEL
bt_xgb_param <- wflw_set %>%
  extract_workflow(
    id = "bt_xgb"
  ) %>%
  hardhat::extract_parameter_set_dials() %>%
  update(mtry = mtry(c(1, 21)))

# UPDATE MTRY PARAMETER FOR THE BT RF MODEL
bt_rf_param <- wflw_set %>%
  extract_workflow(
    id = "bt_rf"
  ) %>%
  hardhat::extract_parameter_set_dials() %>%
  update(mtry = mtry(c(1, 21)))

# UPDATE THE WORKFLOW SET WITH THE NEW PARAMETERS
wf_set_tune_list_finalize <- wflw_set %>%
  option_add(param_info = base_xgb_param, id = "base_xgb") %>%
  option_add(param_info = base_rf_param, id = "base_rf") %>%
  option_add(param_info = bt_xgb_param, id = "bt_xgb") %>%
  option_add(param_info = bt_rf_param, id = "bt_rf")

# SPECIFY THE RACING CONTROL OPTIONS
race_ctrl <-
  control_race(
    save_pred = TRUE,
    parallel_over = "everything",
    save_workflow = TRUE
  )

# DETECT THE NUMBER OF CORES
cores <- parallel::detectCores(logical = FALSE)

# CREATE A SET OF COPIES OF R RUNNING IN PARALLEL AND COMMUNICATING VIA SOCKETS
cl <- makePSOCKcluster(cores)

# REGISTER THE PARALLEL BACKEND
doParallel::registerDoParallel(cl)

# APPLY RACE ANOVA TUNING TO EACH WORKFLOW IN THE WORKFLOW SET
tictoc::tic()
race_results <- wf_set_tune_list_finalize %>%
  workflow_map(
    "tune_race_anova",
    seed = 123,
    resamples = ames_folds,
    grid = 25,
    control = race_ctrl,
    verbose = TRUE
  )
tictoc::toc()

# STOP THE PARALLEL BACKEND
parallel::stopCluster(cl)

# FINALIZE THE BEST WORKFLOW
best_results <- 
  race_results %>% 
  extract_workflow_set_result("base_xgb") %>% 
  select_best(metric = "rmse")

# EXTRACT THE XGB WORKFLOW FROM THE WORKFLOW SET, FINALIZE IT WITH THE
# NUMERICALLY BEST PARAMETERS, THEN FIT ON THE TRAINING SET AND EVALUATE
# ON THE TEST SET
test_results <- 
  race_results %>% 
  extract_workflow("base_xgb") %>% 
  finalize_workflow(best_results) %>% 
  last_fit(split = ames_split, metrics = metric_set(rmse))

Solution

  • "but this is not possible with a tunable recipe which is what I'm using."

    That's the issue.

    You should finalize the tuning parameters (by choosing their best values). Once you have a complete recipe, you can get the features. You can use fit_best() to do that.

    library(tidymodels)
    tidymodels_prefer()
    
    data(Chicago)
    
    Chicago <- 
      Chicago %>% 
      select(ridership, date, all_of(stations))
    
    split <- initial_time_split(Chicago, prop = .995)
    chi_train <- training(split)
    chi_test  <- testing(split)
    
    time_val_split <-
      sliding_period(
        chi_train,
        date,
        "month",
        lookback = 181,
        assess_stop = 1
      )
    
    
    date_and_holidays_and_pca <-
      recipe(ridership ~ ., data = chi_train) %>%
      step_rm(date) %>%
      step_pca(!!stations, num_comp = tune())
    
    
    lm_spec <- linear_reg() %>% set_engine("lm")
    knn_spec <- nearest_neighbor(neighbors = tune()) %>% set_mode("regression")
    
    
    # use this for fit_best()
    ctrl_g <- control_grid(save_workflow = TRUE)
    
    chi_set <-
      workflow_set(
        preproc = list(pca = date_and_holidays_and_pca),
        models = list(lm = lm_spec, knn = knn_spec),
        cross = TRUE
      ) %>% 
      option_add(control = ctrl_g) %>% 
      workflow_map(resamples = time_val_split, seed = 1)
    
    # Finalized fit for each model/preproc combo
    best_workflow_fits <- map(chi_set$result, fit_best)
    
    # Get the predictor training data for the finalized model 
    feature_sets <- 
      map(best_workflow_fits, 
          ~ extract_recipe(.x) %>% bake(new_data = chi_train))
    

    Created on 2023-11-26 with reprex v2.0.2
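    Building on that, here is a minimal sketch (not part of the original answer) of how the same idea could connect back to the Ames "base_xgb" workflow from the question, assuming race_results, best_results, and ames_train are in scope. The finalized workflow is re-fit on the training data, and the underlying xgb.Booster plus the baked predictor matrix are handed to SHAPforxgboost's shap.prep() and shap.plot.summary().

    library(SHAPforxgboost)

    # Finalize the tuned xgboost workflow and fit it on the full training set
    final_xgb_fit <- race_results %>%
      extract_workflow("base_xgb") %>%
      finalize_workflow(best_results) %>%
      fit(data = ames_train)

    # Underlying xgb.Booster and the baked predictor matrix it was trained on
    xgb_booster <- extract_fit_engine(final_xgb_fit)
    x_train <- extract_recipe(final_xgb_fit) %>%
      bake(new_data = ames_train, all_predictors(), composition = "matrix")

    # SHAP values in long format, then the summary plot
    shap_long <- shap.prep(xgb_model = xgb_booster, X_train = x_train)
    shap.plot.summary(shap_long)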