Tags: r, xgboost, tidymodels, shapley

Getting SHAP values for XGB model with workflowsets in Tidymodels


I'm trying to pass my model and the feature matrix to SHAPforxgboost, but I'm running into issues because I'm using a tunable recipe and a tunable model.

Most examples, including this one showing how tidymodels interfaces with SHAPforxgboost, have a step that requires prep() and bake(), but this is not possible with a tunable recipe, which is what I'm using.

Any guidance that can be shared would be greatly appreciated.
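For reference, here is a minimal sketch (a toy recipe on mtcars, not my actual data) of what I mean: a recipe that still contains a tune() placeholder can't be prepped, so the usual prep()/bake() route to a feature matrix isn't available until the parameter is finalized.

library(tidymodels)

# Toy example: deg_free is still a tune() placeholder
toy_rec <- recipe(mpg ~ ., data = mtcars) %>%
  step_ns(disp, deg_free = tune())

# prep(toy_rec) errors at this point because the spline degrees of freedom
# are not yet a concrete value, so bake() is also unavailable.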

# LOAD PACKAGES
if (!require("pacman")) install.packages("pacman")

pacman::p_load(
  tidymodels,
  tidyverse,
  doParallel,
  janitor,
  AmesHousing,
  vip,
  randomForest,
  finetune,
  lightgbm,
  pdp
)

# load the housing data and clean names
ames_data <- make_ames() %>%
  janitor::clean_names()

# split into training and testing datasets. Stratify by Sale price
ames_split <- rsample::initial_split(
  ames_data,
  prop = 0.8,
  strata = sale_price
)

# CREATE TRAINING AND TESTING OBJECTS FROM THE SPLIT OBJECT
ames_train <- training(ames_split)
ames_test <- testing(ames_split)

# CREATE RESAMPLES TO CHOOSE AND COMPARE MODELS
set.seed(234)
ames_folds <- vfold_cv(ames_train, strata = sale_price, v = 5)

# BASE RECIPE
base_rec <-
  recipe(sale_price ~ ., data = ames_train) %>%
  step_log(sale_price, base = 10) %>%
  step_YeoJohnson(lot_area, gr_liv_area) %>%
  step_other(neighborhood, threshold = .1) %>%
  step_dummy(all_nominal()) %>%
  step_zv(all_predictors()) %>%
  step_ns(latitude, longitude, deg_free = tune())

# REDUCED RECIPE WITH LOG TRANSFORMATIONS
bt_rec <-
  recipe(sale_price ~ overall_qual + gr_liv_area + bsmt_qual + garage_cars + garage_area + year_built + total_bsmt_sf + exter_qual + first_flr_sf + kitchen_qual, data = ames_train) %>%
  step_log(sale_price, base = 10) %>%
  step_log(gr_liv_area, base = 10) %>%
  step_dummy(all_nominal_predictors())

# DEFINE A BAGGED RANDOM FOREST MODEL
bagged_spec <- bag_tree(
  tree_depth = tune(),
  min_n = tune(),
  cost_complexity = tune()
) %>%
  set_mode("regression") %>%
  set_engine("rpart", times = 25L)

# DEFINE A RANGER RANDOM FOREST MODEL
rf_spec <-
  rand_forest(
    mtry = tune(),
    min_n = tune(),
    trees = 500
  ) %>%
  set_engine("ranger") %>%
  set_mode("regression")

# DEFINE AN XGBOOST MODEL
xgb_spec <- boost_tree(
  trees = 500,
  tree_depth = tune(),
  min_n = tune(),
  loss_reduction = tune(),
  sample_size = tune(),
  mtry = tune(),
  learn_rate = tune()
) %>%
  set_engine("xgboost", importance = TRUE) %>%
  set_mode("regression")

# DEFINE A BOOSTED TREE ENSEMBLE MODEL
bt_spec <-
  boost_tree(
    learn_rate = tune(),
    stop_iter = tune(),
    trees = 500
  ) %>%
  set_engine("lightgbm", num_leaves = tune()) %>%
  set_mode("regression")

wflw_set <-
  workflow_set(
    preproc = list(base = base_rec, bt = bt_rec),
    models = list(xgb = xgb_spec, bagged = bagged_spec, rf = rf_spec, bt = bt_spec)
  )

# UPDATE MTRY PARAMETER FOR THE BASE XGBOOST
base_xgb_param <- wflw_set %>%
  extract_workflow(
    id = "base_xgb"
  ) %>%
  hardhat::extract_parameter_set_dials() %>%
  update(mtry = mtry(c(1, 277)))

# UPDATE MTRY PARAMETER FOR THE BASE RF MODEL
base_rf_param <- wflw_set %>%
  extract_workflow(
    id = "base_rf"
  ) %>%
  hardhat::extract_parameter_set_dials() %>%
  update(mtry = mtry(c(1, 277)))

# UPDATE MTRY PARAMETER FOR THE BT XGBOOST MODEL
bt_xgb_param <- wflw_set %>%
  extract_workflow(
    id = "bt_xgb"
  ) %>%
  hardhat::extract_parameter_set_dials() %>%
  update(mtry = mtry(c(1, 21)))

# UPDATE MTRY PARAMETER FOR THE BT RF MODEL
bt_rf_param <- wflw_set %>%
  extract_workflow(
    id = "bt_rf"
  ) %>%
  hardhat::extract_parameter_set_dials() %>%
  update(mtry = mtry(c(1, 21)))

# UPDATE THE WORKFLOW SET WITH THE NEW PARAMETERS
wf_set_tune_list_finalize <- wflw_set %>%
  option_add(param_info = base_xgb_param, id = "base_xgb") %>%
  option_add(param_info = base_rf_param, id = "base_rf") %>%
  option_add(param_info = bt_xgb_param, id = "bt_xgb") %>%
  option_add(param_info = bt_rf_param, id = "bt_rf")

# SPECIFY THE RACING CONTROL OPTIONS
race_ctrl <-
  control_race(
    save_pred = TRUE,
    parallel_over = "everything",
    save_workflow = TRUE
  )

# DETECT THE NUMBER OF CORES
cores <- parallel::detectCores(logical = FALSE)

# CREATE A SET OF COPIES OF R RUNNING IN PARALLEL AND COMMUNICATING VIA SOCKETS
cl <- makePSOCKcluster(cores)

# REGISTER THE PARALLEL BACKEND
doParallel::registerDoParallel(cl)

# APPLY RACE ANOVA TUNING TO EACH WORKFLOW IN THE WORKFLOW SET
tictoc::tic()
race_results <- wf_set_tune_list_finalize %>%
  workflow_map(
    "tune_race_anova",
    seed = 123,
    resamples = ames_folds,
    grid = 25,
    control = race_ctrl,
    verbose = TRUE
  )
tictoc::toc()

# STOP THE PARALLEL BACKEND
parallel::stopCluster(cl)

# FINALIZE THE BEST WORKFLOW
best_results <- 
  race_results %>% 
  extract_workflow_set_result("base_xgb") %>% 
  select_best(metric = "rmse")

# EXTRACT THE XGB WORKFLOW FROM THE WORKFLOW SET, FINALIZE IT WITH THE
# NUMERICALLY BEST PARAMETERS, THEN FIT ON THE TRAINING SET AND EVALUATE
# ON THE TEST SET
test_results <- 
  race_results %>% 
  extract_workflow("base_xgb") %>% 
  finalize_workflow(best_results) %>% 
  last_fit(split = ames_split, metrics = metric_set(rmse))

Solution

  • "but this is not possible with a tunable recipe which is what I'm using."

    That's the issue.

    You should finalize the tuning parameters (by choosing their best values). Once you have a complete recipe, you can get the features. You can use fit_best() to do that.

    library(tidymodels)
    tidymodels_prefer()
    
    data(Chicago)
    
    Chicago <- 
      Chicago %>% 
      select(ridership, date, all_of(stations))
    
    split <- initial_time_split(Chicago, prop = .995)
    chi_train <- training(split)
    chi_test  <- testing(split)
    
    time_val_split <-
      sliding_period(
        chi_train,
        date,
        "month",
        lookback = 181,
        assess_stop = 1
      )
    
    
    date_and_holidays_and_pca <-
      recipe(ridership ~ ., data = chi_train) %>%
      step_rm(date) %>%
      step_pca(!!stations, num_comp = tune())
    
    
    lm_spec <- linear_reg() %>% set_engine("lm")
    knn_spec <- nearest_neighbor(neighbors = tune()) %>% set_mode("regression")
    
    
    # use this for fit_best()
    ctrl_g <- control_grid(save_workflow = TRUE)
    
    chi_set <-
      workflow_set(
        preproc = list(pca = date_and_holidays_and_pca),
        models = list(lm = lm_spec, knn = knn_spec),
        cross = TRUE
      ) %>% 
      option_add(control = ctrl_g) %>% 
      workflow_map(resamples = time_val_split, seed = 1)
    
    # Finalized fit for each model/preproc combo
    best_workflow_fits <- map(chi_set$result, fit_best)
    
    # Get the predictor training data for the finalized model 
    feature_sets <- 
      map(best_workflow_fits, 
          ~ extract_recipe(.x) %>% bake(new_data = chi_train))
    

    Created on 2023-11-26 with reprex v2.0.2
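    Building on that, here is a minimal sketch (not part of the original answer) of how the same idea could connect back to the Ames "base_xgb" workflow from the question, assuming race_results, best_results, and ames_train are in scope. The finalized workflow is re-fit on the training data, and the underlying xgb.Booster plus the baked predictor matrix are handed to SHAPforxgboost's shap.prep() and shap.plot.summary().

    library(SHAPforxgboost)

    # Finalize the tuned xgboost workflow and fit it on the full training set
    final_xgb_fit <- race_results %>%
      extract_workflow("base_xgb") %>%
      finalize_workflow(best_results) %>%
      fit(data = ames_train)

    # Underlying xgb.Booster and the baked predictor matrix it was trained on
    xgb_booster <- extract_fit_engine(final_xgb_fit)
    x_train <- extract_recipe(final_xgb_fit) %>%
      bake(new_data = ames_train, all_predictors(), composition = "matrix")

    # SHAP values in long format, then the summary plot
    shap_long <- shap.prep(xgb_model = xgb_booster, X_train = x_train)
    shap.plot.summary(shap_long)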