Overview:
I am following a tutorial (see below) to find the best fit models from bagged trees, random forests, boosted trees and general linear models.
Tutorial (see examples below)
https://bcullen.rbind.io/post/2020-06-02-tidymodels-decision-tree-learning-in-r/
Issue
In this case, I would like to explore the data further and visualise the most important predictors (see diagram below) for my data.
My data frame is called FID and the predictors in the bagged tree model involve:
The dependent variable is Frequency (numeric)
When I try to run the plot to visualise the most important predictor, I keep on getting this error message:-
Error Message
Error: Can't subset columns that don't exist.
x Column `.extracts` doesn't exist.
Run `rlang::last_error()` to see where the error occurred.
Called from: rlang:::signal_abort(x)
If anyone has any advice on how to fix the error message, I would be deeply appreciative.
Many thanks in advance
Examples of how to produce the plot from the R-code in the tutorial
Visualise the model
Plot to show the most important predictors
My R-code
###########################################################
#split this single dataset into two: a training set and a testing set
data_split <- initial_split(FID)
# Create data frames for the two sets:
train_data <- training(data_split)
test_data <- testing(data_split)
# resample the data with 10-fold cross-validation (10-fold by default)
cv <- vfold_cv(train_data)
###########################################################
##Produce the recipe
rec <- recipe(Frequency_Blue ~ ., data = FID) %>%
step_nzv(all_predictors(), freq_cut = 0, unique_cut = 0) %>% # remove variables with zero variances
step_novel(all_nominal()) %>% # prepares test data to handle previously unseen factor levels
step_medianimpute(all_numeric(), -all_outcomes(), -has_role("id vars")) %>% # replaces missing numeric observations with the median
step_dummy(all_nominal(), -has_role("id vars")) # dummy codes categorical variables
###################################################################################
#####Fit the Bagged Tree Model
mod_bag <- bag_tree() %>%
set_mode("regression") %>%
set_engine("rpart", times = 10) #10 bootstrap resamples
##Create workflow
wflow_bag <- workflow() %>%
add_recipe(rec) %>%
add_model(mod_bag)
##Fit the model
plan(multisession)
fit_bag <- fit_resamples(
wflow_bag,
cv,
metrics = metric_set(rmse, rsq),
control = control_resamples(save_pred = TRUE)
)
##########################################################
##Visualise the model
##Open a plotting window
dev.new()
# extract roots
bag_roots <- function(x){
x %>%
dplyr::select(.extracts) %>%
unnest(cols = c(.extracts)) %>%
dplyr::mutate(models = map(.extracts,
~.x$FID)) %>%
dplyr::select(-.extracts) %>%
unnest(cols = c(fit_bag)) %>%
mutate(root = map_chr(fit_bag,
~as.character(.x$fit$frame[1, 1]))) %>%
dplyr::select(root)
}
# plot the bagged tree model
bag_roots(fit_bag) %>%
ggplot(mapping = aes(x = fct_rev(fct_infreq(root)))) +
geom_bar() +
coord_flip() +
labs(x = "root", y = "count")
#Error Message
Error: Can't subset columns that don't exist.
x Column `.extracts` doesn't exist.
Run `rlang::last_error()` to see where the error occurred.
Called from: rlang:::signal_abort(x)
Data frame - FID
structure(list(Year = c(2015, 2015, 2015, 2015, 2015, 2015, 2015,
2015, 2015, 2015, 2015, 2015, 2016, 2016, 2016, 2016, 2016, 2016,
2016, 2016, 2016, 2016, 2016, 2016, 2017, 2017, 2017, 2017, 2017,
2017, 2017, 2017, 2017, 2017, 2017, 2017), Month = structure(c(1L,
2L, 3L, 4L, 5L, 6L, 7L, 8L, 9L, 10L, 11L, 12L, 1L, 2L, 3L, 4L,
5L, 6L, 7L, 8L, 9L, 10L, 11L, 12L, 1L, 2L, 3L, 4L, 5L, 6L, 7L,
8L, 9L, 10L, 11L, 12L), .Label = c("January", "February", "March",
"April", "May", "June", "July", "August", "September", "October",
"November", "December"), class = "factor"), Frequency = c(36,
28, 39, 46, 5, 0, 0, 22, 10, 15, 8, 33, 33, 29, 31, 23, 8, 9,
7, 40, 41, 41, 30, 30, 44, 37, 41, 42, 20, 0, 7, 27, 35, 27,
43, 38), Days = c(31, 28, 31, 30, 6, 0, 0, 29, 15,
29, 29, 31, 31, 29, 30, 30, 7, 0, 7, 30, 30, 31, 30, 27, 31,
28, 30, 30, 21, 0, 7, 26, 29, 27, 29, 29)), row.names = c(NA,
-36L), class = "data.frame")
There are a couple of things you need to adjust here:
extract
what you need during fit_resamples()
bag_roots()
function.It will end up like this:
library(tidymodels)
library(baguette)
FID <- structure(list(Year = c(2015, 2015, 2015, 2015, 2015, 2015, 2015,
2015, 2015, 2015, 2015, 2015, 2016, 2016, 2016, 2016, 2016, 2016,
2016, 2016, 2016, 2016, 2016, 2016, 2017, 2017, 2017, 2017, 2017,
2017, 2017, 2017, 2017, 2017, 2017, 2017),
Month = structure(c(1L,
2L, 3L, 4L, 5L, 6L, 7L, 8L, 9L, 10L, 11L, 12L, 1L, 2L, 3L, 4L,
5L, 6L, 7L, 8L, 9L, 10L, 11L, 12L, 1L, 2L, 3L, 4L, 5L, 6L, 7L,
8L, 9L, 10L, 11L, 12L), .Label = c("January", "February", "March",
"April", "May", "June", "July", "August", "September", "October",
"November", "December"), class = "factor"),
Frequency = c(36,
28, 39, 46, 5, 0, 0, 22, 10, 15, 8, 33, 33, 29, 31, 23, 8, 9,
7, 40, 41, 41, 30, 30, 44, 37, 41, 42, 20, 0, 7, 27, 35, 27,
43, 38), Days = c(31, 28, 31, 30, 6, 0, 0, 29, 15,
29, 29, 31, 31, 29, 30, 30, 7, 0, 7, 30, 30, 31, 30, 27, 31,
28, 30, 30, 21, 0, 7, 26, 29, 27, 29, 29)), row.names = c(NA,
-36L),
class = "data.frame")
data_split <- initial_split(FID)
train_data <- training(data_split)
test_data <- testing(data_split)
cv <- vfold_cv(train_data, v = 3)
rec <- recipe(Frequency ~ ., data = FID) %>%
step_nzv(all_predictors(), freq_cut = 0, unique_cut = 0) %>% # remove variables with zero variances
step_novel(all_nominal()) %>% # prepares test data to handle previously unseen factor levels
step_medianimpute(all_numeric(), -all_outcomes(), -has_role("id vars")) %>% # replaces missing numeric observations with the median
step_dummy(all_nominal()) # dummy codes categorical variables
mod_bag <- bag_tree() %>%
set_mode("regression") %>%
set_engine("rpart", times = 10) #10 bootstrap resamples
wflow_bag <- workflow() %>%
add_recipe(rec) %>%
add_model(mod_bag)
fit_bag <- fit_resamples(
wflow_bag,
cv,
metrics = metric_set(rmse, rsq),
control = control_resamples(save_pred = TRUE,
extract = function(x) extract_model(x))
)
#>
#> Attaching package: 'rlang'
#> The following objects are masked from 'package:purrr':
#>
#> %@%, as_function, flatten, flatten_chr, flatten_dbl, flatten_int,
#> flatten_lgl, flatten_raw, invoke, list_along, modify, prepend,
#> splice
#>
#> Attaching package: 'vctrs'
#> The following object is masked from 'package:tibble':
#>
#> data_frame
#> The following object is masked from 'package:dplyr':
#>
#> data_frame
#>
#> Attaching package: 'rpart'
#> The following object is masked from 'package:dials':
#>
#> prune
bag_roots <- function(x){
x %>%
dplyr::select(.extracts) %>%
unnest(cols = c(.extracts)) %>%
dplyr::mutate(models = map(.extracts,
~.x$model_df)) %>%
dplyr::select(-.extracts) %>%
unnest(cols = c(models)) %>%
mutate(root = map_chr(model,
~as.character(.x$fit$frame[1, 1]))) %>%
dplyr::select(root)
}
# plot the bagged tree model
library(forcats)
bag_roots(fit_bag) %>%
ggplot(mapping = aes(x = fct_rev(fct_infreq(root)))) +
geom_bar() +
coord_flip() +
labs(x = "root", y = "count")
Created on 2020-11-20 by the reprex package (v0.3.0.9001)
Not super exciting, but hopefully your real, larger dataset shows more interesting results!