Search code examples
rworkflowtidymodels

Plotting Backtested Workflow_Set data


I'm trying to view how this model performs against prior actual close. I'm using a workflow_set model and have no issues extracting the forecast. I've supplied a reproducible example below. I'd like to be able to plot actual, with a backtested trend line along with the forecast.

tickers <- "TSLA"

first.date <- Sys.Date() - 3000
last.date <- Sys.Date()
freq.data <- "daily"


stocks <- BatchGetSymbols::BatchGetSymbols(tickers = tickers,
                                           first.date = first.date,
                                           last.date = last.date,
                                           freq.data = freq.data ,
                                           do.cache = FALSE,
                                           thresh.bad.data = 0) 

stocks <- stocks %>% as.data.frame() %>% select(Date = df.tickers.ref.date, Close = df.tickers.price.close) 

time_val_split <- 
  stocks %>%
  sliding_period(
    Date,
    period = "day",
    every = 52)

data_extended <- stocks %>%
  future_frame(
    .length_out = 60,
    .bind_data  = TRUE
  ) %>%
  ungroup()

train_tbl <- data_extended %>% drop_na() 

future_tbl <- data_extended %>% filter(is.na(Close))

base_rec <- recipe(Close ~ Date, train_tbl) %>%
  step_timeseries_signature(Date) %>%
  step_rm(matches("(.xts$)|(.iso$)|(.lbl)|(hour)|(minute)|(second)|(am.pm)|(mweek)|(qday)|(week2)|(week3)|(week4)")) %>%
  step_dummy(all_nominal(), one_hot = TRUE) %>%
  step_normalize(all_numeric_predictors()) %>%
  step_scale(all_numeric_predictors()) %>%
  step_rm(Date)
 
cubist_spec <- 
  cubist_rules(committees = tune(),
               neighbors = tune()) %>% 
  set_engine("Cubist") 

rf_spec <- 
  rand_forest(mtry = tune(), 
              min_n = tune(),
              trees = 1000) %>% 
  set_engine("ranger") %>% 
  set_mode("regression")

base <- 
  workflow_set(
    preproc = list(base_date = base_rec),
    models = list(
      cubist_base = cubist_spec,
      cart_base = cart_spec
    ))

all_workflows <- 
  bind_rows(
    base
  )

cores <- parallel::detectCores(logical = FALSE)
clusters <- parallel::makePSOCKcluster(cores)
doParallel::registerDoParallel(clusters)

wflwset_tune_results <- 
  all_workflows %>%
  workflow_map(
    fn = "tune_race_anova",
    seed = 1,
    resamples = time_val_split,
    grid = 2,
    verbose = TRUE)

doParallel::stopImplicitCluster()

best_for_each_mod <- wflwset_tune_results %>%
  rank_results(select_best = TRUE) %>% 
  filter(.metric == "rmse") %>%
  select(wflow_id, .config, mean, preprocessor, model)

b_mod <- best_for_each_mod %>%
  arrange(mean) %>%
  head(1) %>%
  select(wflow_id) %>% as.character()


best_param <- wflwset_tune_results %>% extract_workflow_set_result(id = b_mod) %>% select_best(metric = "rmse") 

# Finalize model with best param
best_finalized <- wflwset_tune_results %>%
  extract_workflow(b_mod) %>%
  finalize_workflow(best_param) %>%
  fit(train_tbl)

At this point the model has been trained but I can't seem to figure out how to run it against prior actuals. My goal is to bind the backed results with the predictions below.

prediction_tbl <- best_finalized %>%
  predict(new_data = future_tbl) %>%
  bind_cols(future_tbl) %>%
  select(.pred, Date) %>%
  mutate(type = "prediction") %>%
  rename(Close = .pred)

 train_tbl %>% mutate(type = "actual") %>% rbind(prediction_tbl) %>%
  ggplot(aes(Date, Close, color = type)) +
  geom_line(size = 2)

Solution

  • Based on your comment, I'd recommend using pivot_longer() after binding the future_tbl to your predictions. This lets you keep everything in one pipeline, rather than having to create two separate dataframes then bind them together. Here's an example plotting the prediction & actual values against mpg. Hope this helps!

    library(tidymodels)
    #> Registered S3 method overwritten by 'tune':
    #>   method                   from   
    #>   required_pkgs.model_spec parsnip
    
    # split data
    set.seed(123)
    mtcars <- as_tibble(mtcars)
    cars_split <- initial_split(mtcars)
    cars_train <- training(cars_split)
    cars_test <- testing(cars_split)
    
    # plot truth & prediction against another variable
    workflow() %>%
      add_model(linear_reg() %>% set_engine("lm")) %>%
      add_recipe(recipe(qsec ~ ., data = cars_train)) %>%
      fit(cars_train) %>%
      predict(cars_test) %>%
      bind_cols(cars_test) %>%
      pivot_longer(cols = c(.pred, qsec),
                   names_to = "comparison",
                   values_to = "value") %>%
      ggplot(aes(x = mpg,
                 y = value,
                 color = comparison)) +
      geom_point(alpha = 0.75)
    

    Created on 2021-11-18 by the reprex package (v2.0.1)