I have a function that simply creates a couple of recipe objects. The issue is that inside the function I have to rename the columns of the data.frame/tibble that is passed in so that I can build the recipes.
I don't want to do this for obvious reasons, the main one being that the column names need to stay as they are in the data.frame itself; otherwise, steps further down the line are not going to work.
Simple example:
library(tidyverse)
# Example data: one simulated visit count per date, with dates spaced 7 days
# apart between 2021-01-01 and 2021-10-15 (42 rows).
data_tbl <- tibble(
  visit_date = seq(
    from = as.Date("2021-01-01"),
    to = as.Date("2021-10-15"),
    by = 7
  ),
  # The number of draws is taken from the date column instead of being
  # hard-coded, so the two columns stay the same length if the date range is
  # ever changed.
  visits = rnbinom(
    n = length(visit_date),
    size = 100,
    mu = 66
  )
)
# Build a small list of recipe objects for a time series data.frame/tibble.
#
# Arguments:
#   .data      A data.frame or tibble.
#   .date_col  Unquoted name of the date column in `.data`.
#   .pred_col  Unquoted name of the value column in `.data`.
#
# Returns a named list of two recipes:
#   rec_base   `value_col ~ .` on the renamed data.
#   rec_date   rec_base plus a timetk time series signature of `date_col`,
#              with the "index.num" and "date_col_year" features normalized.
#
# Errors (via stop()) when `.data` is not a data.frame or when either column
# argument is not supplied.
#
# NOTE: the two selected columns are renamed to `date_col` and `value_col`.
# The rename exists only so that the formula and the step selectors below can
# refer to fixed names; the resulting recipes therefore expect those names.
ts_auto_recipe <- function(.data, .date_col, .pred_col) {

  # * Checks ----
  # Validate before touching the data so bad input fails with a clear message.
  if (!is.data.frame(.data)) {
    stop(call. = FALSE, "You must supply a data.frame/tibble.")
  }
  if (missing(.date_col)) {
    stop(call. = FALSE, "The (.date_col) must be supplied.")
  }
  if (missing(.pred_col)) {
    stop(call. = FALSE, "The (.pred_col) must be supplied.")
  }

  # * Data ----
  # Embrace the function arguments directly: `{{ }}` is meant for arguments,
  # not for local variables that already hold a quosure. select() moves the
  # two columns to the front and renames them in one step.
  data_tbl <- dplyr::select(
    tibble::as_tibble(.data),
    date_col = {{ .date_col }},
    value_col = {{ .pred_col }},
    dplyr::everything()
  )

  # * Recipe Objects ----
  # ** Base recipe ----
  # The value column is the outcome; the date column (and anything else) is a
  # predictor.
  rec_base_obj <- recipes::recipe(
    formula = value_col ~ .,
    data = data_tbl
  )

  # * Add Steps ----
  # ** ts signature and normalize ----
  # Steps are added with plain function calls so the function does not depend
  # on `%>%` being attached by the caller.
  rec_date_obj <- timetk::step_timeseries_signature(rec_base_obj, date_col)
  rec_date_obj <- recipes::step_normalize(
    rec_date_obj,
    dplyr::contains("index.num"),
    dplyr::contains("date_col_year")
  )

  # * Recipe List ----
  list(
    rec_base = rec_base_obj,
    rec_date = rec_date_obj
  )
}
rec_objs <- ts_auto_recipe(data_tbl, visit_date, visits)
The reason I am doing this is that I cannot use dynamic names inside the recipe function itself, so something like `rlang::sym(names(data_tbl)[[1]])` will not work, nor would something like `data_tbl[[1]]`. I was thinking of using something like `step_rename()`, but that would require knowing the name ahead of time, and it cannot be a variable inside the recipe step. However, you can pass a variable to something like `timetk::step_timeseries_signature()`.
The only other thing I could think of was to force users to use specific column names, like the `ds` and `y` columns required by the Facebook Prophet R library.
I also noticed that I get some funky output in the terminal when I print `rec_objs`. I get the following:
> rec_objs
$rec_base
Recipe
Inputs:
role #variables
outcome 1
predictor 1
$rec_date
Recipe
Inputs:
role #variables
outcome 1
predictor 1
Operations:
Timeseries signature features from date_col
Centering and scaling for dplyr::contains("ÿþindex.numÿþ"), dplyr::contains("ÿþdate_col...
Yet when I do:
> rec_objs[[2]]
Recipe
Inputs:
role #variables
outcome 1
predictor 1
Operations:
Timeseries signature features from date_col
Centering and scaling for dplyr::contains("index.num"), dplyr::contains("date_col_year")
It does not happen.
Thank you,
I think I have found a solution to this problem, see the following custom function:
# Build a list of progressively richer recipe objects for a time series
# data.frame/tibble, keeping the caller's original column names.
#
# Arguments:
#   .data                    A data.frame or tibble.
#   .date_col                Unquoted name of the date column in `.data`.
#   .pred_col                Unquoted name of the value column in `.data`.
#   .step_ts_sig             Add the timetk time series signature of the date
#                            column and normalize its "index.num" and "year"
#                            features?
#   .step_ts_rm_misc         Remove signature columns matching
#                            iso/xts/hour/min/sec/am.pm?
#   .step_ts_dummy           One-hot encode all nominal predictors?
#   .step_ts_fourier         Add Fourier features of the date column?
#   .step_ts_fourier_period  `period` passed to timetk::step_fourier().
#   .K                       `K` passed to timetk::step_fourier().
#   .step_ts_yeo             Apply a Yeo-Johnson transform to the value column?
#   .step_ts_nzv             Remove near-zero-variance predictors?
#
# Returns a named list of four recipes, each building on the previous one:
#   rec_base, rec_date, rec_date_fourier, rec_date_fourier_nzv.
# When a step is switched off, the affected recipe is simply carried forward
# from the previous stage, so all four elements are always present.
#
# Errors (via stop()) when `.data` is not a data.frame or when either column
# argument is not supplied.
ts_auto_recipe_b <- function(.data,
                             .date_col,
                             .pred_col,
                             .step_ts_sig = TRUE,
                             .step_ts_rm_misc = TRUE,
                             .step_ts_dummy = TRUE,
                             .step_ts_fourier = TRUE,
                             .step_ts_fourier_period = 1,
                             .K = 1,
                             .step_ts_yeo = TRUE,
                             .step_ts_nzv = TRUE) {

  # * Checks ----
  # Validate before touching the data so bad input fails with a clear message.
  if (!is.data.frame(.data)) {
    stop(call. = FALSE, "You must supply a data.frame/tibble.")
  }
  if (missing(.date_col)) {
    stop(call. = FALSE, "The (.date_col) must be supplied.")
  }
  if (missing(.pred_col)) {
    stop(call. = FALSE, "The (.pred_col) must be supplied.")
  }

  # * Data ----
  # Move the date and value columns to the front without renaming them.
  data_tbl <- dplyr::select(
    tibble::as_tibble(.data),
    {{ .date_col }},
    {{ .pred_col }},
    dplyr::everything()
  )

  # * Original column names ----
  # Symbols for the caller's columns; they are injected into the steps below
  # with `!!` so each recipe stores the real column name.
  date_name <- names(data_tbl)[[1]]
  ds <- rlang::sym(date_name)
  v <- rlang::sym(names(data_tbl)[[2]])
  # The signature step names its features "<date column>_<feature>".
  year_col <- paste0(date_name, "_year")

  # * Recipe Objects ----
  # ** Base recipe ----
  # reformulate() with a symbol response also copes with non-syntactic column
  # names, which pasting a formula string together does not.
  rec_base_obj <- recipes::recipe(
    formula = stats::reformulate(".", response = v),
    data = data_tbl
  )

  # * Add Steps ----
  # Steps are added with plain function calls so the function does not depend
  # on `%>%` being attached by the caller.

  # ** ts signature and normalize ----
  rec_date_obj <- rec_base_obj
  if (.step_ts_sig) {
    rec_date_obj <- timetk::step_timeseries_signature(rec_date_obj, !!ds)
    rec_date_obj <- recipes::step_normalize(
      rec_date_obj,
      dplyr::contains("index.num"),
      dplyr::contains(!!year_col)
    )
  }

  # ** Step rm ----
  if (.step_ts_rm_misc) {
    rec_date_obj <- recipes::step_rm(
      rec_date_obj,
      dplyr::matches("(iso$)|(xts$)|(hour)|(min)|(sec)|(am.pm)")
    )
  }

  # ** Step Dummy ----
  if (.step_ts_dummy) {
    rec_date_obj <- recipes::step_dummy(
      rec_date_obj,
      recipes::all_nominal_predictors(),
      one_hot = TRUE
    )
  }

  # ** Step Fourier ----
  rec_date_fourier_obj <- rec_date_obj
  if (.step_ts_fourier) {
    rec_date_fourier_obj <- timetk::step_fourier(
      rec_date_fourier_obj,
      !!ds,
      period = .step_ts_fourier_period,
      K = .K
    )
  }

  # ** Step YeoJohnson ----
  if (.step_ts_yeo) {
    rec_date_fourier_obj <- recipes::step_YeoJohnson(
      rec_date_fourier_obj,
      !!v,
      limits = c(0, 1)
    )
  }

  # ** Step NZV ----
  rec_date_fourier_nzv_obj <- rec_date_fourier_obj
  if (.step_ts_nzv) {
    rec_date_fourier_nzv_obj <- recipes::step_nzv(
      rec_date_fourier_nzv_obj,
      recipes::all_predictors()
    )
  }

  # * Recipe List ----
  list(
    rec_base = rec_base_obj,
    rec_date = rec_date_obj,
    rec_date_fourier = rec_date_fourier_obj,
    rec_date_fourier_nzv = rec_date_fourier_nzv_obj
  )
}
Then running the following:
> rec_objs <- ts_auto_recipe_b(.data = data_tbl, .date_col = visit_date, .pred_col = visits)
> rec_objs[[1]] %>% prep() %>% juice() %>% names()
[1] "visit_date" "visits"
> rec_objs[[2]] %>% prep() %>% juice() %>% names()
[1] "visit_date" "visits" "visit_date_index.num"
[4] "visit_date_year" "visit_date_half" "visit_date_quarter"
[7] "visit_date_month" "visit_date_day" "visit_date_wday"
[10] "visit_date_mday" "visit_date_qday" "visit_date_yday"
[13] "visit_date_mweek" "visit_date_week" "visit_date_week2"
[16] "visit_date_week3" "visit_date_week4" "visit_date_mday7"
[19] "visit_date_month.lbl_01" "visit_date_month.lbl_02" "visit_date_month.lbl_03"
[22] "visit_date_month.lbl_04" "visit_date_month.lbl_05" "visit_date_month.lbl_06"
[25] "visit_date_month.lbl_07" "visit_date_month.lbl_08" "visit_date_month.lbl_09"
[28] "visit_date_month.lbl_10" "visit_date_month.lbl_11" "visit_date_month.lbl_12"
[31] "visit_date_wday.lbl_1" "visit_date_wday.lbl_2" "visit_date_wday.lbl_3"
[34] "visit_date_wday.lbl_4" "visit_date_wday.lbl_5" "visit_date_wday.lbl_6"
[37] "visit_date_wday.lbl_7"
> rec_objs[[3]] %>% prep() %>% juice() %>% names()
[1] "visit_date" "visits" "visit_date_index.num"
[4] "visit_date_year" "visit_date_half" "visit_date_quarter"
[7] "visit_date_month" "visit_date_day" "visit_date_wday"
[10] "visit_date_mday" "visit_date_qday" "visit_date_yday"
[13] "visit_date_mweek" "visit_date_week" "visit_date_week2"
[16] "visit_date_week3" "visit_date_week4" "visit_date_mday7"
[19] "visit_date_month.lbl_01" "visit_date_month.lbl_02" "visit_date_month.lbl_03"
[22] "visit_date_month.lbl_04" "visit_date_month.lbl_05" "visit_date_month.lbl_06"
[25] "visit_date_month.lbl_07" "visit_date_month.lbl_08" "visit_date_month.lbl_09"
[28] "visit_date_month.lbl_10" "visit_date_month.lbl_11" "visit_date_month.lbl_12"
[31] "visit_date_wday.lbl_1" "visit_date_wday.lbl_2" "visit_date_wday.lbl_3"
[34] "visit_date_wday.lbl_4" "visit_date_wday.lbl_5" "visit_date_wday.lbl_6"
[37] "visit_date_wday.lbl_7" "visit_date_sin1_K1" "visit_date_cos1_K1"
> rec_objs[[4]] %>% prep() %>% juice() %>% names()
[1] "visit_date" "visits" "visit_date_index.num"
[4] "visit_date_half" "visit_date_quarter" "visit_date_month"
[7] "visit_date_day" "visit_date_mday" "visit_date_qday"
[10] "visit_date_yday" "visit_date_mweek" "visit_date_week"
[13] "visit_date_week2" "visit_date_week3" "visit_date_week4"
[16] "visit_date_mday7" "visit_date_month.lbl_01" "visit_date_month.lbl_02"
[19] "visit_date_month.lbl_03" "visit_date_month.lbl_04" "visit_date_month.lbl_05"
[22] "visit_date_month.lbl_06" "visit_date_month.lbl_07" "visit_date_month.lbl_08"
[25] "visit_date_month.lbl_09" "visit_date_month.lbl_10" "visit_date_sin1_K1"
[28] "visit_date_cos1_K1"
This shows that `visit_date` and `visits` were passed to the functions as desired by making use of `!!v` for the recipes functions, whereas timetk allows for passing variables directly.