Search code examples
rtime-seriestidyversedata-manipulationtidymodels

How to handle forecast data (melt and "unmelt") generated by modeltime prediction - lost variables


below I created some fake forecast data using the tidyverse modeltime packages. I have got monthly data from 2016 and want to produce a test fc for 2020. As you can see, the data I load comes in wide format. For usage in modeltime I transform it to long data. After the modeling phase, I want to create a dataframe for the 2020 prediction values. For this purpose I need to somehow "unmelt" the data. In this process I am unfortunately losing a lot of variables. From 240 variables that I want to forecast I get only 49 in the end result. Maybe I am blind, or I do not know how to configure the modeltime functions correctly. I would really much appreciate some help. Thanks in advance!

suppressPackageStartupMessages(library(tidyverse))
suppressPackageStartupMessages(library(lubridate))
suppressPackageStartupMessages(library(tidymodels))
suppressPackageStartupMessages(library(modeltime))

## create some senseless data to produce forecasts on...
dates <- ymd("2016-01-01")+ months(0:59)
fake_values <- 
  c(661,678,1094,1987,3310,2105,1452,983,1107,805,675,684,436,514,668,206,19,23,365,456,1174,1760,735,366,
    510,580,939,1127,2397,1514,1370,832,765,661,497,328,566,631,983,1876,2784,2928,2543,1508,1175,8,1733,
    862,779,1112,1446,2407,3917,2681,2397,1246,1125,1223,1234,1239,
    661,678,1094,1987,3310,2105,1452,983,1107,805,675,684,436,514,668,206,19,23,365,456,1174,1760,735,366,
    510,580,939,1127,2397,1514,1370,832,765,661,497,328,566,631,983,1876,2784,2928,2543,1508,1175,8,1733,
    862,779,1112,1446,2407,3917,2681,2397,1246,1125,1223,1234,1239,
    661,678,1094,1987,3310,2105,1452,983,1107,805,675,684,436,514,668,206,19,23,365,456,1174,1760,735,366,
    510,580,939,1127,2397,1514,1370,832,765,661,497,328,566,631,983,1876,2784,2928,2543,1508,1175,8,1733,
    862,779,1112,1446,2407,3917,2681,2397,1246,1125,1223,1234,1239,
    661,678,1094,1987,3310,2105,1452,983,1107,805,675,684,436,514,668,206,19,23,365,456,1174,1760,735,366,
    510,580,939,1127,2397,1514,1370,832,765,661,497,328,566,631,983,1876,2784,2928,2543,1508,1175,8,1733,
    862,779,1112,1446,2407,3917,2681,2397,1246,1125,1223,1234,1239)

replicate <- rep(1,60) %*% t.default(fake_values)
replicate <- as.data.frame(replicate)

df <- bind_cols(replicate, dates) %>%
  rename(c(dates = ...241))

## melt it down
data <- reshape2::melt(df, id.var='dates')

## make some senseless forecast on senseless data...
split_obj <- initial_time_split(data, prop = 0.8)  

model_fit_prophet <- prophet_reg() %>%
  set_engine(engine = "prophet") %>%
  fit(value ~ dates, data = training(split_obj))

## model table
models_tbl_prophet <- modeltime_table(model_fit_prophet)

## calibration
calibration_tbl_prophet <- models_tbl_prophet %>%
  modeltime_calibrate(new_data = testing(split_obj))

## forecast
fc_prophet <- calibration_tbl_prophet %>%
  modeltime_forecast(
    new_data = testing(split_obj),
    actual_data = data,
    keep_data = TRUE
  ) 

## "unmelt" that bastard again
fc_prophet <- fc_prophet %>% filter(str_detect(.key,  "prediction"))
fc_prophet <- fc_prophet[,c(4,9,10)]
fc_prophet <- dplyr::filter(fc_prophet, .index >= "2020-01-01", .index <= "2020-12-01")
#fc_prophet <- fc_prophet %>% subset(fc_prophet,  as.character(.index) >"2020-01-01" & as.character(.index)< "2020-12-01" )

fc_wide_prophet <- fc_prophet %>% 
  pivot_wider(names_from = variable, values_from = value)

Solution

  • Here is my full solution. I also have provided background on what I'm doing here: https://github.com/business-science/modeltime/issues/133

    suppressPackageStartupMessages(library(tidyverse))
    suppressPackageStartupMessages(library(lubridate))
    suppressPackageStartupMessages(library(tidymodels))
    suppressPackageStartupMessages(library(modeltime))
    library(timetk)
    
    ## create some senseless data to produce forecasts on...
    dates <- ymd("2016-01-01")+ months(0:59)
    fake_values <- 
        c(661,678,1094,1987,3310,2105,1452,983,1107,805,675,684,436,514,668,206,19,23,365,456,1174,1760,735,366,
          510,580,939,1127,2397,1514,1370,832,765,661,497,328,566,631,983,1876,2784,2928,2543,1508,1175,8,1733,
          862,779,1112,1446,2407,3917,2681,2397,1246,1125,1223,1234,1239,
          661,678,1094,1987,3310,2105,1452,983,1107,805,675,684,436,514,668,206,19,23,365,456,1174,1760,735,366,
          510,580,939,1127,2397,1514,1370,832,765,661,497,328,566,631,983,1876,2784,2928,2543,1508,1175,8,1733,
          862,779,1112,1446,2407,3917,2681,2397,1246,1125,1223,1234,1239,
          661,678,1094,1987,3310,2105,1452,983,1107,805,675,684,436,514,668,206,19,23,365,456,1174,1760,735,366,
          510,580,939,1127,2397,1514,1370,832,765,661,497,328,566,631,983,1876,2784,2928,2543,1508,1175,8,1733,
          862,779,1112,1446,2407,3917,2681,2397,1246,1125,1223,1234,1239,
          661,678,1094,1987,3310,2105,1452,983,1107,805,675,684,436,514,668,206,19,23,365,456,1174,1760,735,366,
          510,580,939,1127,2397,1514,1370,832,765,661,497,328,566,631,983,1876,2784,2928,2543,1508,1175,8,1733,
          862,779,1112,1446,2407,3917,2681,2397,1246,1125,1223,1234,1239)
    
    replicate <- rep(1,60) %*% t.default(fake_values)
    replicate <- as.data.frame(replicate)
    
    df <- bind_cols(replicate, dates) %>%
        rename(c(dates = ...241))
    
    ## melt it down
    data <- reshape2::melt(df, id.var='dates')
    
    data %>% as_tibble() -> data
    
    
    data %>%
        filter(as.numeric(variable) %in% 1:9) %>%
        group_by(variable) %>%
        plot_time_series(dates, value, .facet_ncol = 3, .smooth = F)
        
    
    ## make some senseless forecast on senseless data...
    split_obj <- initial_time_split(data, prop = 0.8)  
    
    split_obj %>%
        tk_time_series_cv_plan() %>%
        plot_time_series_cv_plan(dates, value)
    
    
    split_obj_2 <- time_series_split(data, assess = "1 year", cumulative = TRUE)
    
    split_obj_2 %>%
        tk_time_series_cv_plan() %>%
        plot_time_series_cv_plan(dates, value)
    
    model_fit_prophet <- prophet_reg() %>%
        set_engine(engine = "prophet") %>%
        fit(value ~ dates, data = training(split_obj))
    
    ## model table
    models_tbl_prophet <- modeltime_table(model_fit_prophet)
    
    ## calibration
    calibration_tbl_prophet <- models_tbl_prophet %>%
        modeltime_calibrate(new_data = testing(split_obj_2))
    
    ## forecast
    fc_prophet <- calibration_tbl_prophet %>%
        modeltime_forecast(
            new_data = testing(split_obj_2),
            actual_data = data,
            keep_data = TRUE
        ) 
    
    fc_prophet %>%
        filter(as.numeric(variable) %in% 1:9) %>%
        group_by(variable) %>%
        plot_modeltime_forecast(.facet_ncol = 3)
    
    ## "unmelt" that bastard again
    # fc_prophet <- fc_prophet %>% filter(str_detect(.key,  "prediction"))
    # fc_prophet <- fc_prophet[,c(4,9,10)]
    # fc_prophet <- dplyr::filter(fc_prophet, .index >= "2020-01-01", .index <= "2020-12-01")
    # #fc_prophet <- fc_prophet %>% subset(fc_prophet,  as.character(.index) >"2020-01-01" & as.character(.index)< "2020-12-01" )
    # 
    # fc_wide_prophet <- fc_prophet %>% 
    #     pivot_wider(names_from = variable, values_from = value)
    
    
    # Make a future forecast
    
    refit_tbl_prophet <- calibration_tbl_prophet %>%
        modeltime_refit(data = data)
    
    future_fc_prophet <- refit_tbl_prophet %>%
        modeltime_forecast(
            new_data = data %>% group_by(variable) %>% future_frame(.length_out = "1 year"),
            actual_data = data,
            keep_data = TRUE
        )
    
    future_fc_prophet %>%
        filter(as.numeric(variable) %in% 1:9) %>%
        group_by(variable) %>%
        plot_modeltime_forecast(.facet_ncol = 3)
    
    # Reformat as wide
    
    future_wide_tbl <- future_fc_prophet %>%
        filter(.key == "prediction") %>%
        select(.model_id, .model_desc, dates, variable, .value) %>%
        pivot_wider(
            id_cols     = c(.model_id, .model_desc, dates),
            names_from  = variable, 
            values_from = .value
        )
    
    future_wide_tbl[names(df)]