I am attempting to include a dummy regressor that notes the beginning of the pandemic and runs a regression with ARIMA errors.
My dataset revolves around breaking & entering's happening in Toronto from 2014 to 2021. The issue is that the trend takes a turn due to covid-19 around 2020.
Auto.arima
provides me with a ARIMA(1,0,1) model as it is not taking into account the impact of covid-19 and is performing according to the implied return to the series average.
When trying to include a dummy regressor that notes the beginning of the pandemic and run a regression with ARIMA errors I get the following error:
In ifelse(time(BEDATA_GROUPEDtsssarima) >= yearmonth("2020-03"), :
Incompatible methods ("Ops.ts", ">=.vctrs_vctr") for ">="
Code:
# Create a binary time series that indicates the start of the pandemic
library(fpp3)
library(forecast)
library(zoo)
# Check if timeseries
class(BEDATA_GROUPED)
#Convert timeseries
BEDATA_GROUPEDtsssarima <- ts(BEDATA_GROUPED[,2], frequency = 12, start = c(2014, 1))
class(BEDATA_GROUPEDtsssarima)
#Plot
forecast::autoplot(BEDATA_GROUPEDtsssarima)
# Assume that the pandemic began in March 2020
pandemic_dummy <- ifelse(time(BEDATA_GROUPEDtsssarima) >= yearmonth("2020-03"), 1, 0)
# Use auto.arima() to fit an ARIMA model with the dummy variable as an exogenous variable
beddatamodel <- auto.arima(BEDATA_GROUPEDtsssarima, xreg = pandemic_dummy, ic="aic", trace = TRUE)
# Create a binary time series that indicates the start of the pandemic
# In this example, we will assume that the pandemic began in March 2020
pandemic_dummy <- ifelse(time(BEDATA_GROUPEDtsssarima) >= yearmonth("2020-03"), 1, 0)
# Use auto.arima() to fit an ARIMA model with the dummy variable as an exogenous variable
beddatamodel <- auto.arima(BEDATA_GROUPEDtsssarima, xreg = pandemic_dummy, ic="aic", trace = TRUE)
# Create a binary time series for the forecast period that includes the pandemic dummy variable
forecast_period <- time(BEDATA_GROUPEDtsssarima)["2022/01/01/":"2023/12/31/"]
pandemic_dummy_forecast <- ifelse(forecast_period >= yearmonth("2020-03"), 1, 0)
# Use the forecast()
forecast(pandemic_dummy_forecast)
Dataset:
structure(list(occurrence_yrmn = c("2014-January", "2014-February",
"2014-March", "2014-April", "2014-May", "2014-June", "2014-July",
"2014-August", "2014-September", "2014-October", "2014-November",
"2014-December", "2015-January", "2015-February", "2015-March",
"2015-April", "2015-May", "2015-June", "2015-July", "2015-August",
"2015-September", "2015-October", "2015-November", "2015-December",
"2016-January", "2016-February", "2016-March", "2016-April",
"2016-May", "2016-June", "2016-July", "2016-August", "2016-September",
"2016-October", "2016-November", "2016-December", "2017-January",
"2017-February", "2017-March", "2017-April", "2017-May", "2017-June",
"2017-July", "2017-August", "2017-September", "2017-October",
"2017-November", "2017-December", "2018-January", "2018-February",
"2018-March", "2018-April", "2018-May", "2018-June", "2018-July",
"2018-August", "2018-September", "2018-October", "2018-November",
"2018-December", "2019-January", "2019-February", "2019-March",
"2019-April", "2019-May", "2019-June", "2019-July", "2019-August",
"2019-September", "2019-October", "2019-November", "2019-December",
"2020-January", "2020-February", "2020-March", "2020-April",
"2020-May", "2020-June", "2020-July", "2020-August", "2020-September",
"2020-October", "2020-November", "2020-December", "2021-January",
"2021-February", "2021-March", "2021-April", "2021-May", "2021-June",
"2021-July", "2021-August", "2021-September", "2021-October",
"2021-November", "2021-December"), MCI = c(586, 482, 567, 626,
625, 610, 576, 634, 636, 663, 657, 556, 513, 415, 510, 542, 549,
618, 623, 666, 641, 632, 593, 617, 541, 523, 504, 536, 498, 552,
522, 519, 496, 541, 602, 570, 571, 492, 560, 525, 507, 523, 593,
623, 578, 657, 683, 588, 664, 582, 619, 512, 630, 644, 563, 654,
635, 732, 639, 748, 719, 567, 607, 746, 739, 686, 805, 762, 696,
777, 755, 675, 704, 617, 732, 609, 464, 487, 565, 609, 513, 533,
505, 578, 526, 418, 428, 421, 502, 452, 509, 492, 478, 469, 457,
457)), class = c("tbl_df", "tbl", "data.frame"), row.names = c(NA,
-96L))
I see you have used the fpp3 library, so I've had a go using the tidyverts tools. I've had a go at three models: a plain ARIMA, a plain regression using the pandemic dummy variable, and a dynamic model using both ARIMA and the dummy variable. Hope this helps! :-)
library(tsibble)
library(fable)
library(fabletools)
library(feasts)
library(dplyr)
Create a tsibble:
BEDATA_GROUPED <- BEDATA_GROUPED |>
mutate(Month = yearmonth(occurrence_yrmn)) |>
as_tsibble(index = Month)
autoplot(BEDATA_GROUPED)
Assume that the pandemic began in March 2020 and create a dummy variable:
pandemic_start <- yearmonth("2020-03-01")
BEDATA_GROUPED <- BEDATA_GROUPED |>
mutate(pandemic_dummy = ifelse(Month >= pandemic_start, 1, 0))
Work up a plain ARIMA:
BEDATA_GROUPED_arima <- BEDATA_GROUPED |>
model(ARIMA(MCI, stepwise = FALSE))
BEDATA_GROUPED_arima |>
gg_tsresiduals()
BEDATA_GROUPED_arima |>
forecast(h = 5) |>
autoplot()
Work up a plain regression:
BEDATA_GROUPED_TSLM <- BEDATA_GROUPED |>
model(TSLM(MCI ~ pandemic_dummy)) |>
report()
BEDATA_GROUPED_TSLM |>
gg_tsresiduals()
Make a data set to predict on:
new_data <- structure(list(Month = structure(c(18993, 19024, 19052, 19083,
19113), class = c("yearmonth", "vctrs_vctr")), pandemic_dummy = c(1,
1, 1, 1, 1)), class = c("tbl_ts", "tbl_df", "tbl", "data.frame"
), row.names = c(NA, -5L), key = structure(list(.rows = structure(list(
1:5), ptype = integer(0), class = c("vctrs_list_of", "vctrs_vctr",
"list"))), class = c("tbl_df", "tbl", "data.frame"), row.names = c(NA,
-1L)), index = structure("Month", ordered = TRUE), index2 = "Month", interval = structure(list(
year = 0, quarter = 0, month = 1, week = 0, day = 0, hour = 0,
minute = 0, second = 0, millisecond = 0, microsecond = 0,
nanosecond = 0, unit = 0), .regular = TRUE, class = c("interval",
"vctrs_rcrd", "vctrs_vctr")))
Forecast plain regression:
BEDATA_GROUPED_TSLM |>
forecast(new_data = new_data) |>
autoplot()
Work up a dynamic regression, with ARIMA and the pandemic dummy variablee:
BEDATA_GROUPED_dyn_ARIMA <- BEDATA_GROUPED |>
model(ARIMA(MCI ~ pandemic_dummy)) |>
report()
BEDATA_GROUPED_dyn_ARIMA |>
gg_tsresiduals()
BEDATA_GROUPED_dyn_ARIMA |>
forecast(new_data = new_data) |>
autoplot()