Search code examples
rggplot2time-seriesforecasting

forecast in ggplot2 with discontinuous time series


Consider this example file. I would like to forecast the weight by time. Usually I would do this with the code below, but the hitch is that the dates I have now are discontinuous. The oldest ones are every once in a while, while the latest once are on daily basis. I have read somewhere that I shall use the xts package rather than ts in this case.

The error message I get is:

Warning message:
In window.default(x, ...) : 'end' value not changed

And:

 Error in window.default(x, ...) : 'start' cannot be after 'end' 

Where do I have to adjust the below code to get my forecast running? Shall I extrapolate the missing weights and than use the ts on that daily measurement?

require(ggplot2)
require(zoo) # as.yearmon() function
require(forecast) # for forecasting
require(xts) # extensible time series

x <- get.url(https://dl.dropboxusercontent.com/u/109495328/example.csv)
app_df <- read.csv(x, header=T, sep = ",", quote = "", stringsAsFactors = FALSE, na.strings = "..")     
colnames(app_df) <- c("Date", "Weight")

date <- as.Date(strptime(app_df$Date, "%d.%m.%Y"))
weight <- app_df$Weight
df <- na.omit(data.frame(date,weight))

w <- as.numeric(weight) # ask: modifyingfunction with xts
myts <- ts(w, start = c(2016), end = c(2016), freq = 7) # add time dimension
# tail(weight, n=1)

funggcast <- function(dn, fcast){

  en <- max(time(fcast$mean)) # Extract the max date used in the forecast

  # Extract Source and Training Data
  ds <- as.data.frame(window(dn, end = en))
  names(ds) <- 'observed'
  ds$date <- as.Date(time(window(dn, end = en)))

  # Extract the Fitted Values (need to figure out how to grab confidence intervals)
  dfit <- as.data.frame(fcast$fitted)
  dfit$date <- as.Date(time(fcast$fitted))
  names(dfit)[1] <- 'fitted'

  ds <- merge(ds, dfit, all.x = T) # Merge fitted values with source and training data

  # Extract the Forecast values and confidence intervals
  dfcastn <- as.data.frame(fcast)
  dfcastn$date <- as.Date(paste(row.names(dfcastn),"01","01",sep="-"))
  names(dfcastn) <- c('forecast','lo80','hi80','lo95','hi95','date')

  pd <- merge(ds, dfcastn,all= T) # final data.frame for use in ggplot
  return(pd)

} # ggplot function by Frank Davenport

yt <- window(myts, end = c(4360)) # extract training data until last year
yfit <- auto.arima(yt) # fit arima model
yfor <- forecast(yfit) # forecast
pd <- funggcast(myts, yfor) # extract the data for ggplot using function funggcast()

ggplot(data = pd, aes(x = date, y = observed)) +
  geom_line(aes(color = "1")) +
  geom_line(aes(y = fitted,color="2")) +
  geom_line(aes(y = forecast,color="3")) +
  scale_colour_manual(values=c("red", "blue","black"),labels = c("Observed", "Fitted", "Forecasted"),name="Data") +
  geom_ribbon(aes(ymin = lo95, ymax = hi95), alpha = .25)

Solution

  • Well, this seems close to what you probably want. The funggcast function was making assumptions about the dates that were just not even close to being true, so I changed it to make it work. And I created an xts. And I got rid of all the window stuff which didn't seem to make any sense for this data.

    # R Script
    require(ggplot2)
    require(zoo) # as.yearmon() function
    require(forecast) # for forecasting
    require(xts) # extensible time series
    require(RCurl)
    
    x <- getURL("https://dl.dropboxusercontent.com/u/109495328/example.csv")
    app_df <- read.csv(text=x, header = T, sep = ",", quote = "",
                          stringsAsFactors = FALSE, na.strings = "..")
    colnames(app_df) <- c("Date", "Weight")
    
    date <- as.Date(strptime(app_df$Date, "%d.%m.%Y"))
    weight <- app_df$Weight
    df <- na.omit(data.frame(date, weight))
    
    w <- as.numeric(weight) # ask: modifyingfunction with xts
    myts <- xts(weight, order.by=date)
    # tail(weight, n=1)
    
    funggcast_new <- function(dn, fcast) {
    
       # en <- max(time(fcast$mean)) # Extract the max date used in the forecast (?)
        # Extract Source and Training Data
        ds <- as.data.frame(dn[,1])
        names(ds) <- 'observed'
        ds$date <- time(dn)
    
        # Extract the Fitted Values (need to figure out how to grab confidence intervals)
        dfit <- as.data.frame(fcast$fitted)
        dfit$date <- ds$date
        names(dfit)[1] <- 'fitted'
    
        ds <- merge(ds, dfit, all.x = T) # Merge fitted values with source and training data
    
        # Extract the Forecast values and confidence intervals
        dfcastn <- as.data.frame(fcast)
        dfcastn$date <- time(fcast) + time(dn)[length(dn)]
    
        names(dfcastn) <- c('forecast', 'lo80', 'hi80', 'lo95', 'hi95', 'date')
    
        pd <- merge(ds, dfcastn, all = T) # final data.frame for use in ggplot
        return(pd)
    }
    # ggplot function by Frank Davenport
    
    # yt <- window(myts, end = c(4360)) # extract training data until last year (?)
    yt <- myts
    yfit <- auto.arima(yt) # fit arima model
    yfor <- forecast(yfit) # forecast
    pd <- funggcast_new(myts, yfor) # extract the data for ggplot using function funggcast()
    
    ggplot(data = pd, aes(x = date, y = observed)) +
      geom_line(aes(color = "1")) +
      geom_line(aes(y = fitted, color = "2")) +
      geom_line(aes(y = forecast, color = "3")) +
      scale_colour_manual(values = c("red", "blue", "black"), 
              labels = c("Observed", "Fitted", "Forecasted"), name = "Data") +
      geom_ribbon(aes(ymin = lo95, ymax = hi95), alpha = .25)
    

    Yielding:

    enter image description here