Search code examples
pythontime-seriesstatsmodelsholtwinters

Interpolation using ExponentialSmoothing from stats models


I am using ExponentialSmoothing from statsmodels to run Holt-Winters method on time series. I get forecasted values but can not extract calculated values and compare them with observed values.

from pandas import Series
from scipy import stats
import statsmodels.api as sm
from statsmodels.tsa.api import ExponentialSmoothing

modelHW = ExponentialSmoothing(np.asarray(passtrain_df['n_passengers']), seasonal_periods=12, trend='add', seasonal='mul',).fit()

y_hat_avg['Holt_Winter'] = modelHW.forecast(prediction_size)

So here, prediction_size = number of forecasted datapoints (4 in my case) passtrain_df is a dataframe with observations (140 datapoints) based on which Holt_Winter model is built (regression).

I can easily display 4 forecasted values.

How do I extract 140 calculated values?

Tried to use:

print(ExponentialSmoothing.predict(np.asarray(passtrain_df), start=0, end=139))

But I probably have a syntax error somewhere

Thank you!


Solution

  • Edit:

    • Replaced synthetic dataset with sample data from OP

    • Fixed function that builds new forecast period

    • Fixed x-axis date format as per OPs request

    Answer:

    If you're looking for calculated values within your estimation period, you should use modelHW.fittedvalues and not modelHW.forecast(). The latter will give you just what it says; forecasts. And it's pretty awesome. Let me show you how to do both things:

    Plot 1 - Model within estimation period

    enter image description here

    Plot 2 - Forecasts

    enter image description here

    Code:

    #imports
    import matplotlib.pyplot as plt
    import pandas as pd
    import numpy as np
    from statsmodels.tsa.api import ExponentialSmoothing
    import matplotlib.dates as mdates
    #%%
    #
    
    # Load data
    pass_df = pd.read_csv('https://raw.githubusercontent.com/dacatay/time-series-analysis/master/data/passengers.csv', sep=';')
    pass_df = pass_df.set_index('month')
    type(pass_df.index)
    
    df = pass_df.copy()
    
    # Model
    modelHW = ExponentialSmoothing(np.asarray(df['n_passengers']), seasonal_periods=12, trend='add', seasonal='mul',).fit()
    modelHW.summary()
    
    # Model, fitted values
    model_values = modelHW.fittedvalues
    model_period = df.index
    df_model = pd.concat([df['n_passengers'], pd.Series(model_values, index = model_period)], axis = 1)
    df_model.columns = ['n_passengers', 'HWmodel']
    df_model = df_model.set_index(pd.DatetimeIndex(df_model.index))
    
    # Model, plot
    fig, ax = plt.subplots()
    myFmt = mdates.DateFormatter('%Y-%m')
    df_model.plot(ax = ax, x_compat=True)
    ax.xaxis.set_major_formatter(myFmt)
    
    # Forecasts
    prediction_size = 10
    forecast_values = modelHW.forecast(prediction_size)
    
    # Forecasts, build new period 
    forecast_start = df.index[-1]
    forecast_start = pd.to_datetime(forecast_start, format='%Y-%m-%d')
    forecast_period = pd.period_range(forecast_start, periods=prediction_size+1, freq='M')
    forecast_period = forecast_period[1:]
    
    # Forecasts, create dataframe
    df_forecast = pd.Series(forecast_values, index = forecast_period.values).to_frame()
    df_forecast.columns = ['HWforecast']
    
    # merge input and forecast dataframes
    df_all = pd.merge(df,df_forecast, how='outer', left_index=True, right_index=True)
    #df_all = df_all.set_index(pd.DatetimeIndex(df_all.index.values))
    ix = df_all.index
    ixp = pd.PeriodIndex(ix, freq = 'M')
    df_all = df_all.set_index(ixp)
    
    # Forecast, plot
    fig, ax = plt.subplots()
    myFmt = mdates.DateFormatter('%Y-%m')
    df_all.plot(ax = ax, x_compat=True)
    ax.xaxis.set_major_formatter(myFmt)
    

    Previous attempts:

    # imports
    import pandas as pd
    import numpy as np
    from statsmodels.tsa.api import ExponentialSmoothing
    
    # Data that matches your setup, but with a random
    # seed to make it reproducible
    np.random.seed(42)
    
    # Time
    date = pd.to_datetime("1st of Jan, 2019")
    dates = date+pd.to_timedelta(np.arange(140), 'D')
    
    # Data
    n_passengers = np.random.normal(loc=0.0, scale=5.0, size=140).cumsum()
    n_passengers = n_passengers.astype(int) + 100
    df = pd.DataFrame({'n_passengers':n_passengers},index=dates)
    

    1. How to plot observed vs. estimated values within the estimation period:

    The following snippet will extract all fitted values and plot it against your observed values.

    Snippet 2:

    # Model
    modelHW = ExponentialSmoothing(np.asarray(df['n_passengers']), seasonal_periods=12, trend='add', seasonal='mul',).fit()
    modelHW.summary()
    
    # Model, fitted values
    model_values = modelHW.fittedvalues
    model_period = df.index
    df_model = pd.concat([df['n_passengers'], pd.Series(model_values, index = model_period)], axis = 1)
    df_model.columns = ['n_passengers', 'HWmodel']
    df_model.plot()
    

    Plot 1:

    enter image description here

    2. How to produce and plot model forecasts of a certain length:

    The following snippet will produce 10 forecasts from your model, and plot it as an extended period compared to your observer values.

    Snippet 3:

    # Forecast
    prediction_size = 10
    forecast_values = modelHW.forecast(prediction_size)
    forecast_period = df.index[-1] + pd.to_timedelta(np.arange(prediction_size+1), 'D')
    forecast_period  = forecast_period[1:]
    
    df_forecast = pd.concat([df['n_passengers'], pd.Series(forecast_values, index = forecast_period)], axis = 1)
    df_forecast.columns = ['n_passengers', 'HWforecast']
    df_forecast.plot()
    

    Plot 2:

    enter image description here


    And here's the whole thing for an easy copy&paste:

    # imports
    import matplotlib.pyplot as plt
    import pandas as pd
    import numpy as np
    from statsmodels.tsa.api import ExponentialSmoothing
    
    # Data that matches your setup, but with a random
    # seed to make it reproducible
    np.random.seed(42)
    
    # Time
    date = pd.to_datetime("1st of Jan, 2019")
    dates = date+pd.to_timedelta(np.arange(140), 'D')
    
    # Data
    n_passengers = np.random.normal(loc=0.0, scale=5.0, size=140).cumsum()
    n_passengers = n_passengers.astype(int) + 100
    df = pd.DataFrame({'n_passengers':n_passengers},index=dates)
    
    # Model
    modelHW = ExponentialSmoothing(np.asarray(df['n_passengers']), seasonal_periods=12, trend='add', seasonal='mul',).fit()
    modelHW.summary()
    
    # Model, fitted values
    model_values = modelHW.fittedvalues
    model_period = df.index
    df_model = pd.concat([df['n_passengers'], pd.Series(model_values, index = model_period)], axis = 1)
    df_model.columns = ['n_passengers', 'HWmodel']
    df_model.plot()
    
    # Forecast
    prediction_size = 10
    forecast_values = modelHW.forecast(prediction_size)
    forecast_period = df.index[-1] + pd.to_timedelta(np.arange(prediction_size+1), 'D')
    forecast_period  = forecast_period[1:]
    
    df_forecast = pd.concat([df['n_passengers'], pd.Series(forecast_values, index = forecast_period)], axis = 1)
    df_forecast.columns = ['n_passengers', 'HWforecast']
    df_forecast.plot()
    

    @vestland - here is the code and error:

    y_train = passtrain_df.copy(deep=True)
    
    model_HW = ExponentialSmoothing(np.asarray(y_train['n_passengers']), seasonal_periods=12, trend='add', seasonal='mul',).fit()
    
    model_values = model_HW.fittedvalues
    model_period = y_train.index
    
    hw_model = pd.concat([y_train['n_passengers'], pd.Series(model_values, index = model_period)], axis = 1)
    hw_model.columns = ['Observed Passengers', 'Holt-Winters']
    
    plt.figure(figsize=(18,12))
    hw_model.plot()
    
    forecast_values = model_HW.forecast(prediction_size)
    forecast_period = y_train.index[-1] + pd.to_timedelta(np.arange(prediction_size+1),'D')
    forecast_period  = forecast_period[1:]
    
    hw_forecast = pd.concat([y_train['n_passengers'], pd.Series(forecast_values, index = forecast_period)], axis = 1)
    hw_forecast.columns = ['Observed Passengers', 'HW-Forecast']
    hw_forecast.plot()
    

    Error:

    NullFrequencyError     Traceback (most recent call last)
    <ipython-input-25-5f37a0dd0cfa> in <module>()
         17 
         18 forecast_values = model_HW.forecast(prediction_size)
    ---> 19 forecast_period = y_train.index[-1] +  pd.to_timedelta(np.arange(prediction_size+1),'D')
         20 forecast_period  = forecast_period[1:]
         21 
    
    /anaconda3/lib/python3.6/site- packages/pandas/core/indexes/datetimelike.py in __radd__(self, other)
        879         def __radd__(self, other):
        880             # alias for __add__
    --> 881             return self.__add__(other)
        882         cls.__radd__ = __radd__
        883 
    
    /anaconda3/lib/python3.6/site- packages/pandas/core/indexes/datetimelike.py in __add__(self, other)
        842                 # This check must come after the check for  np.timedelta64
        843                 # as is_integer returns True for these
    --> 844                 result = self.shift(other)
        845 
        846             # array-like others
    
    /anaconda3/lib/python3.6/site-packages/pandas/core/indexes/datetimelike.py in shift(self, n, freq)
       1049 
       1050         if self.freq is None:
    -> 1051             raise NullFrequencyError("Cannot shift with no freq")
       1052 
       1053         start = self[0] + n * self.freq
    
    NullFrequencyError: Cannot shift with no freq