Search code examples
python pandas forecasting facebook-prophet holtwinters

for loop having issue with holt-winters exponential smoothing


When I fit my models one at a time on different training and test splits, everything works fine. When I run the same code inside a for loop over those splits, I get an error, and I am not sure why.

I have created several time splits to check how the model is performing with different data breakdowns.


# dataframe operations - pandas
import pandas as pd
# plotting data - matplotlib
from matplotlib import pyplot as plt
# time series - statsmodels 
# Seasonality decomposition
from statsmodels.tsa.seasonal import seasonal_decompose
from statsmodels.tsa.seasonal import seasonal_decompose 
# holt winters 
# single exponential smoothing
from statsmodels.tsa.holtwinters import SimpleExpSmoothing   
# double and triple exponential smoothing
from statsmodels.tsa.holtwinters import ExponentialSmoothing

from numpy import sqrt 
from sklearn.metrics import mean_squared_error

# Script overview: load a CSV of visit counts, cut it into several train/test
# splits, and for every split fit one Holt-Winters model per hyper-parameter
# combination, scoring each model by how many of its forecasts land within
# 20% of the actual value.

# Read the CSV, using the parsed `date` column as the index.
df = pd.read_csv('/content/hw-cv-imputed.csv',index_col='date', parse_dates=True)
# Declare the index frequency as weekly, anchored on Fridays.
df.index.freq = 'W-FRI'
# finding shape of the dataframe
print(df.shape)
# having a look at the data
print(df.head())
# plotting the original data
df[['visits']].plot(title='visit Data')

# Splitting according to the above description.
# Every training slice starts at row 0 and grows; each test slice is the
# 10 rows that follow it, except the last one (82:90), which is only 8 rows.
# Column 0 is selected positionally, so each train/test object is a Series.
train1, test1 = df.iloc[:52, 0], df.iloc[52:62, 0]
train2, test2 = df.iloc[:56, 0], df.iloc[56:66, 0]
train3, test3 = df.iloc[:60, 0], df.iloc[60:70, 0]
train4, test4 = df.iloc[:65, 0], df.iloc[65:75, 0]
train5, test5 = df.iloc[:69, 0], df.iloc[69:79, 0]
train6, test6 = df.iloc[:73, 0], df.iloc[73:83, 0]
train7, test7 = df.iloc[:78, 0], df.iloc[78:88, 0]
train8, test8 = df.iloc[:82, 0], df.iloc[82:90, 0]
# Accumulates the ranked results of every split.
total_model_parameters = pd.DataFrame(columns = ['Total','Parameters'])
# Split into train and test set
# (presumably the single-split version used before the loop was introduced)
#train_df = train1
#test_df = test1

from sklearn.model_selection import ParameterGrid
# NOTE: the tuples below contain string literals ('train1', 'test1', ...),
# not the Series created above, so train_df and test_df are bound to str
# objects. Passing such a str to ExponentialSmoothing matches the reported
# "ValueError: unrecognized data structures: <class 'str'>".
# Also note that only seven pairs are listed; train8/test8 is never visited.
for train_df ,test_df in [('train1','test1'),('train2','test2'),('train3','test3'),('train4','test4'),('train5','test5'),('train6','test6'),('train7','test7')]:
  # Hyper-parameter grid: 2 trend types x 2 seasonal types x 2 period lengths.
  params_grid = {'trend':('mul','add'),
                'seasonal':('mul','add'),
                'seasonal_periods': [10,12]}
  grid = ParameterGrid(params_grid)
  # Count the grid points by iterating over them once.
  cnt = 0
  for p in grid:
      cnt = cnt+1

  print('Total Possible Models',cnt)

  # One row per parameter combination for the current split.
  model_parameters = pd.DataFrame(columns = ['Total','Parameters'])
  for p in grid:
      test = pd.DataFrame()  # never read again within this script
      print(p)
      # NOTE(review): the surrounding ** look like markdown bold markers left
      # over from the post rather than Python syntax; remove before running.
      **fitted_model = ExponentialSmoothing(train_df,trend=p['trend'],seasonal=p['seasonal'],seasonal_periods=p['seasonal_periods']).fit()**
      # Forecast 10 steps past the end of the training data.
      test_predictions = fitted_model.forecast(10)
      # Side-by-side frame: actual values, predictions, and the percentage
      # error of each prediction relative to the actual value.
      df_new = pd.concat((test_df,test_predictions.rename('predicted_visits'),(((test_df-test_predictions)/test_df)*100).rename('error')),axis=1)
      # Row-wise flag: 1 when the absolute percentage error is below 20.
      def accuracy(row):  
          if  abs(row['error']) < 20:
              return 1
          return 0
      df_new['accuracy'] = df_new.apply(lambda row: accuracy(row), axis=1)
      # Score of this combination = number of forecasts within 20%.
      Total = df_new['accuracy'].sum()
      print('Accuracy------------------------------------',Total)
      # NOTE(review): DataFrame.append was removed in pandas 2.0; on newer
      # pandas this needs pd.concat - confirm the pandas version in use.
      model_parameters = model_parameters.append({'Total':Total,'Parameters':p},ignore_index=True)

  # Rank the combinations for this split, best score first.
  parameters = model_parameters.sort_values(by=['Total'],ascending=False)
  parameters = parameters.reset_index(drop=True)
  # The three bare expressions below are evaluated and their values
  # discarded; inside a loop they display nothing.
  parameters.head(9)

  Parameters_1 = pd.DataFrame(parameters)
  Parameters_1
  parameters['Parameters'][0]
  # Add this split's ranking to the overall results (same append caveat).
  total_model_parameters = total_model_parameters.append(parameters)
# Bare expression: a notebook would display it, a plain script discards it.
total_model_parameters

The error is raised by this line:

fitted_model = ExponentialSmoothing(train_df,trend=p['trend'],seasonal=p['seasonal'],seasonal_periods=p['seasonal_periods']).fit()

ValueError: unrecognized data structures: <class 'str'> / <class 'NoneType'>

Can someone help, please? :)

p.s. The data is as follows

date    visits
1/22/2021   7352070
1/29/2021   7063725
2/5/2021    9385950
2/12/2021   7851435
2/19/2021   9509640
2/26/2021   9919170
3/5/2021    9682125
3/12/2021   9597075
3/19/2021   8189835
3/26/2021   7487385
4/2/2021    8863965
4/9/2021    8856165
4/16/2021   8619345
4/23/2021   4499670
4/30/2021   3642705
5/7/2021    3105690
5/14/2021   3096330
5/21/2021   3240360
5/28/2021   5152410
6/4/2021    6471915
6/11/2021   4401030
6/18/2021   3197775
6/25/2021   2606340
7/2/2021    3248460
7/9/2021    4996425
7/16/2021   7775085
7/23/2021   9690795
7/30/2021   10041555
8/6/2021    11849055
8/13/2021   14598750
8/20/2021   15339390
8/27/2021   20118720
9/3/2021    12731115
9/10/2021   17456475
9/17/2021   20393850
9/24/2021   20537895
10/1/2021   20800935
10/8/2021   25035450
10/15/2021  22872450
10/22/2021  22790130
10/29/2021  22036965
11/5/2021   26988975
11/12/2021  29194530
11/19/2021  26106000
11/26/2021  29928660
12/3/2021   29254335
12/10/2021  32165430
12/17/2021  27303570
12/24/2021  21453585
12/31/2021  21568815
1/7/2022    21286680
1/14/2022   25589715
1/21/2022   21890130
1/28/2022   20881515
2/4/2022    24185835
2/11/2022   24160590
2/18/2022   20253360
2/25/2022   20450910
3/4/2022    26542320
3/11/2022   25540335
3/18/2022   29602380
3/25/2022   32258340
4/1/2022    24953640
4/8/2022    22872165
4/15/2022   25784490
4/22/2022   25168356
4/29/2022   25405687
5/6/2022    24693295
5/13/2022   26374944
5/20/2022   26192271
5/27/2022   26868125
6/3/2022    27948287
6/10/2022   28320595
6/17/2022   28153788
6/24/2022   27470327
7/1/2022    30520950
7/8/2022    28635750
7/15/2022   26269140
7/22/2022   24236250
7/29/2022   20541675
8/5/2022    21190020
8/12/2022   22389675
8/19/2022   24496455
8/26/2022   27555645
9/2/2022    26324760
9/9/2022    32937450
9/16/2022   36577425
9/23/2022   33522000
9/30/2022   30759780
10/7/2022   30615870


Solution

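  • The problem is that you have wrapped your variable names in quotes, so the loop iterates over the strings 'train1', 'test1', ... instead of the Series those names refer to. The line

    for train_df ,test_df in [('train1','test1'),...]
    

    shouldn't have the quotes, i.e. it should read `for train_df, test_df in [(train1, test1), ...]`.

    You can avoid spelling out that list by hand if you're happy to put your pairs of training and test data into a list of tuples as you create them, like this: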

    import pandas as pd
    from sklearn.model_selection import ParameterGrid
    from statsmodels.tsa.holtwinters import ExponentialSmoothing
    
    # Load the visit counts, using the parsed `date` column as the index, and
    # declare the index frequency as weekly, anchored on Fridays.
    df = pd.read_csv("hw-cv-imputed.csv", index_col="date", parse_dates=True)
    df.index.freq = "W-FRI"

    # finding shape of the dataframe
    print(df.shape)

    # having a look at the data
    print(df.head())

    # plotting the original data
    df[["visits"]].plot(title="visit Data")

    # Splitting according to the above description.
    # Each entry is (train_end, test_end): training data is rows [0, train_end)
    # and test data is rows [train_end, test_end) of the first column.
    # NOTE: every test window spans 10 rows except the last one (82:90), which
    # spans only 8.
    split_bounds = [
        (52, 62),
        (56, 66),
        (60, 70),
        (65, 75),
        (69, 79),
        (73, 83),
        (78, 88),
        (82, 90),
    ]
    # Pairs of actual Series objects (not their names), ready to unpack in the
    # loop below.
    train_and_test = [
        (df.iloc[:train_end, 0], df.iloc[train_end:test_end, 0])
        for train_end, test_end in split_bounds
    ]
    # Accumulates the results of every split.
    total_model_parameters = pd.DataFrame(columns=["Total", "Parameters"])

    # The hyper-parameter grid is the same for every split, so build it once
    # instead of once per iteration.
    params_grid = {
        "trend": ("mul", "add"),
        "seasonal": ("mul", "add"),
        "seasonal_periods": [10, 12],
    }
    grid = ParameterGrid(params_grid)
    # ParameterGrid reports its number of points through len(); no manual
    # counting loop is needed.
    cnt = len(grid)

    for train_df, test_df in train_and_test:
        print("Total Possible Models", cnt)

        # One row per parameter combination for the current split.
        model_parameters = pd.DataFrame(columns=["Total", "Parameters"])
        for p in grid:
            ...