Search code examples
pythontensorflowmachine-learningkeraslstm

Forecast future values with LSTM in Python


This code predicts the values of a specified stock up to the current date but not a date beyond the training dataset. This code is from an earlier question I had asked and so my understanding of it is rather low. I assume the solution would be a simple variable change to add the extra time but I am unaware as to which value needs to be manipulated.

import pandas as pd
import numpy as np
import yfinance as yf
import os
import matplotlib.pyplot as plt
from IPython.display import display
from keras.models import Sequential
from keras.layers import LSTM, Dense
from sklearn.preprocessing import MinMaxScaler
os.environ['TF_CPP_MIN_LOG_LEVEL'] = '2'

pd.options.mode.chained_assignment = None

# download the data
df = yf.download(tickers=['AAPL'], period='2y')

# split the data
train_data = df[['Close']].iloc[: - 200, :]
valid_data = df[['Close']].iloc[- 200:, :]

# scale the data
scaler = MinMaxScaler(feature_range=(0, 1))
scaler.fit(train_data)

train_data = scaler.transform(train_data)
valid_data = scaler.transform(valid_data)

# extract the training sequences
x_train, y_train = [], []

for i in range(60, train_data.shape[0]):
    x_train.append(train_data[i - 60: i, 0])
    y_train.append(train_data[i, 0])

x_train = np.array(x_train)
y_train = np.array(y_train)

# extract the validation sequences
x_valid = []

for i in range(60, valid_data.shape[0]):
    x_valid.append(valid_data[i - 60: i, 0])

x_valid = np.array(x_valid)

# reshape the sequences
x_train = x_train.reshape(x_train.shape[0], 
x_train.shape[1], 1)
x_valid = x_valid.reshape(x_valid.shape[0], 
x_valid.shape[1], 1)

# train the model
model = Sequential()
model.add(LSTM(units=50, return_sequences=True, 
input_shape=x_train.shape[1:]))
model.add(LSTM(units=50))
model.add(Dense(1))

model.compile(loss='mean_squared_error', optimizer='adam')
model.fit(x_train, y_train, epochs=50, batch_size=128, verbose=1)

# generate the model predictions
y_pred = model.predict(x_valid)
y_pred = scaler.inverse_transform(y_pred)
y_pred = y_pred.flatten()

# plot the model predictions
df.rename(columns={'Close': 'Actual'}, inplace=True)
df['Predicted'] = np.nan
df['Predicted'].iloc[- y_pred.shape[0]:] = y_pred
df[['Actual', 'Predicted']].plot(title='AAPL')

display(df)

plt.show()

Solution

  • You could train your model to predict a future sequence (e.g. the next 30 days) instead of predicting the next value (the next day) as it is currently the case.

    In order to do that, you need to define the outputs as y[t: t + H] (instead of y[t] as in the current code) where y is the time series and H is the length of the forecast period (i.e. the number of days ahead that you want to forecast). You also need to set the number of outputs of the last layer equal to H (instead of equal to 1 as in the current code).

    You can still define the inputs as y[t - T: t] where T is the length of the lookback period (or number of timesteps), and therefore the model's input shape is still (T, 1). The lookback period T is usually longer than the forecast period H (i.e. T > H) and it's often set equal to a multiple of H (i.e. T = m * H where m > 1 is an integer.).

    enter image description here

    import numpy as np
    import pandas as pd
    import yfinance as yf
    import tensorflow as tf
    from tensorflow.keras.layers import Dense, LSTM
    from tensorflow.keras.models import Sequential
    from sklearn.preprocessing import MinMaxScaler
    pd.options.mode.chained_assignment = None
    tf.random.set_seed(0)
    
    # download the data
    df = yf.download(tickers=['AAPL'], period='1y')
    y = df['Close'].fillna(method='ffill')
    y = y.values.reshape(-1, 1)
    
    # scale the data
    scaler = MinMaxScaler(feature_range=(0, 1))
    scaler = scaler.fit(y)
    y = scaler.transform(y)
    
    # generate the input and output sequences
    n_lookback = 60  # length of input sequences (lookback period)
    n_forecast = 30  # length of output sequences (forecast period)
    
    X = []
    Y = []
    
    for i in range(n_lookback, len(y) - n_forecast + 1):
        X.append(y[i - n_lookback: i])
        Y.append(y[i: i + n_forecast])
    
    X = np.array(X)
    Y = np.array(Y)
    
    # fit the model
    model = Sequential()
    model.add(LSTM(units=50, return_sequences=True, input_shape=(n_lookback, 1)))
    model.add(LSTM(units=50))
    model.add(Dense(n_forecast))
    
    model.compile(loss='mean_squared_error', optimizer='adam')
    model.fit(X, Y, epochs=100, batch_size=32, verbose=0)
    
    # generate the forecasts
    X_ = y[- n_lookback:]  # last available input sequence
    X_ = X_.reshape(1, n_lookback, 1)
    
    Y_ = model.predict(X_).reshape(-1, 1)
    Y_ = scaler.inverse_transform(Y_)
    
    # organize the results in a data frame
    df_past = df[['Close']].reset_index()
    df_past.rename(columns={'index': 'Date', 'Close': 'Actual'}, inplace=True)
    df_past['Date'] = pd.to_datetime(df_past['Date'])
    df_past['Forecast'] = np.nan
    df_past['Forecast'].iloc[-1] = df_past['Actual'].iloc[-1]
    
    df_future = pd.DataFrame(columns=['Date', 'Actual', 'Forecast'])
    df_future['Date'] = pd.date_range(start=df_past['Date'].iloc[-1] + pd.Timedelta(days=1), periods=n_forecast)
    df_future['Forecast'] = Y_.flatten()
    df_future['Actual'] = np.nan
    
    results = df_past.append(df_future).set_index('Date')
    
    # plot the results
    results.plot(title='AAPL')
    

    enter image description here

    See this answer for a different approach.