python tensorflow machine-learning keras lstm

Forecast future values with LSTM in Python

This code predicts the values of a specified stock up to the current date but not a date beyond the training dataset. This code is from an earlier question I had asked and so my understanding of it is rather low. I assume the solution would be a simple variable change to add the extra time but I am unaware as to which value needs to be manipulated.

import pandas as pd
import numpy as np
import yfinance as yf
import os
import matplotlib.pyplot as plt
from IPython.display import display
from keras.models import Sequential
from keras.layers import LSTM, Dense
from sklearn.preprocessing import MinMaxScaler
os.environ['TF_CPP_MIN_LOG_LEVEL'] = '2'

pd.options.mode.chained_assignment = None

# download the data
df = yf.download(tickers=['AAPL'], period='2y')

# split the data
train_data = df[['Close']].iloc[: - 200, :]
valid_data = df[['Close']].iloc[- 200:, :]

# scale the data
scaler = MinMaxScaler(feature_range=(0, 1))
scaler.fit(train_data)

train_data = scaler.transform(train_data)
valid_data = scaler.transform(valid_data)

# extract the training sequences
x_train, y_train = [], []

for i in range(60, train_data.shape[0]):
    x_train.append(train_data[i - 60: i, 0])
    y_train.append(train_data[i, 0])

x_train = np.array(x_train)
y_train = np.array(y_train)

# extract the validation sequences
x_valid = []

for i in range(60, valid_data.shape[0]):
    x_valid.append(valid_data[i - 60: i, 0])

x_valid = np.array(x_valid)

# reshape the sequences
x_train = x_train.reshape(x_train.shape[0], 
x_train.shape[1], 1)
x_valid = x_valid.reshape(x_valid.shape[0], 
x_valid.shape[1], 1)

# train the model
model = Sequential()
model.add(LSTM(units=50, return_sequences=True, 
input_shape=x_train.shape[1:]))
model.add(LSTM(units=50))
model.add(Dense(1))

model.compile(loss='mean_squared_error', optimizer='adam')
model.fit(x_train, y_train, epochs=50, batch_size=128, verbose=1)

# generate the model predictions
y_pred = model.predict(x_valid)
y_pred = scaler.inverse_transform(y_pred)
y_pred = y_pred.flatten()

# plot the model predictions
df.rename(columns={'Close': 'Actual'}, inplace=True)
df['Predicted'] = np.nan
df['Predicted'].iloc[- y_pred.shape[0]:] = y_pred
df[['Actual', 'Predicted']].plot(title='AAPL')

display(df)

plt.show()

Solution

You could train your model to predict a future sequence (e.g. the next 30 days) instead of predicting the next value (the next day) as it is currently the case.

In order to do that, you need to define the outputs as y[t: t + H] (instead of y[t] as in the current code) where y is the time series and H is the length of the forecast period (i.e. the number of days ahead that you want to forecast). You also need to set the number of outputs of the last layer equal to H (instead of equal to 1 as in the current code).

You can still define the inputs as y[t - T: t] where T is the length of the lookback period (or number of timesteps), and therefore the model's input shape is still (T, 1). The lookback period T is usually longer than the forecast period H (i.e. T > H) and it's often set equal to a multiple of H (i.e. T = m * H where m > 1 is an integer.).

import numpy as np
import pandas as pd
import yfinance as yf
import tensorflow as tf
from tensorflow.keras.layers import Dense, LSTM
from tensorflow.keras.models import Sequential
from sklearn.preprocessing import MinMaxScaler
pd.options.mode.chained_assignment = None
tf.random.set_seed(0)

# download the data
df = yf.download(tickers=['AAPL'], period='1y')
y = df['Close'].fillna(method='ffill')
y = y.values.reshape(-1, 1)

# scale the data
scaler = MinMaxScaler(feature_range=(0, 1))
scaler = scaler.fit(y)
y = scaler.transform(y)

# generate the input and output sequences
n_lookback = 60  # length of input sequences (lookback period)
n_forecast = 30  # length of output sequences (forecast period)

X = []
Y = []

for i in range(n_lookback, len(y) - n_forecast + 1):
    X.append(y[i - n_lookback: i])
    Y.append(y[i: i + n_forecast])

X = np.array(X)
Y = np.array(Y)

# fit the model
model = Sequential()
model.add(LSTM(units=50, return_sequences=True, input_shape=(n_lookback, 1)))
model.add(LSTM(units=50))
model.add(Dense(n_forecast))

model.compile(loss='mean_squared_error', optimizer='adam')
model.fit(X, Y, epochs=100, batch_size=32, verbose=0)

# generate the forecasts
X_ = y[- n_lookback:]  # last available input sequence
X_ = X_.reshape(1, n_lookback, 1)

Y_ = model.predict(X_).reshape(-1, 1)
Y_ = scaler.inverse_transform(Y_)

# organize the results in a data frame
df_past = df[['Close']].reset_index()
df_past.rename(columns={'index': 'Date', 'Close': 'Actual'}, inplace=True)
df_past['Date'] = pd.to_datetime(df_past['Date'])
df_past['Forecast'] = np.nan
df_past['Forecast'].iloc[-1] = df_past['Actual'].iloc[-1]

df_future = pd.DataFrame(columns=['Date', 'Actual', 'Forecast'])
df_future['Date'] = pd.date_range(start=df_past['Date'].iloc[-1] + pd.Timedelta(days=1), periods=n_forecast)
df_future['Forecast'] = Y_.flatten()
df_future['Actual'] = np.nan

results = df_past.append(df_future).set_index('Date')

# plot the results
results.plot(title='AAPL')

See this answer for a different approach.