Search code examples
python tensorflow keras time-series lstm

How to predict future data or data of an unknown range after training an LSTM model with a time series dataset?


I have trained a neural network with an LSTM model using a time series dataset. I am using this dataset, which records the daily rainfall of 35 locations from 1970 to 2016: https://www.kaggle.com/redikod/historical-rainfall-data-in-bangladesh

It looks like this :

          StationIndex    Station   Year  Month Day Rainfall dayofyear
1970-01-01  1               Dhaka   1970    1   1   0           1
1970-01-02  1               Dhaka   1970    1   2   0           2
1970-01-03  1               Dhaka   1970    1   3   0           3
1970-01-04  1               Dhaka   1970    1   4   0           4
1970-01-05  1               Dhaka   1970    1   5   0           5

I have completed the training using the train and test data. Then I checked the predicted values against the true values. Here is the complete code; sorry if it is messy. I have added comments for each section:

import numpy as np
from pandas.plotting import register_matplotlib_converters
import pandas as pd
import matplotlib.pyplot as plt
from matplotlib import rc
from pylab import rcParams
import seaborn as sns
import tensorflow as tf
from tensorflow import keras
from keras.layers import (
    Input,
    Dense,
    LSTM,
    AveragePooling1D,
    TimeDistributed,
    Flatten,
    Bidirectional,
    Dropout
)
from keras.models import Model
from sklearn.metrics import mean_squared_error
from sklearn.metrics import mean_absolute_error
from sklearn.model_selection import train_test_split

# Reset Keras' global state before any model is built.
tf.keras.backend.clear_session()
# Register pandas' date/time converters with matplotlib.
register_matplotlib_converters()
# Seaborn theme applied to every figure.
sns.set(style='whitegrid', palette='muted', font_scale=1.5)

# Default matplotlib figure size as (width, height).
rcParams['figure.figsize'] = 22, 10

# Single seed shared by the NumPy and TensorFlow random generators below.
RANDOM_SEED = 42

np.random.seed(RANDOM_SEED)
tf.random.set_seed(RANDOM_SEED)

# Read the raw CSV.
# The literal is a raw string: "\c" is not a valid escape sequence, so the
# non-raw form triggers an invalid-escape warning. The resulting path text
# (leading backslash included) is unchanged.
df = pd.read_csv(r"\customized_daily_rainfall_data_Copy.csv")

# Drop bad data: rows whose Rainfall is -999.
df = df[df.Rainfall != -999]

# Drop impossible calendar dates (Feb 29 in non-leap years, Feb 30+,
# day 31 in 30-day months). The leap-year test uses the full Gregorian rule
# (divisible by 4, except centuries not divisible by 400) rather than "% 4".
leap_year = (df['Year'] % 4 == 0) & ((df['Year'] % 100 != 0) | (df['Year'] % 400 == 0))
is_february = df['Month'] == 2
is_30_day_month = df['Month'].isin([4, 6, 9, 11])
invalid_day = (
    (is_february & ~leap_year & (df['Day'] > 28))
    | (is_february & leap_year & (df['Day'] > 29))
    | (is_30_day_month & (df['Day'] > 30))
)
# One boolean filter instead of three in-place label drops; .copy() makes df an
# independent frame so the column assignments below do not act on a slice.
df = df[~invalid_day].copy()

# Date parsing: build a DatetimeIndex from the Year/Month/Day columns.
date = [f"{y}-{m}-{d}" for y, m, d in zip(df.Year, df.Month, df.Day)]
df.index = pd.to_datetime(date)

# Day-of-year feature taken straight from the index (no temporary column).
df['Dayofyear'] = df.index.dayofyear
# The station name column is dropped; StationIndex is kept.
df = df.drop(columns=['Station'])
df.head()


# Limit the dataframe to just the rows where StationIndex is 11.
datarange = df.loc[df['StationIndex'] == 11]

# Split train and test sets: the first 90% of the selected rows are used for
# training, the remaining rows for testing.
train_size = int(len(datarange) * 0.9)
test_size = len(datarange) - train_size
# Slice `datarange`, not `df`: slicing `df` would ignore the station filter and
# take the first len(datarange) rows of the whole dataset instead.
# .copy() gives independent frames that can be modified without touching `df`.
train = datarange.iloc[:train_size].copy()
test = datarange.iloc[train_size:].copy()

# Scale the feature and label columns of the dataset.
from sklearn.preprocessing import RobustScaler
f_columns = ['Year', 'Month','Day','Dayofyear']

# Work on independent copies so the assignments below never write into a
# slice of another frame (avoids pandas' chained-assignment ambiguity).
train = train.copy()
test = test.copy()

# The scaled values are floats; cast the feature columns up front so they can
# be stored without an implicit dtype change on assignment.
train[f_columns] = train[f_columns].astype(float)
test[f_columns] = test[f_columns].astype(float)

# Both scalers are fitted on the training split only and then applied to
# train and test alike.
f_transformer = RobustScaler()
l_transformer = RobustScaler()
f_transformer = f_transformer.fit(train[f_columns].to_numpy())
l_transformer = l_transformer.fit(train[['Rainfall']])


train.loc[:, f_columns] = f_transformer.transform(train[f_columns].to_numpy())
train['Rainfall'] = l_transformer.transform(train[['Rainfall']])
test.loc[:, f_columns] = f_transformer.transform(test[f_columns].to_numpy())
test['Rainfall'] = l_transformer.transform(test[['Rainfall']])

# Cut the data into sliding windows (inputs) and next-step targets (labels).
def create_dataset(X, y, time_steps=1):
    """Build sliding-window samples from X and next-step targets from y.

    Sample ``i`` is rows ``i .. i + time_steps - 1`` of ``X``; its target is
    the element of ``y`` at position ``i + time_steps`` (the row right after
    the window). Positions are integer offsets, not index labels.

    Args:
        X: pandas object whose rows are the time steps.
        y: pandas Series holding the target value for each time step.
        time_steps: number of consecutive rows in each window.

    Returns:
        A tuple ``(Xs, ys)`` of NumPy arrays with ``len(X) - time_steps``
        samples each; both are empty when ``X`` has no more than
        ``time_steps`` rows.
    """
    # Convert once up front; slicing the arrays is far cheaper than calling
    # iloc/to_numpy for every window.
    features = X.to_numpy()
    targets = y.to_numpy()
    n_samples = len(features) - time_steps
    Xs = [features[i:i + time_steps] for i in range(n_samples)]
    ys = [targets[i + time_steps] for i in range(n_samples)]
    return np.array(Xs), np.array(ys)

# Number of consecutive rows that make up one input window.
time_steps = 7

# Windowed arrays: X_* is [samples, time_steps, n_features]; y_* holds the
# Rainfall value of the row that follows each window.
X_train, y_train = create_dataset(train, train['Rainfall'], time_steps=time_steps)
X_test, y_test = create_dataset(test, test['Rainfall'], time_steps=time_steps)

# Quick look at the first time step of the first test window.
X_test[0][0]


# Model: three stacked bidirectional LSTM layers (128 units each), a dropout
# layer and a single-unit dense output, trained with an MSE loss.
model = keras.Sequential([
    # The first layer declares the input shape (time_steps, n_features) and
    # returns the full sequence so the next recurrent layer can consume it.
    keras.layers.Bidirectional(
        keras.layers.LSTM(
            units=128,
            input_shape=(X_train.shape[1], X_train.shape[2]),
            return_sequences=True,
        )
    ),
    keras.layers.Bidirectional(
        keras.layers.LSTM(units=128, return_sequences=True)
    ),
    # The last recurrent layer returns only its final output.
    keras.layers.Bidirectional(keras.layers.LSTM(units=128)),
    keras.layers.Dropout(rate=0.1),
    keras.layers.Dense(units=1),
])
model.compile(optimizer="RMSprop", loss="mean_squared_error")

# Train the model; `history` keeps the object returned by fit().
# 20% of the training samples are held out for validation and shuffling is
# disabled.
history = model.fit(
    x=X_train,
    y=y_train,
    batch_size=1052,
    epochs=500,
    validation_split=0.2,
    shuffle=False,
)

# Save the trained model to an HDF5 file.
from tensorflow.keras.models import load_model
# Raw string: "\T" is not a valid escape sequence, so the non-raw literal
# triggers an invalid-escape warning. The path text itself is unchanged.
model.save(r"\Timeseries-timestep7-batchsize1052.h5")

# Use the test dataset to make predictions.
y_pred = model.predict(X_test)

# Inverse transformation: map scaled values back through the label scaler.
# NOTE(review): the targets are reshaped to a single row (1, n) before
# inverse_transform, while y_pred is passed exactly as model.predict returned
# it — confirm downstream code expects these shapes.
y_train_inv = l_transformer.inverse_transform(y_train.reshape(1, -1))
y_test_inv = l_transformer.inverse_transform(y_test.reshape(1, -1))
y_pred_inv = l_transformer.inverse_transform(y_pred)

# Score: square root of the mean squared error between predictions and test
# targets.
# NOTE(review): this is computed on y_pred / y_test (the scaled values), not on
# the *_inv arrays above — confirm that is the intended unit.
from sklearn import metrics
score = np.sqrt(metrics.mean_squared_error(y_pred,y_test))
print(score)

What I want to do next is make predictions that go beyond the data in the dataset.

For example, I would like to use the model that I have trained to predict future data, or possibly a random/custom range. Suppose I want to predict the day-by-day rainfall data of 2017, or get the predicted data for 25-02-2017, or maybe the data for X number of days after the end of the dataset.

Is there a good, intuitive way to do that? Thank you in advance to whoever can answer this question. It's been bugging me for a few days now.


Solution

  • The code shown below should serve your purpose.

    class WindowGenerator():
      """Describe one sliding window over the data frames.

      A window spans ``input_width + shift`` consecutive positions. The first
      ``input_width`` positions are the inputs; the last ``label_width``
      positions are the labels.

      The ``train_df`` / ``val_df`` / ``test_df`` defaults are the
      module-level objects of the same names, bound when the class is defined.
      """

      def __init__(self, input_width, label_width, shift,
                   train_df=train_df, val_df=val_df, test_df=test_df,
                   label_columns=None):
        # Keep references to the three raw data splits.
        self.train_df, self.val_df, self.test_df = train_df, val_df, test_df

        # Column bookkeeping: name -> position lookups. The label lookup is
        # only created when label columns were given.
        self.label_columns = label_columns
        if label_columns is not None:
          self.label_columns_indices = dict(
              (column, position)
              for position, column in enumerate(label_columns))
        self.column_indices = dict(
            (column, position)
            for position, column in enumerate(train_df.columns))

        # Window geometry.
        self.input_width = input_width
        self.label_width = label_width
        self.shift = shift
        self.total_window_size = input_width + shift

        # Inputs occupy the leading positions of the window.
        self.input_slice = slice(0, input_width)
        self.input_indices = np.arange(self.total_window_size)[self.input_slice]

        # Labels occupy the trailing `label_width` positions of the window.
        self.label_start = self.total_window_size - self.label_width
        self.labels_slice = slice(self.label_start, None)
        self.label_indices = np.arange(self.total_window_size)[self.labels_slice]

      def __repr__(self):
        """Return a multi-line summary of the window layout."""
        summary_lines = (
            f'Total window size: {self.total_window_size}',
            f'Input indices: {self.input_indices}',
            f'Label indices: {self.label_indices}',
            f'Label column name(s): {self.label_columns}',
        )
        return '\n'.join(summary_lines)
    

    For example, if your data covers 1960 (Jan 1st) to 2016 (Dec 31st) and you want to predict the weather for the whole of February 2017 using a data window of the past 2 years, the arguments of the above class take the values shown below:

    input_width: 2 Years => 365 * 2 = 730
    label_width: Entire Feb Month => 28
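    shift: Offset from the end of the input window to the end of the label window. We are not predicting from Jan 1st 2017 but skipping the entire month of January (31 days) and then predicting February (28 days) => 31 + 28 = 59 (with shift = 30 the 28 label days would fall inside January, because the labels are the last label_width positions of a window of input_width + shift)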
    train_df, test_df, val_df => Self Explanatory
    label_columns : Name of the Target Column
    

    For more information, please refer to this TensorFlow tutorial on time series analysis.