I have trained an LSTM neural network on a time series dataset. The dataset records the daily rainfall at 35 locations from 1970 to 2016: https://www.kaggle.com/redikod/historical-rainfall-data-in-bangladesh
It looks like this :
StationIndex Station Year Month Day Rainfall dayofyear
1970-01-01 1 Dhaka 1970 1 1 0 1
1970-01-02 1 Dhaka 1970 1 2 0 2
1970-01-03 1 Dhaka 1970 1 3 0 3
1970-01-04 1 Dhaka 1970 1 4 0 4
1970-01-05 1 Dhaka 1970 1 5 0 5
I have completed the training using the train and test data, and then checked the predicted values against the true values. Here is the complete code (sorry if it is messy); I have put a comment above each section:
import numpy as np
from pandas.plotting import register_matplotlib_converters
import pandas as pd
import matplotlib.pyplot as plt
from matplotlib import rc
from pylab import rcParams
import seaborn as sns
import tensorflow as tf
from tensorflow import keras
from keras.layers import (
Input,
Dense,
LSTM,
AveragePooling1D,
TimeDistributed,
Flatten,
Bidirectional,
Dropout
)
from keras.models import Model
from sklearn.metrics import mean_squared_error
from sklearn.metrics import mean_absolute_error
from sklearn.model_selection import train_test_split
# Start from a clean Keras session.
tf.keras.backend.clear_session()

# Fix the NumPy and TensorFlow seeds to the same value.
RANDOM_SEED = 42
np.random.seed(seed=RANDOM_SEED)
tf.random.set_seed(RANDOM_SEED)

# Plot configuration: pandas date converters, seaborn theme, then the figure
# size (assigned after sns.set, as in the original ordering).
register_matplotlib_converters()
sns.set(font_scale=1.5, palette='muted', style='whitegrid')
rcParams['figure.figsize'] = (22, 10)
# Read the raw rainfall records from CSV. The raw-string prefix keeps the
# backslash literal; "\c" in a normal string is an invalid escape sequence.
df = pd.read_csv(r"\customized_daily_rainfall_data_Copy.csv")
# Drop rows whose Rainfall is -999 (bad data). Copy so the later in-place
# edits act on an independent frame rather than on a filtered slice.
df = df[df.Rainfall != -999].copy()
# Build real calendar dates from Year/Month/Day. Impossible dates (30 Feb,
# 31 Apr, 29 Feb in a non-leap year, day 32, ...) become NaT and are dropped,
# so the full Gregorian leap-year rule applies instead of a bare Year % 4 test.
date_parts = df[['Year', 'Month', 'Day']].rename(columns=str.lower)
dates = pd.to_datetime(date_parts, errors='coerce')
is_real_date = dates.notna()
df = df.loc[is_real_date].copy()
# Index the frame by date and derive the day-of-year feature from that index.
df.index = pd.DatetimeIndex(dates[is_real_date])
df['Dayofyear'] = df.index.dayofyear
# Drop the Station column (StationIndex is kept).
df.drop(['Station'], axis=1, inplace=True)
df.head()
# Limit the data to the rows where StationIndex is 11.
datarange = df.loc[df['StationIndex'] == 11]
# Positional 90/10 train/test split (no shuffling).
train_size = int(len(datarange) * 0.9)
test_size = len(datarange) - train_size
# Slice datarange, not df: slicing df would take the first rows of the whole
# multi-station frame and ignore the station filter above. The copies make
# train and test independent frames that are safe to modify.
train = datarange.iloc[:train_size].copy()
test = datarange.iloc[train_size:].copy()
# Scale the feature and label columns of the dataset.
from sklearn.preprocessing import RobustScaler
f_columns = ['Year', 'Month', 'Day', 'Dayofyear']
# Work on independent copies so the assignments below never write into a
# slice of another frame.
train = train.copy()
test = test.copy()
# Fit both scalers on the training data only; the test data is transformed
# with the training statistics.
f_transformer = RobustScaler().fit(train[f_columns].to_numpy())
l_transformer = RobustScaler().fit(train[['Rainfall']])
# Assign whole columns (frame[cols] = ...) rather than .loc[:, cols] = ...:
# the scaled values are floats and the original columns may be integers, and
# writing floats in place into integer columns is deprecated by pandas.
train[f_columns] = f_transformer.transform(train[f_columns].to_numpy())
train['Rainfall'] = l_transformer.transform(train[['Rainfall']])
test[f_columns] = f_transformer.transform(test[f_columns].to_numpy())
test['Rainfall'] = l_transformer.transform(test[['Rainfall']])
# Cut the dataset into overlapping fixed-length windows.
def create_dataset(X, y, time_steps=1):
    """Build sliding windows of X, each paired with the y value that follows.

    Window i holds rows i .. i + time_steps - 1 of X, and its target is
    y at position i + time_steps.

    Args:
        X: DataFrame/Series (anything with ``to_numpy``) or array-like of
            features, one row per time step.
        y: Series or array-like of targets, positionally aligned with X.
        time_steps: Number of consecutive rows in each window.

    Returns:
        A tuple ``(Xs, ys)`` of NumPy arrays. ``Xs`` has shape
        ``(n_windows, time_steps, n_features)`` and ``ys`` has shape
        ``(n_windows,)``, where ``n_windows = len(X) - time_steps``. When X
        is too short for a single window both arrays are empty, but ``Xs``
        still carries the ``time_steps`` and feature dimensions.

    Raises:
        IndexError: If y has fewer rows than X, so a window has no target.
    """
    # Convert once up front instead of calling .iloc/.to_numpy per window.
    features = X.to_numpy() if hasattr(X, "to_numpy") else np.asarray(X)
    targets = y.to_numpy() if hasattr(y, "to_numpy") else np.asarray(y)

    n_windows = max(len(features) - time_steps, 0)
    if n_windows == 0:
        # Keep the trailing dimensions so downstream shape checks still work.
        empty_windows = np.empty(
            (0, time_steps) + features.shape[1:], dtype=features.dtype
        )
        return empty_windows, np.empty((0,), dtype=targets.dtype)

    if len(targets) < time_steps + n_windows:
        raise IndexError("y must have at least as many rows as X")

    Xs = np.stack([features[i:i + time_steps] for i in range(n_windows)])
    # Copy so the returned targets never alias the caller's data.
    ys = targets[time_steps:time_steps + n_windows].copy()
    return Xs, ys
# Number of consecutive rows in each input window.
time_steps = 7
# Windowed arrays shaped [samples, time_steps, n_features]; the target of
# each window is the Rainfall value of the row that follows it.
X_train, y_train = create_dataset(train, train['Rainfall'], time_steps=time_steps)
X_test, y_test = create_dataset(test, test['Rainfall'], time_steps=time_steps)
# Quick look at the first row of the first test window.
X_test[0, 0]
# Model: three stacked bidirectional LSTM layers, dropout, and a single-unit
# dense output, assembled in one Sequential call.
model = keras.Sequential([
    keras.layers.Bidirectional(
        keras.layers.LSTM(
            units=128,
            input_shape=(X_train.shape[1], X_train.shape[2]),
            return_sequences=True,
        )
    ),
    keras.layers.Bidirectional(
        keras.layers.LSTM(units=128, return_sequences=True)
    ),
    keras.layers.Bidirectional(keras.layers.LSTM(units=128)),
    keras.layers.Dropout(rate=0.1),
    keras.layers.Dense(units=1),
])
model.compile(optimizer="RMSprop", loss="mean_squared_error")
# Fit the model. validation_split=0.2 reserves a fifth of the training
# samples for validation, and shuffle=False leaves the sample order untouched.
history = model.fit(
    x=X_train,
    y=y_train,
    batch_size=1052,
    epochs=500,
    shuffle=False,
    validation_split=0.2,
)
# Save the trained model. The raw-string prefix keeps the backslash literal;
# "\T" in a normal string is an invalid escape sequence.
from tensorflow.keras.models import load_model
model.save(r"\Timeseries-timestep7-batchsize1052.h5")
# Use the test dataset to make predictions.
y_pred = model.predict(X_test)
# Inverse transformation back to the original Rainfall scale. The label
# scaler was fitted on a single column, so each 1-D target array is passed
# as a column vector of shape (n_samples, 1); reshape(1, -1) would instead
# present every sample as a separate feature.
y_train_inv = l_transformer.inverse_transform(y_train.reshape(-1, 1))
y_test_inv = l_transformer.inverse_transform(y_test.reshape(-1, 1))
y_pred_inv = l_transformer.inverse_transform(y_pred)
# Score: root-mean-squared error between the scaled targets and the scaled
# predictions (arguments in sklearn's (y_true, y_pred) order).
from sklearn import metrics
score = np.sqrt(metrics.mean_squared_error(y_test, y_pred))
print(score)
What I want to do next is to make predictions that go beyond the data in the dataset.
For example, I would like to use the trained model to predict future data, or possibly a random/custom range. Suppose I want to predict the day-by-day rainfall for 2017, or get the predicted value for 25-02-2017, or the values for X days after the end of the dataset.
Is there a good, intuitive way to do that? Thank you in advance to whoever can answer this question. It's been bugging me for a few days now.
The code shown below should serve your purpose.
class WindowGenerator():
    """Describe how a time series is cut into input and label windows.

    A window spans ``input_width + shift`` consecutive time steps. The first
    ``input_width`` steps are the inputs and the last ``label_width`` steps
    are the labels.

    Args:
        input_width: Number of time steps fed to the model.
        label_width: Number of time steps to predict.
        shift: Number of steps from the last input step to the last label
            step.
        train_df: Training frame; its columns define ``column_indices``.
        val_df: Validation frame (stored as given).
        test_df: Test frame (stored as given).
        label_columns: Names of the target columns, or None.

    When a frame argument is omitted, a module-level variable of the same
    name (``train_df``, ``val_df``, ``test_df``) is used if one exists at
    call time. The lookup is deferred to call time so the class can be
    defined before the data is loaded.

    Raises:
        TypeError: If ``train_df`` is omitted and no module-level
            ``train_df`` exists.
        ValueError: If ``label_width`` is negative or larger than the window.
    """

    # Sentinel that distinguishes "argument omitted" from an explicit None.
    _UNSET = object()

    def __init__(self, input_width, label_width, shift,
                 train_df=_UNSET, val_df=_UNSET, test_df=_UNSET,
                 label_columns=None):
        train_df = self._resolve_frame('train_df', train_df)
        val_df = self._resolve_frame('val_df', val_df)
        test_df = self._resolve_frame('test_df', test_df)
        if train_df is self._UNSET:
            raise TypeError(
                "train_df was not passed and no module-level 'train_df' exists"
            )

        # Store the raw data; unresolved optional frames are stored as None.
        self.train_df = train_df
        self.val_df = None if val_df is self._UNSET else val_df
        self.test_df = None if test_df is self._UNSET else test_df

        # Work out the label column indices (empty when there are no labels,
        # so the attribute always exists).
        self.label_columns = label_columns
        self.label_columns_indices = {}
        if label_columns is not None:
            self.label_columns_indices = {name: i for i, name in
                                          enumerate(label_columns)}
        self.column_indices = {name: i for i, name in
                               enumerate(train_df.columns)}

        # Work out the window parameters.
        total_window_size = input_width + shift
        if not 0 <= label_width <= total_window_size:
            # A larger label_width makes label_start negative, which would
            # silently select the wrong positions.
            raise ValueError(
                "label_width must be between 0 and input_width + shift"
            )
        self.input_width = input_width
        self.label_width = label_width
        self.shift = shift
        self.total_window_size = total_window_size

        self.input_slice = slice(0, input_width)
        self.input_indices = np.arange(self.total_window_size)[self.input_slice]

        # Labels are the final label_width positions of the window.
        self.label_start = self.total_window_size - self.label_width
        self.labels_slice = slice(self.label_start, None)
        self.label_indices = np.arange(self.total_window_size)[self.labels_slice]

    @classmethod
    def _resolve_frame(cls, name, value):
        """Return value, or the module-level variable called name if omitted."""
        if value is not cls._UNSET:
            return value
        return globals().get(name, cls._UNSET)

    def __repr__(self):
        return '\n'.join([
            f'Total window size: {self.total_window_size}',
            f'Input indices: {self.input_indices}',
            f'Label indices: {self.label_indices}',
            f'Label column name(s): {self.label_columns}'])
For example, suppose your data runs from 1 January 1960 to 31 December 2016 and you want to predict the weather for the whole of February 2017 from a data window covering the past 2 years. The values of the arguments of the above class would then be:
input_width: 2 Years => 365 * 2 = 730
label_width: Entire Feb Month => 28
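shift: the number of days from the last input day to the last predicted day. The class takes the labels from the final label_width steps of a window of input_width + shift steps, so to skip the whole of January (31 days) and predict the whole of February (28 days), shift => 31 + 28 = 59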
train_df, test_df, val_df => Self Explanatory
label_columns : Name of the Target Column
For more information, please refer to the TensorFlow tutorial on time series forecasting.