Tags: tensorflow, recurrent-neural-network

Learning a simple pattern with RNN


I am trying to make an RNN in TensorFlow capture a basic pattern in a simple hourly time series, as a step towards a bigger problem involving count time series of customer demand.

The simple time series is as follows:

  1. Every 24 hours (1 day) there is a small integer, either 1 or 2, drawn from a random uniform distribution.
  2. In between these 24-hour marks the values are zero.
  3. Every 168 hours (7 days) there is a high integer (5, 6, 7, 8 or 9) drawn from a random uniform distribution.

[Figure: sample of the generated hourly time series]
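
For reference, gen_data (imported from data_generator) is roughly equivalent to the sketch below; this is a simplified reconstruction of the generator described above, not the exact file:

import numpy as np

def gen_data(len=1576):
    # zeros everywhere, a small value (1 or 2) every 24 hours and a high value
    # (5 to 9) every 168 hours; the weekly spike overwrites the daily one
    dem = np.zeros(len, dtype=np.int32)
    dem[::24] = np.random.randint(1, 3, size=dem[::24].shape)
    dem[::168] = np.random.randint(5, 10, size=dem[::168].shape)
    return dem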

I tried following the code at https://r2rt.com/recurrent-neural-networks-in-tensorflow-i.html using dynamic_rnn.

Is my test data correct? How can I feed the batch of outputs from the previous time step as input to the next time step? I have 5 hyperparameters to play with:

batch_size = 8
num_steps = 192
state_size = 5
learning_rate = 0.00001
num_epochs = 1

However, training repeatedly with the same hyperparameters gives different results each time, even though the training error is always very small. The different results seem quite random (different local minima, perhaps?). Orange is actual, blue is predicted.

[Figures: three prediction plots from identical training runs]

Can my test batch start at any point in the sequence? Does the RNN learn the number of zeros in between non-zero values? If the test batch starts with a small non-zero number, the RNN should know to output 23 zero-value steps after it and then, after 167 steps, a high non-zero value. If I start my test sequence at 0, should it wait 23 more zero-value steps before outputting a small non-zero value, and output a high non-zero value after 167 steps?

Or does it learn another pattern? I am not sure whether my method of testing is correct. Is it better to pass a single time step's integer value and let the network generate the remaining time steps by feeding each output back in as the input to the next time step?

Currently, I just take a random sequence X generated by the same method used for training and check whether my output Y is X shifted by 1 time step. Could you please explain?
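
For reference, the closed-loop test I have in mind would look roughly like the sketch below, using the functions defined in my code further down (not validated; the checkpoint name 'saver' just matches the save argument I pass to train_network): rebuild the graph with batch_size=1 and num_steps=1, restore the trained weights, and feed each predicted class back in as the next input while carrying the state forward.

# Sketch only: closed-loop generation one step at a time with the trained weights.
g_gen = build_RNN_model(1, num_classes, state_size, 1, learning_rate)
with tf.compat.v1.Session() as sess:
    g_gen['saver'].restore(sess, 'saver')          # checkpoint written by train_network
    current, state, generated = 1, None, [1]       # seed with a small non-zero value
    for _ in range(2 * 168):                       # generate two weeks of hourly values
        feed = {g_gen['x']: [[current]]}
        if state is not None:
            feed[g_gen['init_state']] = state
        preds, state = sess.run([g_gen['predictions'], g_gen['final_state']], feed)
        current = int(np.argmax(preds[0, 0]))      # greedy choice for the next hour
        generated.append(current)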

My code is given below. You can just copy and paste it and it should run. Basically, I generate the data, build the model, train the network and test it.

from data_generator import gen_data
import tensorflow as tf
tf.compat.v1.disable_eager_execution()
import numpy as np
import time
import matplotlib.pyplot as plt

num_classes = 11

batch_size = 8
num_steps = 192
state_size = 5
learning_rate = 0.00001
num_epochs=1

dem = gen_data(len=1576)


def gen_batch(dem, batch_size, num_steps):
    raw_x = dem[:-1]
    raw_y = dem[1:]
    data_length = len(raw_x)
    
    num_of_win = data_length - num_steps - 1 # 1382 windows
    batch_partition_length =  num_of_win // batch_size # 172 batches


    data_x = []
    data_y = []

    j=0
    for i in range(batch_partition_length):
        windows_x = []
        windows_y = []
        k=0
        while(k<batch_size):
            windows_x.append( raw_x[ j:num_steps + j] )
            windows_y.append( raw_y[ j:num_steps + j] )
            j+=1
            k+=1

        data_x.append(np.array(windows_x)) # each batch is stacked horizontally.
        data_y.append(np.array(windows_y))

    # yield one (inputs, targets) batch at a time; targets are inputs shifted by one step
    for windows_x, windows_y in zip(data_x, data_y):
        yield (windows_x, windows_y)




def gen_epoch(num_epochs,batch_size, num_steps):

    for n in range(num_epochs):
        yield gen_batch(dem, batch_size, num_steps)

def reset_graph():
    # if 'sess' in globals() and sess:
    #     sess.close()
    tf.compat.v1.reset_default_graph()

def build_RNN_model(batch_size, num_classes,state_size,num_steps,learning_rate):
    reset_graph()

    x = tf.compat.v1.placeholder(dtype=tf.int32, shape=(batch_size,num_steps))
    y = tf.compat.v1.placeholder(dtype=tf.int32, shape=(batch_size,num_steps))
    init_state = tf.zeros([batch_size, state_size])
    # with tf.compat.v1.variable_scope('rnn_cell'):
    #     W = tf.compat.v1.get_variable('inp_state_w', shape=(num_classes+state_size,state_size),initializer=tf.compat.v1.initializers.glorot_uniform(10) )
    #     b = tf.compat.v1.get_variable('inp_state_b', shape=(state_size),initializer=tf.compat.v1.initializers.constant(0.0) )

    # def rnn_cell(rnn_input,state):
    #     with tf.compat.v1.variable_scope('rnn_cell', reuse=True):
    #         W = tf.compat.v1.get_variable('inp_state_w', shape=(num_classes+state_size,state_size),initializer=tf.compat.v1.initializers.glorot_uniform(10) )
    #         b = tf.compat.v1.get_variable('inp_state_b', shape=(state_size),initializer=tf.compat.v1.initializers.constant(0.0) )
    #     return tf.tanh( tf.matmul( tf.concat([rnn_input,state], axis=1),W) + b )

    

    #cell = tf.compat.v1.nn.rnn_cell.BasicRNNCell(state_size, reuse=True, name='rnn_cell' )


    rnn_inputs = tf.one_hot(x, num_classes)

    cell = tf.compat.v1.nn.rnn_cell.BasicRNNCell(state_size)
    rnn_outputs, final_state = tf.compat.v1.nn.dynamic_rnn(cell, rnn_inputs, initial_state=init_state)

    with tf.compat.v1.variable_scope('output'):
        W = tf.compat.v1.get_variable('out_state_w', shape=(state_size,num_classes),initializer=tf.compat.v1.initializers.glorot_uniform(10) )
        b = tf.compat.v1.get_variable('out_state_b', shape=(num_classes),initializer=tf.compat.v1.initializers.constant(0.0) )


    logits = tf.reshape( tf.compat.v1.matmul(tf.reshape(rnn_outputs, [-1, state_size]), W) + b, [batch_size, num_steps, num_classes])
    predictions = tf.compat.v1.nn.softmax(logits)
    tru_labels = y

    losses = tf.nn.sparse_softmax_cross_entropy_with_logits(labels=y, logits=logits)
    total_loss = tf.reduce_mean(losses)
    train_step = tf.compat.v1.train.AdagradOptimizer(learning_rate).minimize(total_loss)

    return dict(
        x=x,
        y=y,
        final_state = final_state,
        total_loss = total_loss,
        train_step = train_step,
        init_state = init_state,
        predictions = predictions,
        tru_labels = tru_labels,
        saver = tf.compat.v1.train.Saver()
        
    )

def train_network(g,num_epochs, batch_size,num_steps, dem,save=' '):
    tf.compat.v1.set_random_seed(2345)
    with tf.compat.v1.Session() as sess:
        sess.run(tf.compat.v1.global_variables_initializer())
        training_losses = []
        
        for idx, epoch in enumerate(gen_epoch(num_epochs,batch_size, num_steps)):
            training_loss = 0
            steps=0 # number of batches
            training_state = None
            for X,Y in epoch:
                steps+=1

                feed_dict = {g['x'] : X, g['y'] : Y}
                
                if training_state is not None:
                    feed_dict[g['init_state']] = training_state
         
                training_loss_, training_state, train_step = \
                    sess.run([g['total_loss'], g['final_state'], g['train_step']], feed_dict)

                training_loss+=training_loss_
                print("Average training loss for Epoch", idx, ":", training_loss/steps)
                print('steps',steps)
                training_losses.append(training_loss/steps)

        if isinstance(save, str):
            g['saver'].save(sess, save)
        

        e = gen_batch(dem, batch_size, num_steps)

        for X,Y in e:
            tru_labels, predictions = \
                    sess.run([g['tru_labels'], g['predictions']], feed_dict={g['x'] : X, g['y'] : Y, g['init_state'] : training_state})
    

    pred = np.argmax(predictions, axis=2)
    print(pred.shape)
    pred = pred[0]
    print('predictions',pred)
    tru_labels = tru_labels[0]
    print('tru_labels',tru_labels )

    plt.plot(pred)    
    plt.plot(tru_labels)
    plt.show()


    return training_loss






g = build_RNN_model(batch_size, num_classes,state_size,num_steps,learning_rate)
t = time.time()
train_network(g, num_epochs,batch_size,num_steps, dem,save='saver' )
print("It took", time.time() - t, "seconds to train for 3 epochs.")

Solution

  • I have written some Keras code with a single RNN cell and a dense layer to capture the following two patterns, which are similar to the two patterns above. However, the distribution of magnitudes of the high and low vehicle counts, drawn from the categorical distribution below, is not being represented in the test output.

    Categorical Random Variable, x = {0,1,2} and p(x) = {0.6,0.3,0.1}

    low vehicles = 1 + x , every 4 hours

    high vehicles = 6 + x , every 8 hours

    [Figure: generated demand series with low spikes every 4 hours and high spikes every 8 hours]
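
    As a quick sanity check of the inputs (illustrative only; categorical is the same helper used in the data generation code further down), the sampled magnitudes should be {1, 2, 3} for the low spikes and {6, 7, 8} for the high spikes, with probabilities 0.6 / 0.3 / 0.1:

    import numpy as np

    p = np.array([0.6, 0.3, 0.1])

    def categorical(p):
        return (p.cumsum(-1) >= np.random.uniform(size=p.shape[:-1])[..., None]).argmax(-1)

    x = np.array([categorical(p) for _ in range(10000)])
    print(np.bincount(x) / len(x))   # roughly [0.6, 0.3, 0.1]
    print(np.unique(1 + x))          # low-vehicle magnitudes: [1 2 3]
    print(np.unique(6 + x))          # high-vehicle magnitudes: [6 7 8]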

    I managed to get results like the following:

    [Figure: predicted vs. actual demand over the forecast horizon]

    with this code

    import numpy as np
    import tensorflow as tf
    import matplotlib.pyplot as plt
    import tensorflow.keras as keras
    import sys
    
    #### for reproducible results
    from numpy.random import seed
    seed(1)
    tf.random.set_seed(2)
    
    
    
    n_steps = 12
    batch_size = 32
    lay1_state_size = 64
    lay2_state_size = 0
    dense_state_size = 1
    num_epochs = 25
    horizon = 24
    loss_function_type = 'sparse_categorical_crossentropy or mse or rmse'
    
    
    num_layers = 1
    optimizer_type = 'Adam'
    metrics = 'rmse'
    
    # spikes at regrular interval
    
    
    
    # demand series produced by the data generation code given further below
    dem = np.load('const_dem_2_freq_stoch_cat.npy')
    dem_len = len(dem)
    
    
    
    def gen_batch(dem, batch_size, n_steps):
        n = n_steps + 1
        raw_x = dem[:-1]
        data_length = len(raw_x)
        
        num_of_win = data_length - n - 1 # number of sliding windows
        batch_partition_length = num_of_win // batch_size # number of full batches
        #print('batch_partition_length',batch_partition_length)
    
        data_x = []
    
        j=0
        for i in range(batch_partition_length):
            windows_x = []
            k=0
            while(k<batch_size):
                windows_x.append( raw_x[ j:n + j] )
                j+=1
                k+=1
            data_x.append(np.array(windows_x)) # each batch is stacked horizontally.
    
        data_x = np.array(data_x)
        data_x = np.reshape(data_x,(-1,n)) # (batch_partition_length*batch_size) x (n_steps+1)
        #print(data_x.shape)
        
        return data_x,batch_partition_length
    
    
    data_x,batch_partition_length = gen_batch(dem, batch_size, n_steps)
    data_x = np.expand_dims(data_x,axis=-1)
    
    tr = int(0.7*dem_len)
    val = int(0.2*dem_len)
    
    x_train, y_train = data_x[:tr,:n_steps], data_x[:tr,-1]
    x_valid, y_valid = data_x[tr:tr+val,:n_steps], data_x[tr:tr+val,-1]
    print('\n\n')
    print('tr+val',tr+val)
    print('\n\n')
    x_test, y_test = data_x[tr+val:,:n_steps], data_x[tr+val:,-1]
    
    
    
    
    #model
    model = keras.models.Sequential([keras.layers.SimpleRNN(lay1_state_size,input_shape=[None,1]), keras.layers.Dense(dense_state_size)])
    
    # model = keras.models.Sequential([keras.layers.SimpleRNN(lay1_state_size,return_sequences=True,input_shape=[None,1]),keras.layers.SimpleRNN(lay2_state_size),
    # keras.layers.Dense(dense_state_size)])
    
    model.compile(optimizer='Adam',loss=keras.losses.mean_absolute_error ,metrics=[tf.keras.metrics.RootMeanSquaredError()] )
    model.fit(x_train, y_train, batch_size=batch_size, epochs=num_epochs,validation_data=(x_valid,y_valid))
    print('\n') 
    print('Model Evaluation on test set:\n')
    model.evaluate(x_test, y_test,batch_size=batch_size)
    print('\n') 
    #model.summary()
    
    
    
    
    
    y_tru = np.array([])
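    # Recursive multi-step forecast over the horizon: predict one value from the
    # current window, append that prediction to the inputs, and use it when
    # predicting the next step.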
    for step_ahead in range(horizon):
        # tru label
        y = np.append(data_x[step_ahead+1:,n_steps ], np.array([[0]*(step_ahead+1)]))
        y_tru = np.append(y_tru,y)
        # prediction
        y_pred_one = model.predict(data_x[:,step_ahead:])[:,np.newaxis,:]
        data_x = np.concatenate([data_x,y_pred_one ],axis=1)
    
    
    y_tru = np.reshape(y_tru,(batch_partition_length*batch_size,horizon),order='F')
    
    
    y_pred_horizon = data_x[:,n_steps+1:]
    y_pred_horizon = np.squeeze(y_pred_horizon)
    print('y_pred_horizon.shape', y_pred_horizon.shape)
    
    
    print(' RNN prediction on all data MSE',np.mean(keras.losses.mean_squared_error(y_tru,y_pred_horizon )) )
    print(' RNN prediction on all data MAE',np.mean(keras.losses.mean_absolute_error(y_tru,y_pred_horizon )) )
    print('\n') 
    
    for i in range(10):
        plt.figure(i)
        plt.plot(y_tru[i])
        plt.plot(np.squeeze(y_pred_horizon[i]))
        plt.show()
    

    The data generation code is given below

    import numpy as np
    import matplotlib.pyplot as plt
    
    dem_len = 1240
    
    def categorical(p):
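        # inverse-CDF sampling: draw u ~ Uniform(0, 1) and return the first index whose
        # cumulative probability reaches u, so x = 0, 1, 2 with probability 0.6, 0.3, 0.1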
        return (p.cumsum(-1) >= np.random.uniform(size=p.shape[:-1])[..., None]).argmax(-1)
    
    p = np.array([0.6, 0.3, 0.1])
    
    def dem_hr(hr, lo_veh, hi_veh,len):
    
        dem_hrs = np.array([])
        for i in range(10000):
            #d = np.random.randint(lo_veh,hi_veh)
            d = lo_veh + categorical(p)
    
            z = np.array([0]*(hr-1))
            dem_hrs =  np.append(dem_hrs, d)
            dem_hrs =  np.append(dem_hrs, z)
    
        dem_hrs = dem_hrs[:len]
    
        return dem_hrs
    
    
    def gen_data(len):
    
        dzero = np.zeros(len)
        
        # for hr,lo_veh, hi_veh in zip([4, 8],[1, 6],[3,9]):
        #     d = dem_hr(hr, lo_veh, hi_veh,len)
        #     dem = dem + d
        # dem = np.array(dem,dtype=np.float32)
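
        # combine the 4-hourly low-vehicle series (d4) and the 8-hourly high-vehicle
        # series (d8); wherever both spike at the same hour, keep the high value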
    
        d4 =  dem_hr(4, 1, 3,len)
        d8 =  dem_hr(8, 6, 9,len)
        dall =  dzero + d8
        dsub = dall - d4
        dem = np.where(dsub>=0,d8,d4)
    
        # plt.plot(dem)
        # plt.plot(d4)
        # plt.plot(d8)
        # plt.show()
    
        return dem
    
    
    dem = gen_data(len=dem_len)
    
    np.save('const_dem_2_freq_stoch_cat',dem)
    
    plt.plot(dem)
    plt.show()
    

    I think increasing the number of steps may help to capture the distribution of magnitudes at the different periods. Does increasing the number of layers also help to capture the magnitude distribution?
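
    For what it is worth, a deeper variant is easy to try in the same setup. The layer widths below are placeholder values I have not tuned, so this is only a sketch of the idea, compiled and fitted the same way as the single-layer model above:

    deeper = keras.models.Sequential([
        keras.layers.SimpleRNN(64, return_sequences=True, input_shape=[None, 1]),
        keras.layers.SimpleRNN(32),
        keras.layers.Dense(dense_state_size),
    ])
    deeper.compile(optimizer='Adam', loss=keras.losses.mean_absolute_error,
                   metrics=[tf.keras.metrics.RootMeanSquaredError()])
    deeper.fit(x_train, y_train, batch_size=batch_size, epochs=num_epochs,
               validation_data=(x_valid, y_valid))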