
Keras, stateless LSTM


Here is a very simple example of an LSTM in stateless mode. We train it on two very simple samples, [0->1] and [0->2].

Any idea why it won't converge in stateless mode?

We have one batch of size 2 containing both samples, and the network is supposed to keep the state within the batch. When predicting, we would like to receive 1 and then 2.

from keras.models import Sequential
from keras.layers import Dense
from keras.layers import LSTM 
import numpy
# define sequences
seq = [0, 1, 0, 2]
# convert sequence into the required data format:
# we are going to extract 2 samples [0->1] and [0->2] and convert them into one-hot vectors
seqX = numpy.array([[(1., 0., 0.)], [(1., 0., 0.)]])
seqY = numpy.array([(0., 1., 0.), (0., 0., 1.)])

# define LSTM configuration
n_unique = len(set(seq)) 
n_neurons = 20
n_batch = 2
n_features = n_unique  # which is 3
# create LSTM
model = Sequential()
model.add(LSTM(n_neurons, input_shape=(1, n_features)))
model.add(Dense(n_unique, activation='sigmoid'))
model.compile(loss='binary_crossentropy', optimizer='Adam')
# train LSTM
model.fit(seqX, seqY, epochs=300, batch_size=n_batch, verbose=2, shuffle=False)
# evaluate LSTM 
print('Sequence')
result = model.predict_classes(seqX, batch_size=n_batch, verbose=0)
for i in range(2):
    print('X=%.1f y=%.1f, yhat=%.1f' % (0, i+1, result[i]))
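
(Aside: the hand-written one-hot arrays above can also be generated with to_categorical; a minimal sketch, assuming Keras 2's keras.utils.to_categorical:)

from keras.utils import to_categorical
import numpy

# inputs: the value 0 twice, one step per sample -> shape (2, 1, 3)
seqX = to_categorical([0, 0], num_classes=3).reshape(2, 1, 3)
# targets: the values 1 and 2 as one-hot rows -> shape (2, 3)
seqY = to_categorical([1, 2], num_classes=3)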

Example 2 Here I want to clarify a bit what result I want.

Same code example but in stateful mode (stateful=True). It works perfectly: we feed the network zeros twice and get 1 and then 2. But I want to get the same result in stateless mode, since it is supposed to keep the state within the batch.

from keras.models import Sequential
from keras.layers import Dense
from keras.layers import LSTM 
import numpy
# define sequences
seq = [0, 1, 0, 2]
# convert sequences into the required data format
seqX = numpy.array([[(1., 0., 0.)], [(1., 0., 0.)]])
seqY = numpy.array([(0., 1., 0.), (0., 0., 1.)])

# define LSTM configuration
n_unique = len(set(seq))
n_neurons = 20
n_batch = 1
n_features = n_unique
# create LSTM
model = Sequential()
model.add(LSTM(n_neurons, batch_input_shape=(n_batch, 1, n_features), stateful=True))
model.add(Dense(n_unique, activation='sigmoid'))
model.compile(loss='binary_crossentropy', optimizer='Adam')
# train LSTM
for epoch in range(300):
    model.fit(seqX, seqY, epochs=1, batch_size=n_batch, verbose=2, shuffle=False)
    model.reset_states()
# evaluate LSTM 
print('Sequence')
result = model.predict_classes(seqX, batch_size=1, verbose=0)
for i in range(2):
    print('X=%.1f y=%.1f, yhat=%.1f' % (0, i+1, result[i]))

As the correct result, we should get:

Sequence
X=0.0 y=1.0, yhat=1.0
X=0.0 y=2.0, yhat=2.0


Solution

  • You must feed one sequence with two steps instead of two sequences with one step:

    • One sequence, two steps: seqX.shape = (1,2,3)
    • Two sequences, one step: seqX.shape = (2,1,3)

    The input shape is (numberOfSequences, stepsPerSequence, featuresPerStep)

    seqX = [[[1,0,0],[1,0,0]]]
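
    To make the difference concrete, a quick shape check with plain numpy (illustrative only, independent of the model):

    import numpy
    one_seq_two_steps = numpy.array([[[1, 0, 0], [1, 0, 0]]])
    print(one_seq_two_steps.shape)  # (1, 2, 3) - state flows across the two steps
    two_seqs_one_step = numpy.array([[[1, 0, 0]], [[1, 0, 0]]])
    print(two_seqs_one_step.shape)  # (2, 1, 3) - samples are processed independently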
    

    If you want to get both steps for y as output, you must use return_sequences=True.

    LSTM(n_neurons, input_shape=(2, n_features), return_sequences=True)
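
    The effect on the output shape can be checked directly; a small illustrative sketch (layer sizes chosen to match the code below):

    from keras.models import Sequential
    from keras.layers import LSTM

    # without return_sequences: only the last step is returned -> (None, 20)
    print(Sequential([LSTM(20, input_shape=(2, 3))]).output_shape)
    # with return_sequences=True: one output per step -> (None, 2, 20)
    print(Sequential([LSTM(20, input_shape=(2, 3), return_sequences=True)]).output_shape)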
    

    The entire working code:

    from keras.models import Sequential
    from keras.layers import Dense
    from keras.layers import LSTM 
    import numpy
    
    # define sequences
    seq = [0, 1, 0, 2]
    
    # convert sequence into the required data format:
    # we are going to extract 2 samples [0->1] and [0->2] and convert them into one-hot vectors
    seqX = numpy.array([[[1., 0., 0.], [1., 0., 0.]]])
    seqY = numpy.array([[[0., 1., 0.], [0., 0., 1.]]])
    # shapes are (1, 2, 3) - 1 sequence, 2 steps, 3 features
    
    # define LSTM configuration
    n_unique = len(set(seq)) 
    n_neurons = 20
    n_features = n_unique  # which is 3
    # no need for a batch size
    
    # create LSTM
    model = Sequential()
    
    model.add(LSTM(n_neurons, input_shape=(2, n_features), return_sequences=True))
    # the input shape must have two steps
    
    model.add(Dense(n_unique, activation='sigmoid'))
    model.compile(loss='binary_crossentropy', optimizer='Adam')
    
    # train LSTM
    model.fit(seqX, seqY, epochs=300, verbose=2)
    # no shuffling and no batch size needed
    
    # evaluate LSTM 
    print('Sequence')
    result = model.predict_classes(seqX, verbose=0)
    print(seqX)
    print(result)  # all steps are predicted in a single array (with return_sequences=True)
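
    Note: predict_classes was removed from Sequential models in later Keras/TensorFlow releases (around TF 2.6), so on a recent install the last step may fail. The equivalent, roughly, is:

    probs = model.predict(seqX, verbose=0)  # shape (1, 2, 3): per-step class probabilities
    result = numpy.argmax(probs, axis=-1)   # class index for each step
    print(result)  # expected: [[1 2]]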