
Keras, stateless LSTM


Here is a very simple example of an LSTM in stateless mode. We train it on two very simple samples, [0->1] and [0->2].

Any idea why it won't converge in stateless mode?

We have one batch of size 2 containing both samples, and the network is supposed to keep the state within the batch. When predicting, we would like to receive 1 and then 2.

from keras.models import Sequential
from keras.layers import Dense
from keras.layers import LSTM 
import numpy
# define sequences
seq = [0, 1, 0, 2]
# convert sequence into the required data format:
# we are going to extract 2 samples [0->1] and [0->2] and convert them into one-hot vectors
seqX = numpy.array([[(1., 0., 0.)], [(1., 0., 0.)]])
seqY = numpy.array([(0., 1., 0.), (0., 0., 1.)])

# define LSTM configuration
n_unique = len(set(seq)) 
n_neurons = 20
n_batch = 2
n_features = n_unique  # which is 3
# create LSTM
model = Sequential()
model.add(LSTM(n_neurons, input_shape=(1, n_features)))
model.add(Dense(n_unique, activation='sigmoid'))
model.compile(loss='binary_crossentropy', optimizer='Adam')
# train LSTM
model.fit(seqX, seqY, epochs=300, batch_size=n_batch, verbose=2, shuffle=False)
# evaluate LSTM 
print('Sequence')
result = model.predict_classes(seqX, batch_size=n_batch, verbose=0)
for i in range(2):
    print('X=%.1f y=%.1f, yhat=%.1f' % (0, i+1, result[i]))
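
(Aside: the hand-written one-hot arrays above can also be generated with to_categorical; a minimal sketch, assuming Keras 2's keras.utils.to_categorical:)

from keras.utils import to_categorical
import numpy

# inputs: the value 0 twice, one step per sample -> shape (2, 1, 3)
seqX = to_categorical([0, 0], num_classes=3).reshape(2, 1, 3)
# targets: the values 1 and 2 as one-hot rows -> shape (2, 3)
seqY = to_categorical([1, 2], num_classes=3)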

Example 2 Here I want to clarify a bit what result I want.

Same code example but in stateful mode (stateful=True). It works perfectly: we feed the network zeros twice and get 1 and then 2. But I want to get the same result in stateless mode, since it is supposed to keep the state within the batch.

from keras.models import Sequential
from keras.layers import Dense
from keras.layers import LSTM 
import numpy
# define sequences
seq = [0, 1, 0, 2]
# convert sequences into the required data format
seqX = numpy.array([[(1., 0., 0.)], [(1., 0., 0.)]])
seqY = numpy.array([(0., 1., 0.), (0., 0., 1.)])

# define LSTM configuration
n_unique = len(set(seq))
n_neurons = 20
n_batch = 1
n_features = n_unique
# create LSTM
model = Sequential()
model.add(LSTM(n_neurons, batch_input_shape=(n_batch, 1, n_features), stateful=True))
model.add(Dense(n_unique, activation='sigmoid'))
model.compile(loss='binary_crossentropy', optimizer='Adam')
# train LSTM
for epoch in range(300):
    model.fit(seqX, seqY, epochs=1, batch_size=n_batch, verbose=2, shuffle=False)
    model.reset_states()
# evaluate LSTM 
print('Sequence')
result = model.predict_classes(seqX, batch_size=1, verbose=0)
for i in range(2):
    print('X=%.1f y=%.1f, yhat=%.1f' % (0, i+1, result[i]))

As the correct result, we should get:

Sequence
X=0.0 y=1.0, yhat=1.0
X=0.0 y=2.0, yhat=2.0


Solution

  • You must feed one sequence with two steps instead of two sequences with one step:

    • One sequence, two steps: seqX.shape = (1,2,3)
    • Two sequences, one step: seqX.shape = (2,1,3)

    The input shape is (numberOfSequences, stepsPerSequence, featuresPerStep)

    seqX = [[[1,0,0],[1,0,0]]]
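
    To make the difference concrete, a quick shape check with plain numpy (illustrative only, independent of the model):

    import numpy
    one_seq_two_steps = numpy.array([[[1, 0, 0], [1, 0, 0]]])
    print(one_seq_two_steps.shape)  # (1, 2, 3) - state flows across the two steps
    two_seqs_one_step = numpy.array([[[1, 0, 0]], [[1, 0, 0]]])
    print(two_seqs_one_step.shape)  # (2, 1, 3) - samples are processed independently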
    

    If you want to get both steps for y as output, you must use return_sequences=True.

    LSTM(n_neurons, input_shape=(2, n_features), return_sequences=True)
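
    The effect on the output shape can be checked directly; a small illustrative sketch (layer sizes chosen to match the code below):

    from keras.models import Sequential
    from keras.layers import LSTM

    # without return_sequences: only the last step is returned -> (None, 20)
    print(Sequential([LSTM(20, input_shape=(2, 3))]).output_shape)
    # with return_sequences=True: one output per step -> (None, 2, 20)
    print(Sequential([LSTM(20, input_shape=(2, 3), return_sequences=True)]).output_shape)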
    

    The entire working code:

    from keras.models import Sequential
    from keras.layers import Dense
    from keras.layers import LSTM 
    import numpy
    
    # define sequences
    seq = [0, 1, 0, 2]
    
    # convert sequence into the required data format:
    # we are going to extract 2 samples [0->1] and [0->2] and convert them into one-hot vectors
    seqX = numpy.array([[[1., 0., 0.], [1., 0., 0.]]])
    seqY = numpy.array([[[0., 1., 0.], [0., 0., 1.]]])
    # shapes are (1, 2, 3) - 1 sequence, 2 steps, 3 features
    
    # define LSTM configuration
    n_unique = len(set(seq)) 
    n_neurons = 20
    n_features = n_unique  # which is 3
    # no need for a batch size
    
    # create LSTM
    model = Sequential()
    
    model.add(LSTM(n_neurons, input_shape=(2, n_features), return_sequences=True))
    # the input shape must have two steps
    
    model.add(Dense(n_unique, activation='sigmoid'))
    model.compile(loss='binary_crossentropy', optimizer='Adam')
    
    # train LSTM
    model.fit(seqX, seqY, epochs=300, verbose=2)
    # no shuffling and no batch size needed
    
    # evaluate LSTM 
    print('Sequence')
    result = model.predict_classes(seqX, verbose=0)
    print(seqX)
    print(result)  # all steps are predicted in a single array (with return_sequences=True)
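
    Note: predict_classes was removed from Sequential models in later Keras/TensorFlow releases (around TF 2.6), so on a recent install the last step may fail. The equivalent, roughly, is:

    probs = model.predict(seqX, verbose=0)  # shape (1, 2, 3): per-step class probabilities
    result = numpy.argmax(probs, axis=-1)   # class index for each step
    print(result)  # expected: [[1 2]]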