python tensorflow machine-learning keras lstm

LSTM raises ValueError: Shapes (5, 2, 3) and (5, 3) are incompatible

I want to do multi-class classification on time-series data. The data set I have needs heavy preprocessing, so just to get an idea of how to implement the model I have used the Iris data set (not really suitable for an LSTM), since it has exactly the same structure as my time-series data (4 input features, 1 output feature, 120 samples). I have implemented the following code, but it raises an incompatible-shapes error when fitting the model with a batch size of 5 (I changed the batch size many times, but it didn't make any difference).

#load dataset
    dataframe = pandas.read_csv("iris.csv",header=None)
    dataset = dataframe.values
    X=dataset[:,0:4].astype(float)
    Y=dataset[:,4]

# Encode the output variables
    encoder = LabelEncoder()
    encoder.fit(Y)
    # convert output variables into the numbers
    encoded_Y = encoder.transform(Y)
    # Convert integers to dummy variables (one-hot encoded)
    dummy_Y = np_utils.to_categorical(encoded_Y)

from sklearn.model_selection import train_test_split
X_train,X_test,y_train,y_test=train_test_split(X,dummy_Y,test_size=0.2) #20% is allocated for the testing

# Regroup the training rows into 60 sequences of 2 rows each:
# the features become (60, 2, 4) and the one-hot labels become (60, 2, 3).
X_train = X_train.reshape(60, 2, 4)
y_train = y_train.reshape(60, 2, 3)
y_train.shape,X_train.shape

((60, 2, 3), (60, 2, 4))


 # Create the Neural Network Model
def create_nn_model():
  """Build and compile the LSTM classifier.

  The input shape is taken from the second and third dimensions of the
  module-level ``X_train``. Returns the compiled model.
  """
  timesteps = X_train.shape[1]
  n_features = X_train.shape[2]
  # One LSTM layer, one ReLU hidden layer, and a 3-way softmax output
  model = Sequential([
      LSTM(100, dropout=0.2, input_shape=(timesteps, n_features)),
      Dense(100, activation='relu'),
      Dense(3, activation='softmax'),
  ])
  model.compile(optimizer='adam',
                loss='categorical_crossentropy',
                metrics=['accuracy'])
  return model

model = create_nn_model()
model.summary()


> Model: "sequential_1"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
=================================================================
lstm_1 (LSTM)                (None, 100)               42000     
_________________________________________________________________
dense_2 (Dense)              (None, 100)               10100     
_________________________________________________________________
dense_3 (Dense)              (None, 3)                 303       
=================================================================
Total params: 52,403
Trainable params: 52,403
Non-trainable params: 0

model.fit(X_train,y_train,epochs=200,batch_size=5)


> ValueError                                Traceback (most recent call last)

<ipython-input-26-0aef33c299f0> in <module>()
----> 1 model.fit(X_train,y_train,epochs=200,batch_size=5) #X_train is independant variables. based on the amount of the data set data set will be trained by breaking into batches

9 frames

/usr/local/lib/python3.7/dist-packages/tensorflow/python/framework/func_graph.py in wrapper(*args, **kwargs)
    984           except Exception as e:  # pylint:disable=broad-except
    985             if hasattr(e, "ag_error_metadata"):
--> 986               raise e.ag_error_metadata.to_exception(e)
    987             else:
    988               raise

ValueError: in user code:

    /usr/local/lib/python3.7/dist-packages/keras/engine/training.py:830 train_function  *
        return step_function(self, iterator)
    /usr/local/lib/python3.7/dist-packages/keras/engine/training.py:813 run_step  *
        outputs = model.train_step(data)
    /usr/local/lib/python3.7/dist-packages/keras/engine/training.py:771 train_step  *
        loss = self.compiled_loss(
    /usr/local/lib/python3.7/dist-packages/keras/engine/compile_utils.py:201 __call__  *
        loss_value = loss_obj(y_t, y_p, sample_weight=sw)
    /usr/local/lib/python3.7/dist-packages/keras/losses.py:142 __call__  *
        losses = call_fn(y_true, y_pred)
    /usr/local/lib/python3.7/dist-packages/keras/losses.py:246 call  *
        return ag_fn(y_true, y_pred, **self._fn_kwargs)
    /usr/local/lib/python3.7/dist-packages/tensorflow/python/util/dispatch.py:206 wrapper  **
        return target(*args, **kwargs)
    /usr/local/lib/python3.7/dist-packages/keras/losses.py:1631 categorical_crossentropy
        y_true, y_pred, from_logits=from_logits)
    /usr/local/lib/python3.7/dist-packages/tensorflow/python/util/dispatch.py:206 wrapper
        return target(*args, **kwargs)
    /usr/local/lib/python3.7/dist-packages/keras/backend.py:4827 categorical_crossentropy
        target.shape.assert_is_compatible_with(output.shape)
    /usr/local/lib/python3.7/dist-packages/tensorflow/python/framework/tensor_shape.py:1161 assert_is_compatible_with
        raise ValueError("Shapes %s and %s are incompatible" % (self, other))

    ValueError: Shapes (5, 2, 3) and (5, 3) are incompatible

Solution

Your y_true and y_pred do not have the same shape. You may need to define your LSTM layer in the following way:

model.add(LSTM(100,dropout=0.2, input_shape=(2,4), return_sequences=True))
....

Model: "sequential_1"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
=================================================================
....
dense_3 (Dense)              (None, 2, 3)              303        < ---
=================================================================

Update

Using return_sequences=True works because you defined your training pairs this way:

X_train = X_train.reshape(60, 2, 4)
y_train = y_train.reshape(60, 2, 3)

which have the shape (batch_size, timestep, input_length). Note, however, that only the input needs to be reshaped to satisfy the input requirement of the LSTM layer in your model above, not y_train. When you define your model without return_sequences, the last layer outputs only the three class scores, with no timestep dimension, yet your y_train has one. If you set return_sequences to True and print your model summary, you will see that the last layer has an output shape of (None, 2, 3), which exactly matches the shape of y_train.

Before understanding what return_sequences does here, you may need to know what a timestep means in an LSTM model; check this answer. AFAIK, it depends on how many timesteps you set for your input: the LSTM cell can be applied a single time or multiple times (N timesteps). For timesteps n in {1, 2, 3, ..., N}, if I want the LSTM to return the output of every timestep (N outputs), I set return_sequences=True; otherwise I set return_sequences=False. From the docs:

return_sequences: Boolean. Whether to return the last output in the output sequence, or the full sequence. Default: False.

In short, if it is set to True, the outputs of all timesteps are returned; if it is False, only the last output is returned. For example:

inputs = tf.random.normal([32, 8])
inputs = tf.reshape(inputs, [-1, 2, 4 ]) # or [-1, 4, 2] # or [-1, 1, 8]
inputs.shape 
TensorShape([32, 2, 4]) # (batch_size, timestep, input_length)

lstm = tf.keras.layers.LSTM(10, return_sequences=True)
whole_seq_output = lstm(inputs)
print(whole_seq_output.shape)
(32, 2, 10) # (batch_size, timestep, output_length)

lstm = tf.keras.layers.LSTM(10, return_sequences=False)
last_seq_output = lstm(inputs)
print(last_seq_output.shape)
(32, 10) # (batch_size, output_length)

Here is one way to approach your code above. The Iris data was taken from here.

import pandas 
dataframe = pandas.read_csv("/content/iris.csv")
dataframe.head(3)

  sepal.length  sepal.width petal.length    petal.width   variety
0   5.1              3.5         1.4             0.2      Setosa
1   4.9              3.0         1.4             0.2      Setosa
2   4.7              3.2         1.3             0.2      Setosa

dataframe.variety.unique()
array(['Setosa', 'Versicolor', 'Virginica'], dtype=object)

target_map = dict(zip(list(dataframe['variety'].unique()), 
                     ([0, 1, 2])))
target_map
{'Setosa': 0, 'Versicolor': 1, 'Virginica': 2}

dataframe['target'] = dataframe.variety.map(target_map) 
dataframe.sample()
    sepal.length    sepal.width petal.length  petal.width   variety   target
128      6.4             2.8       5.6           2.1       Virginica    2

X = dataframe.iloc[:, :4] 
Y = dataframe.iloc[:, 5]

X.shape, Y.shape
((150, 4), (150,))

from tensorflow.keras.utils import to_categorical
from sklearn.model_selection import train_test_split

OHE_Y = to_categorical(Y, num_classes=3)
X_train, X_test, y_train, y_test = train_test_split(X, OHE_Y, 
                                                      test_size=0.2)

X_train.shape
(120, 4)

# make it lstm compatible input 
X_train = X_train.values.reshape(-1, 1, 4)

X_train.shape ,y_train.shape
((120, 1, 4), (120, 3))

Model

from tensorflow.keras import Sequential 
from tensorflow.keras.layers import LSTM, Dense 

def create_nn_model(input_shape=None, num_classes=3, lstm_units=100,
                    dense_units=100, dropout=0.2):
  """Build and compile an LSTM classifier with a softmax output.

  Args:
    input_shape: (timesteps, features) of one input sequence. When None,
      it is read from the module-level ``X_train`` (dimensions 1 and 2),
      so calling ``create_nn_model()`` with no arguments behaves as before.
    num_classes: number of units in the softmax output layer.
    lstm_units: number of units in the LSTM layer.
    dense_units: number of units in the hidden ReLU layer.
    dropout: dropout rate passed to the LSTM layer.

  Returns:
    The compiled Sequential model.
  """
  if input_shape is None:
    # Backward-compatible default: infer the shape from the training data
    input_shape = (X_train.shape[1], X_train.shape[2])

  model = Sequential()
  # return_sequences is left at its default, so only the last timestep's
  # output is passed on and the model emits one prediction per sequence
  model.add(LSTM(lstm_units, dropout=dropout, input_shape=input_shape))
  model.add(Dense(dense_units, activation='relu'))
  model.add(Dense(num_classes, activation='softmax'))
  model.compile(loss='categorical_crossentropy',
                optimizer='adam', metrics=['accuracy'])
  return model

model = create_nn_model()
model.summary()

model.fit(X_train, y_train, epochs=10,batch_size=5)

...
Epoch 9/10
3ms/step - loss: 0.5224 - accuracy: 0.7243
Epoch 10/10
3ms/step - loss: 0.5568 - accuracy: 0.7833

Inference

model.evaluate(X_train, y_train)
4ms/step - loss: 0.3843 - accuracy: 0.9583
[0.38432881236076355, 0.9583333134651184]

y_pred = model.predict(X_train).argmax(-1)
y_pred
array([2, 1, 1, 1, 1, 2, 2, 0, 1, 2, 2, 2, 0, 1, 1, 1, 0, 1, 0, 0, 2, 0,
       0, 2, 2, 0, 0, 2, 0, 0, 1, 0, 0, 1, 0, 2, 2, 0, 2, 2, 0, 2, 0, 0,
       1, 1, 2, 0, 1, 2, 1, 2, 0, 0, 2, 2, 2, 0, 0, 0, 2, 2, 2, 0, 0, 0,
       2, 2, 0, 2, 1, 0, 2, 1, 0, 0, 0, 1, 1, 1, 0, 2, 2, 1, 1, 0, 2, 0,
       0, 2, 1, 0, 2, 1, 1, 1, 1, 2, 1, 0, 1, 2, 1, 1, 2, 1, 1, 1, 2, 2,
       0, 1, 2, 1, 0, 0, 2, 1, 2, 0])