Tags: python, machine-learning, keras, data-science, lstm

How to migrate from keras fit_generator() to fit() properly?


I have two datasets and a weight array (train_X, train_Y, validation_X, validation_Y, and sampleW). The X sets are 3-dimensional and the Y sets are 2-dimensional NumPy arrays; sampleW is a 1-dimensional NumPy array.

How do I successfully migrate from fit_generator() to the fit() function?

In terms of:

  • are " fit(x=None, y=None," for train_X, train_Y?
  • how to pass validation data seperately? (validation_X, validation_Y)
  • can I pass sampleW the same way as before?
  • how to train piecewise data on fit()?
  • most importantly: how to do this without generator?

This is a minimal reproducible example (I am currently struggling to find out why any batch size other than 1 gives an error, although batch sizes >1 should also be usable):

# -*- coding: utf-8 -*-
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense,Dropout,LSTM,BatchNormalization
import tensorflow as tf, numpy as np; from tensorflow.keras.callbacks import TensorBoard, ModelCheckpoint 
tensorboard_path= r"C:\Users\user\documents\session"  # <--- your path
checkpoint_path = tensorboard_path 

BATCH_SIZE = 1
EPOCHS, Input_shape, labels =  3, (20,4),6
train_X,train_Y = np.asarray([np.random.random(Input_shape) for x in range(100)]), np.random.random((100,labels))
validation_X,validation_Y = np.asarray([np.random.random(Input_shape) for x in range(50)]), np.random.random((50,labels))
sampleW = np.random.random((100,1)) 

class CustomGenerator_SampleW(tf.keras.utils.Sequence) :
    def __init__(self, list_x, labels, batch_size, sample_weights=None) : 
        self.labels         = labels
        self.batch_size     = batch_size
        self.list_x         = list_x
        self.sample_weights = sample_weights
        
    def __len__(self) :
        return int(np.ceil(len(self.list_x) / float(self.batch_size)))  # np.int was removed from NumPy; use the builtin int
    def __getitem__(self, idx) :
        batch_x      = self.list_x[idx * self.batch_size : (idx+1) * self.batch_size]
        batch_y      = self.labels[idx * self.batch_size : (idx+1) * self.batch_size]
        batch_weight = self.sample_weights[idx * self.batch_size : (idx+1) * self.batch_size]
        return np.array(batch_x),np.array(batch_y), np.array(batch_weight)

class CustomGenerator(tf.keras.utils.Sequence) :
    def __init__(self, list_x, labels, batch_size) : 
        self.labels         = labels
        self.batch_size     = batch_size
        self.list_x         = list_x 
        
    def __len__(self) :
        return int(np.ceil(len(self.list_x) / float(self.batch_size)))  # np.int was removed from NumPy; use the builtin int
    def __getitem__(self, idx) :
        batch_x      = self.list_x[idx * self.batch_size : (idx+1) * self.batch_size]
        batch_y      = self.labels[idx * self.batch_size : (idx+1) * self.batch_size] 
        return np.array(batch_x),np.array(batch_y)
 

model = Sequential()
model.add(LSTM(242, input_shape=Input_shape, return_sequences=True))
model.add(Dropout(0.3)); model.add(BatchNormalization())  

model.add(LSTM(242, return_sequences=True))
model.add(Dropout(0.3)); model.add(BatchNormalization())

model.add(Dense(labels, activation='tanh')); model.add(Dropout(0.3))

opt = tf.keras.optimizers.Adam(lr=0.001, decay=1e-6)
model.compile(loss='mean_absolute_error',optimizer=opt,metrics=['mse'])

if sampleW is not None:
    train_batch_gen   = CustomGenerator_SampleW(train_X, train_Y, BATCH_SIZE, sample_weights=sampleW)
else: train_batch_gen = CustomGenerator(train_X, train_Y, BATCH_SIZE)
validation_batch_gen  = CustomGenerator(validation_X, validation_Y, BATCH_SIZE)

tensorboard = TensorBoard(tensorboard_path)
checkpoint = ModelCheckpoint(checkpoint_path, monitor='val_loss', verbose=1, save_best_only=True, mode='min') 

model.fit_generator(train_batch_gen, steps_per_epoch=None,  epochs=EPOCHS, 
                    validation_data = validation_batch_gen, callbacks=[tensorboard,checkpoint]) 

Solution

  • This is due to a shape mismatch between your model output and the labels you provide.

    Model architecture:

    [model plot omitted: the final layer's output shape is (None, 20, 6)]

    As you can see, the output shape of your model is (batch_size, 20, 6), while the shape of your labels is (batch_size, 6); these are not compatible.
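
    If the plot image is not available, the same mismatch can be confirmed from the model itself (a quick check using the standard Keras attributes):

    model.summary()              # the last row of the summary shows the output shape (None, 20, 6)
    print(model.output_shape)    # -> (None, 20, 6)
    print(train_Y.shape)         # -> (100, 6), i.e. one 6-dimensional label per sample, no time axis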

    Why was this working for batch_size = 1?
    This is because TensorFlow was using a technique called broadcasting. For example:

    x = np.ones(shape = (1,20,6))
    array([[[1., 1., 1., 1., 1., 1.],
            [1., 1., 1., 1., 1., 1.],
            [1., 1., 1., 1., 1., 1.],
            [1., 1., 1., 1., 1., 1.],
            [1., 1., 1., 1., 1., 1.],
            [1., 1., 1., 1., 1., 1.],
            [1., 1., 1., 1., 1., 1.],
            [1., 1., 1., 1., 1., 1.],
            [1., 1., 1., 1., 1., 1.],
            [1., 1., 1., 1., 1., 1.],
            [1., 1., 1., 1., 1., 1.],
            [1., 1., 1., 1., 1., 1.],
            [1., 1., 1., 1., 1., 1.],
            [1., 1., 1., 1., 1., 1.],
            [1., 1., 1., 1., 1., 1.],
            [1., 1., 1., 1., 1., 1.],
            [1., 1., 1., 1., 1., 1.],
            [1., 1., 1., 1., 1., 1.],
            [1., 1., 1., 1., 1., 1.],
            [1., 1., 1., 1., 1., 1.]]])
    
    
    y = np.ones(shape = (1,6))
    array([[1., 1., 1., 1., 1., 1.]])
    
    
    y-x
    array([[[0., 0., 0., 0., 0., 0.],
            [0., 0., 0., 0., 0., 0.],
            [0., 0., 0., 0., 0., 0.],
            [0., 0., 0., 0., 0., 0.],
            [0., 0., 0., 0., 0., 0.],
            [0., 0., 0., 0., 0., 0.],
            [0., 0., 0., 0., 0., 0.],
            [0., 0., 0., 0., 0., 0.],
            [0., 0., 0., 0., 0., 0.],
            [0., 0., 0., 0., 0., 0.],
            [0., 0., 0., 0., 0., 0.],
            [0., 0., 0., 0., 0., 0.],
            [0., 0., 0., 0., 0., 0.],
            [0., 0., 0., 0., 0., 0.],
            [0., 0., 0., 0., 0., 0.],
            [0., 0., 0., 0., 0., 0.],
            [0., 0., 0., 0., 0., 0.],
            [0., 0., 0., 0., 0., 0.],
            [0., 0., 0., 0., 0., 0.],
            [0., 0., 0., 0., 0., 0.]]])
    

    See the NumPy broadcasting documentation for more information.

    But broadcasting was no longer possible when you used batch_size = 10.

    Code:

    x = np.ones(shape = (10,20,6))
    y = np.ones(shape = (10,6))
    y-x
    

    Output:

    ---------------------------------------------------------------------------
    ValueError                                Traceback (most recent call last)
    <ipython-input-102-4a65323a80fa> in <module>
          1 x = np.ones(shape = (10,20,6))
          2 y = np.ones(shape = (10,6))
    ----> 3 y-x
    
    ValueError: operands could not be broadcast together with shapes (10,6) (10,20,6)
    

    The output shape of your model can be fixed by adding a Flatten layer after the last LSTM layer, which converts the 2-D (timesteps, features) output into a 1-D vector before the final Dense layer.

    Code:

    from tensorflow.keras.layers import Flatten  # Flatten was not imported in the original script

    model = Sequential()
    model.add(LSTM(242, input_shape=Input_shape, return_sequences=True))
    model.add(Dropout(0.3)); model.add(BatchNormalization())  
    
    model.add(LSTM(242, return_sequences=True))
    model.add(Dropout(0.3)); model.add(BatchNormalization())
    model.add(Flatten())
    model.add(Dropout(0.3))
    model.add(Dense(labels, activation='tanh')) 
    
    opt = tf.keras.optimizers.Adam(lr=0.001, decay=1e-6)
    model.compile(loss='mean_absolute_error',optimizer=opt,metrics=['mse'])
    tf.keras.utils.plot_model(model, 'my_first_model.png', show_shapes=True)
    

    Model architecture:

    [model plot omitted: after the Flatten layer, the final Dense output shape is (None, 6)]

    Finally, using model.fit():

    model.fit(train_batch_gen, epochs=EPOCHS, validation_data = validation_batch_gen) 
    

    Output:

    Epoch 1/3
    2/2 [==============================] - 1s 708ms/step - loss: 0.2891 - mse: 0.5739 - val_loss: 0.4078 - val_mse: 0.2461
    Epoch 2/3
    2/2 [==============================] - 0s 46ms/step - loss: 0.2229 - mse: 0.3151 - val_loss: 0.3867 - val_mse: 0.2225
    Epoch 3/3
    2/2 [==============================] - 0s 49ms/step - loss: 0.2315 - mse: 0.3341 - val_loss: 0.3813 - val_mse: 0.2161
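
    To answer the remaining migration questions (validation data, sample weights, and dropping the generator entirely): fit() also accepts plain NumPy arrays. A minimal sketch, assuming the fixed model above and the arrays defined in the question:

    history = model.fit(
        x=train_X,                                     # 3-D inputs, shape (100, 20, 4)
        y=train_Y,                                     # 2-D labels, shape (100, 6)
        sample_weight=sampleW.flatten(),               # per-sample weights as a 1-D array of length 100
        validation_data=(validation_X, validation_Y),  # validation set passed as a tuple
        batch_size=10,                                 # fit() handles the batching itself
        epochs=EPOCHS,
        callbacks=[tensorboard, checkpoint],
    )

    The Sequence generators are only needed when the data does not fit in memory; if you do pass a Sequence to fit(), omit batch_size, since batching already happens inside the generator.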