TimeSeries use case : How to plug an LSTM network (predictor) on top of a VAE network (denoiser)

I've been struggling for while coding the network in the picture below.

The use case is dedicated for time series:

First the VAE denoises the sequences of the timeseries, by learning the most common features.
And Second, the LSTM predicts next value of the sequences with which the VAE was also trained.

I'm struggling with building such a network, keeping two loss functions:

The VAE must keep its classical loss function: MSE + KL
The LSTM part must keep its MSE loss function.

From a dataset of 1677 time series, with 61440 time status each. I've downsampled all the timeseries with a rolling mean (to mitigate the 61440 features), and reshaped them with a sliding windows of 200 length, which gave me an InputShape of (989300, 1, 200), which is the shape of the samples entering the VAE (989300 sequences). The output of the network is the next time status of my sequence. For example, given a sequence of 200 length, the LSTM regressor part predicts the 201st status, ie the value coming right after this sequence.

My shapes (Xtrain, Xtest, ytrain, ytest):

((989300, 1, 200), (286897, 1, 200), (989300,), (286897,))

Here is my code. I know that it might be not that clean, I'm trying to make it work first.

My imports

import pandas as pd
import numpy as np
import tensorflow as tf
from tensorflow.keras.callbacks import ReduceLROnPlateau
from tensorflow import keras
from tensorflow.keras.models import Model
from tensorflow.keras import backend as K
from tensorflow.keras.layers import Dense, Lambda, TimeDistributed, Input, RepeatVector, LSTM

My loss function, combining VAE loss and LSTM loss, with a lambda parameter:

def vae_loss2_(input_x, decoder1, y_pred, z_log_sigma, z_mean, lambd):
    """ Calculate loss = reconstruction loss + KL loss for each data in minibatch """
    recon = K.sum(K.binary_crossentropy(input_x, decoder1))
    # D_KL(Q(z|X) || P(z|X)); calculate in closed form as both dist. are Gaussian
    kl = 0.5 * K.sum(K.exp(z_log_sigma) + K.square(z_mean) - 1. - z_log_sigma)
    
    lstm = tf.keras.losses.MSE(decoder1, y_pred)
    
    return (recon + kl) + lambd*lstm

My sampling function, with which the VAE samples from the latent space:

def sampling(args):
    z_mean, z_log_sigma = args
    latent_dim = 1
    batch_size = K.shape(z_mean)[0]
    epsilon = K.random_normal(shape=(batch_size, K.shape(z_mean)[1], latent_dim), mean=0., stddev=1.)
    return z_mean + z_log_sigma * epsilon

And finally, here all my code, with both networks:

latent_dim = 1
timesteps, features = 1, 200


# timesteps, features
input_x = Input(shape= (timesteps, features))

#Encoder
h1 = Dense(150, activation='relu', kernel_initializer='random_normal', bias_initializer='random_normal')(input_x)
h1 = Dense(100, activation='relu', kernel_initializer='random_normal', bias_initializer='random_normal')(h1)
h1 = Dense(50, activation='relu', kernel_initializer='random_normal', bias_initializer='random_normal')(h1)
h1 = Dense(20, activation='relu', kernel_initializer='random_normal', bias_initializer='random_normal')(h1)

#z_layer
z_mean = Dense(latent_dim)(h1)
z_log_sigma = Dense(latent_dim)(h1)

z = Lambda(sampling)([z_mean, z_log_sigma])

#Decoder
decoder1 = Dense(20, activation='relu', kernel_initializer='random_normal', bias_initializer='random_normal')(z)
decoder1 = Dense(50, activation='relu', kernel_initializer='random_normal', bias_initializer='random_normal')(decoder1)
decoder1 = Dense(100, activation='relu', kernel_initializer='random_normal', bias_initializer='random_normal')(decoder1)
decoder1 = Dense(150, activation='relu', kernel_initializer='random_normal', bias_initializer='random_normal')(decoder1)
decoder1 = TimeDistributed(Dense(features))(decoder1)

# LSTM network

lstm1 = LSTM(150, activation='relu', kernel_initializer='random_normal', bias_initializer='random_normal', return_sequences=True)(decoder1)
lstm1 = Dense(1)(lstm1)

finalModel = Model(input_x, lstm1)

finalModel.add_loss(vae_loss2_(input_x, decoder1, lstm1, z_log_sigma, z_mean, 0.2))

finalModel.compile(loss=None, optimizer='adam')


history = finalModel.fit(Xtrain_, ytrain_, epochs=70, batch_size = 2500, validation_data = (Xtest_,ytest_))

Executing, this code, raises the following error, as the fit step doesn't expect ytrain and ytest, for the next timestamp prediction:


WARNING:tensorflow:Output dense_593 missing from loss dictionary. We assume this was done on purpose. The fit and evaluate APIs will not be expecting any data to be passed to dense_593.
---------------------------------------------------------------------------
ValueError                                Traceback (most recent call last)
<ipython-input-207-47c2c28976f0> in <module>
     43 #a = np.load('atrain.npy')
     44 
---> 45 history = finalModel.fit(Xtrain_, ytrain_, epochs=70, batch_size = 2500, validation_data = (Xtest_,ytest_))

/opt/anaconda3/lib/python3.7/site-packages/tensorflow_core/python/keras/engine/training.py in fit(self, x, y, batch_size, epochs, verbose, callbacks, validation_split, validation_data, shuffle, class_weight, sample_weight, initial_epoch, steps_per_epoch, validation_steps, validation_freq, max_queue_size, workers, use_multiprocessing, **kwargs)
    726         max_queue_size=max_queue_size,
    727         workers=workers,
--> 728         use_multiprocessing=use_multiprocessing)
    729 
    730   def evaluate(self,

/opt/anaconda3/lib/python3.7/site-packages/tensorflow_core/python/keras/engine/training_v2.py in fit(self, model, x, y, batch_size, epochs, verbose, callbacks, validation_split, validation_data, shuffle, class_weight, sample_weight, initial_epoch, steps_per_epoch, validation_steps, validation_freq, **kwargs)
    222           validation_data=validation_data,
    223           validation_steps=validation_steps,
--> 224           distribution_strategy=strategy)
    225 
    226       total_samples = _get_total_number_of_samples(training_data_adapter)

/opt/anaconda3/lib/python3.7/site-packages/tensorflow_core/python/keras/engine/training_v2.py in _process_training_inputs(model, x, y, batch_size, epochs, sample_weights, class_weights, steps_per_epoch, validation_split, validation_data, validation_steps, shuffle, distribution_strategy, max_queue_size, workers, use_multiprocessing)
    545         max_queue_size=max_queue_size,
    546         workers=workers,
--> 547         use_multiprocessing=use_multiprocessing)
    548     val_adapter = None
    549     if validation_data:

/opt/anaconda3/lib/python3.7/site-packages/tensorflow_core/python/keras/engine/training_v2.py in _process_inputs(model, x, y, batch_size, epochs, sample_weights, class_weights, shuffle, steps, distribution_strategy, max_queue_size, workers, use_multiprocessing)
    592         batch_size=batch_size,
    593         check_steps=False,
--> 594         steps=steps)
    595   adapter = adapter_cls(
    596       x,

/opt/anaconda3/lib/python3.7/site-packages/tensorflow_core/python/keras/engine/training.py in _standardize_user_data(self, x, y, sample_weight, class_weight, batch_size, check_steps, steps_name, steps, validation_split, shuffle, extract_tensors_from_dataset)
   2517           shapes=None,
   2518           check_batch_axis=False,  # Don't enforce the batch size.
-> 2519           exception_prefix='target')
   2520 
   2521       # Generate sample-wise weight values given the `sample_weight` and

/opt/anaconda3/lib/python3.7/site-packages/tensorflow_core/python/keras/engine/training_utils.py in standardize_input_data(data, names, shapes, check_batch_axis, exception_prefix)
    487       raise ValueError(
    488           'Error when checking model ' + exception_prefix + ': '
--> 489           'expected no data, but got:', data)
    490     return []
    491   if data is None:

ValueError: ('Error when checking model target: expected no data, but got:', array([0.49538032, 0.55329189, 0.47183994, ..., 0.84650205, 0.89713042,
       0.87897429]))

Thank you very much for your help,

Solution

I solved my issue: with add_loss function, the model fit doesn't expect any ytrain or ytest, as it doesn't have a loss in compile function. Putting ytrain and ytest inside fit method, is compelling a loss function in form of loss_fn(ytrue, ypred): return MSE(ytrue, ypred), like the classic keras.losses.MSE