TF2.3 - More model outputs than targets

I am trying to write a model with three outputs: the last two are to be trained against targets present in the dataset, while the first should just be a non-trainable output.

First, defining a dataset:

from typing import Tuple

import numpy as np
import tensorflow as tf

def ds_fn(in_shape: Tuple[int],
          out_shape: Tuple[int],
          dtype: tf.DType = tf.float32) -> tf.data.Dataset:
  """Build a toy dataset of constant `(inputs, targets)` pairs.

  The generator produces 1000 elements. Each element is
  `((x0, x1), (y1, y2))`, where `x0`/`x1` have shape `in_shape` and are
  filled with 1 and 2, and `y1`/`y2` have shape `out_shape` and are filled
  with 2 and 3.

  Args:
    in_shape: Shape of each of the two input arrays.
    out_shape: Shape of each of the two target arrays.
    dtype: TensorFlow dtype declared for every component of the dataset.

  Returns:
    A `tf.data.Dataset` built from the Python generator.
  """
  # Declared structure of one element: two inputs, then two targets.
  element_types = ((dtype, dtype), (dtype, dtype))
  element_shapes = ((in_shape, in_shape), (out_shape, out_shape))

  def gen() -> Tuple[Tuple[np.array], Tuple[np.array]]:
    for _ in range(1000):
      ones_in = np.ones(in_shape, dtype=np.float32)
      ones_out = np.ones(out_shape, dtype=np.float32)

      features = (ones_in, 2 * ones_in)

      # Only two targets are produced: they are meant for outputs 1 and 2
      # of the network. Output 0 has no target.
      targets = (2 * ones_out, 3 * ones_out)

      yield features, targets

  return tf.data.Dataset.from_generator(gen,
                                        output_types=element_types,
                                        output_shapes=element_shapes)

In the above, the dataset has two targets, corresponding to the last two outputs of the model defined below.

from tensorflow import keras

class ExampleModel(keras.Model):
  """Two-input model that returns three outputs derived from one tensor.

  Each input goes through its own `Dense(out_dim)` layer; the two results
  are averaged, and the model returns that average together with 2x and 3x
  multiples of it.
  """

  def __init__(self, out_dim: int):
    super().__init__()

    self.dense_a = keras.layers.Dense(out_dim)
    self.dense_b = keras.layers.Dense(out_dim)

  def call(self, inputs, training=False):
    """Return the tuple `(mean, 2 * mean, 3 * mean)`.

    Args:
      inputs: A pair `(a, b)`; `a` is fed to `dense_a`, `b` to `dense_b`.
      training: Accepted for Keras compatibility; not used here.
    """
    first, second = inputs

    mean = (self.dense_a(first) + self.dense_b(second)) / 2

    # Output 0 (the plain mean) should not be trained; outputs 1 and 2 are
    # the ones intended to be matched against dataset targets.
    return mean, 2 * mean, 3 * mean

From reading the Keras documentation, to handle this case where there are a greater number of model outputs than there are targets in the dataset (with the surplus outputs considered to be non-trainable), it appears that the loss_weights argument of Model.compile should handle the matching between targets and losses. More concretely, consider the following:

def model_fn(out_dim: int) -> ExampleModel:
  """Create an `ExampleModel` and compile it with per-output list losses.

  The loss and loss-weight lists have one entry per model output, in output
  order. Output 0 gets no loss (`None`) and a weight of 0 because it should
  not be trainable; outputs 1 and 2 use MSE with weight 1.

  Args:
    out_dim: Number of units for the model's dense layers.

  Returns:
    The compiled model.
  """
  model = ExampleModel(out_dim)

  model.compile(
      optimizer='sgd',
      loss=[None, 'mse', 'mse'],  # Output 0 - should not be trainable.
      loss_weights=[0, 1, 1])     # Output 0 - should not be trainable.

  return model

I would expect that Keras would disregard the first model output when computing the loss, given the None loss provided and the 0 loss weight, however I am seeing the following error.

ValueError: The two structures don't have the same sequence length. Input structure has length 2, while shallow structure has length 3.

Which seems to indicate that this is not the case when run as follows.

if __name__ == "__main__":
  bs = 16
  in_dim = 4
  out_dim = 8
  epochs = 10

  # Every generated element already carries a leading dimension of size
  # `bs`, so the dataset is effectively pre-batched.
  ds = ds_fn((bs, 1, in_dim), (bs, 1, out_dim))
  ds = ds.repeat(epochs)

  m = model_fn(out_dim)

  # `batch_size` is deliberately not passed: the Keras `Model.fit`
  # documentation says not to specify it for dataset inputs, since a
  # dataset yields its own batches.
  m.fit(ds,
        epochs=epochs)

If I provide an additional target in ds_fn and gen, combined with a dummy loss (lambda x, y: 0.0, for example), then training commences. However, this will not scale to a non-toy problem with potentially large outputs and targets (images, for example).

If I instead return a dict from the model's call method and provide dicts for losses and loss_weights (both with keys matching those returned from call), there is no change (I thought that the explicit output naming might allow Keras to match outputs, losses and targets).

Am I misunderstanding the intended purpose of lists as losses (in which None is allowed) and loss_weights?

Solution

One way is to use dictionaries for both the dataset and the model outputs, and then use the keys of those dictionaries to control the loss, metrics, and related parameters. Here is how it can be done.

Data

def ds_fn(
    in_shape: Tuple[int],
    out_shape: Tuple[int],
    dtype: tf.DType = tf.float32
) -> tf.data.Dataset:
    """Build a toy dataset whose inputs and targets are dicts keyed by name.

    The generator produces 1000 elements. Each element is a pair
    `(inputs, targets)`:

    * `inputs` has keys `'input_a'` and `'input_b'`, arrays of shape
      `in_shape` filled with 1 and 2.
    * `targets` has keys `'output_b'` and `'output_c'`, arrays of shape
      `out_shape` filled with 2 and 3.

    The target keys name the model outputs that are trained (`output_b` is
    `2 * x` and `output_c` is `3 * x` in the model). There is intentionally
    no `'output_a'` target, so that output has nothing to be fit against.

    Args:
        in_shape: Shape of each input array.
        out_shape: Shape of each target array.
        dtype: TensorFlow dtype declared for every component of the dataset.

    Returns:
        A `tf.data.Dataset` built from the Python generator.
    """
    # Generator function.
    def gen():
        for _ in range(1000):
            # Inputs.
            x0 = np.ones(in_shape, dtype=np.float32)
            x1 = 2 * x0

            # Outputs.
            y0 = np.ones(out_shape, dtype=np.float32)
            y1 = 2 * np.ones_like(y0)
            y2 = 3 * np.ones_like(y1)

            # Targets correspond to outputs 1 and 2 of the network
            # ('output_b' and 'output_c'). Output 0 ('output_a') has no
            # target.
            yield (
                {'input_a': x0, 'input_b': x1},
                {'output_b': y1, 'output_c': y2},
            )

    return tf.data.Dataset.from_generator(
        gen,
        output_types=(
            {'input_a': dtype, 'input_b': dtype},
            {'output_b': dtype, 'output_c': dtype}
        ),
        output_shapes=(
            {'input_a': in_shape, 'input_b': in_shape},
            {'output_b': out_shape, 'output_c': out_shape}
        )
    )

Model

class ExampleModel(keras.Model):
    """Two-input model returning three named outputs derived from one tensor.

    Each named input goes through its own `Dense(out_dim)` layer; the two
    results are averaged, and the model returns that average and its 2x and
    3x multiples under the keys `output_a`, `output_b` and `output_c`.
    """

    def __init__(self, out_dim: int):
        super().__init__()
        self.dense_a = keras.layers.Dense(out_dim)
        self.dense_b = keras.layers.Dense(out_dim)

    def call(self, inputs, training=False):
        """Return a dict of the three outputs.

        Args:
            inputs: Mapping with keys `'input_a'` and `'input_b'`.
            training: Accepted for Keras compatibility; not used here.
        """
        branch_a = self.dense_a(inputs['input_a'])
        branch_b = self.dense_b(inputs['input_b'])
        mean = (branch_a + branch_b) / 2

        outputs = {}
        outputs["output_a"] = mean
        outputs["output_b"] = 2 * mean
        outputs["output_c"] = 3 * mean
        return outputs

Compile

def model_fn(out_dim: int) -> ExampleModel:
    """Create an `ExampleModel` and compile it with losses keyed by output.

    Only `output_b` and `output_c` receive a loss; `output_a` is left out of
    both dicts so it does not contribute to the training objective. The
    dataset must supply its targets under the same two keys for them to be
    matched to these losses.

    Args:
        out_dim: Number of units for the model's dense layers.

    Returns:
        The compiled model.
    """
    m = ExampleModel(out_dim)

    losses = {
        "output_b": 'mse',
        "output_c": 'mse'
    }

    # Weights are keyed by the same outputs that have a loss; a weight for
    # an output without a loss would have nothing to scale.
    loss_weights = {
        "output_b": 1,
        "output_c": 1
    }

    m.compile(
        loss=losses,
        loss_weights=loss_weights,
        optimizer='sgd'
    )

    return m

Run

# Build and compile the model, then train it on the dict-keyed dataset.
m = model_fn(out_dim)

# `batch_size` is deliberately not passed: the Keras `Model.fit`
# documentation says not to specify it for dataset inputs, since a dataset
# yields its own batches.
m.fit(
    ds,
    epochs=epochs
)

loss: 3.2685e-13 - output_b_loss: 3.2685e-13 - output_c_loss: 0.0000e+00

Additional Resource