keras, transfer-learning

How to set up a base model in inference mode?


The Keras documentation on fine-tuning states that it is important to "keep the BatchNormalization layers in inference mode by passing training=False when calling the base model". (Interestingly, every non-official example I have found on the topic ignores this setting.)

The documentation follows up with this example:

from tensorflow import keras
from keras.applications.xception import Xception

base_model = keras.applications.Xception(
    weights='imagenet',  # Load weights pre-trained on ImageNet.
    input_shape=(150, 150, 3),
    include_top=False)  # Do not include the ImageNet classifier at the top.
base_model.trainable = False
inputs = keras.Input(shape=(150, 150, 3))
scale_layer = keras.layers.Rescaling(scale=1 / 127.5, offset=-1)
x = scale_layer(inputs)

# We make sure that the base_model is running in inference mode here,
# by passing `training=False`. This is important for fine-tuning, as you will
# learn in a few paragraphs.
x = base_model(x, training=False)

x = keras.layers.GlobalAveragePooling2D()(x)
outputs = keras.layers.Dense(1)(x)
model = keras.Model(inputs, outputs)

The thing is that the example adds preprocessing before the base model, while my model (EfficientNetB3) already has preprocessing included, and I don't know how to set my base_model with `training=False` without prepending an additional layer:

from keras.applications.efficientnet import EfficientNetB3
from keras.layers import Dense, Dropout, GlobalAveragePooling2D
from keras.models import Sequential

input_shape = (224, 224, 3)  # e.g.; the check further below also uses 224x224 inputs

base_model = EfficientNetB3(weights='imagenet', include_top=False, input_shape=input_shape)
base_model.trainable = False
model = Sequential()
model.add(base_model)  # How to set base_model training=False?
model.add(GlobalAveragePooling2D())
model.add(Dropout(0.2))
model.add(Dense(10, activation="softmax", name="classifier"))
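For comparison, here is a minimal sketch of the documentation's Functional-API pattern applied to EfficientNetB3, where training=False can be passed directly when calling the base model; since EfficientNet already includes its preprocessing, no Rescaling layer is prepended (the 224x224 input size and the 10-class head are just illustrative):

from tensorflow import keras
from keras.applications.efficientnet import EfficientNetB3

base_model = EfficientNetB3(weights='imagenet', include_top=False, input_shape=(224, 224, 3))
base_model.trainable = False

inputs = keras.Input(shape=(224, 224, 3))
# EfficientNet models normalize their inputs internally, so no Rescaling layer is needed here.
x = base_model(inputs, training=False)  # base model runs in inference mode
x = keras.layers.GlobalAveragePooling2D()(x)
x = keras.layers.Dropout(0.2)(x)
outputs = keras.layers.Dense(10, activation="softmax", name="classifier")(x)
model = keras.Model(inputs, outputs)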

How to prove that training=False or training=True has an effect:

@Frightera explained to me how to "lock" the model's state, and I wanted to prove to myself that the lock happens by checking the BatchNormalization non-trainable variables. My understanding is that if I call the model with training=True, then it should update those variables. However, this is not the case, or am I missing something?

import tensorflow as tf
from tensorflow import keras
from keras.applications.efficientnet import EfficientNetB3
import numpy as np


class WrappedEffNet(keras.layers.Layer):
    
    def __init__(self, **kwargs):
        super(WrappedEffNet, self).__init__(**kwargs)
        self.model = EfficientNetB3(weights='imagenet',
                                    include_top=False,
                                    input_shape=(224, 224, 3))
        self.model.trainable = False
    
    def call(self, x, training=False):
        return self.model(x, training=training) # Modified to pass also True.
    

base_model_wrapped = WrappedEffNet()

random_vector = tf.random.uniform((1, 224, 224, 3))

o1 = base_model_wrapped(random_vector)

o2 = base_model_wrapped(random_vector, training = False)

# Getting all non-trainable variable values from all BatchNormalization layers.
array_a = np.array([])
for layer in base_model_wrapped.model.layers:
    if hasattr(layer, 'moving_mean'):
        array_a = np.concatenate([array_a, layer.moving_mean.numpy()])
        array_a = np.concatenate([array_a, layer.moving_variance.numpy()])

o3 = base_model_wrapped(random_vector, training = True) # Changing to True, shouldn't this update BatchNormalization non-trainable variables?
array_b = np.array([])
for layer in base_model_wrapped.model.layers:
    if hasattr(layer, 'moving_mean'):
        array_b = np.concatenate([array_b, layer.moving_mean.numpy()])
        array_b = np.concatenate([array_b, layer.moving_variance.numpy()])

print(np.allclose(array_a, array_b)) # Shouldn't this be False?

Solution

  • It is not possible to pass training=False when invoking the base model inside a Sequential model, as you can with the Functional API. However, you can treat the base model as if it were a custom layer:

    class WrappedEffNet(tf.keras.layers.Layer):
        
        def __init__(self, **kwargs):
            super(WrappedEffNet, self).__init__(**kwargs)
            self.model = keras.applications.EfficientNetB3(weights='imagenet', 
                                                           include_top=False,
                                                           input_shape=(224, 224, 3))
            self.model.trainable = False

        def call(self, x, training=None):
            # `training` is deliberately ignored: the wrapped model always runs in inference mode.
            return self.model(x, training=False)
    

    Sanity check:

    base_model_wrapped = WrappedEffNet()
    
    random_vector = tf.random.uniform((1, 224, 224, 3))
    
    o1 = base_model_wrapped(random_vector)
    o2 = base_model_wrapped(random_vector, training = False)
    o3 = base_model_wrapped(random_vector, training = True)
    
    np.allclose(o1, o2), np.allclose(o1, o3), np.allclose(o2, o3)
    # (True, True, True)
    

    The base model runs in inference mode regardless of the value of training.
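    To use it, drop the wrapper into a Sequential model just as in the question (a minimal sketch; calling the model once on random_vector builds it so that summary() can show concrete shapes):

    model = keras.Sequential([
        base_model_wrapped,  # EfficientNetB3 locked in inference mode
        keras.layers.GlobalAveragePooling2D(),
        keras.layers.Dropout(0.2),
        keras.layers.Dense(10, activation="softmax", name="classifier"),
    ])
    model(random_vector)  # build the model with a concrete batch
    model.summary()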

    The resulting model summary is the same as with the question's Sequential model:

     Layer (type)                Output Shape              Param #   
    =================================================================
     wrapped_eff_net (WrappedEff  (1, 7, 7, 1536)          10783535  
     Net)                                                            
                                                                     
     global_average_pooling2d (G  (1, 1536)                0         
     lobalAveragePooling2D)                                          
                                                                     
     dropout (Dropout)           (1, 1536)                 0         
                                                                     
     classifier (Dense)          (1, 10)                   15370     
                                                                     
    =================================================================
    Total params: 10,798,905
    Trainable params: 15,370
    Non-trainable params: 10,783,535
    _________________________________________________________________
    

    Edit: To see how BatchNormalization behaves depending on trainable and training:

    import tensorflow as tf
    import numpy as np
    
    x = np.random.randn(1, 2) * 20 + 0.1
    
    bn = tf.keras.layers.BatchNormalization()
    input_layer = tf.keras.layers.Input((x.shape[-1], ))
    output = bn(input_layer)
    
    model = tf.keras.Model(inputs=input_layer, outputs=output)
    

    model.trainable = False:

    model.trainable = False
    for i in range(2):
        print('Input:', x)
        print('Moving mean:', model.layers[1].moving_mean.numpy())
        print('training = True -->', model(x, training = True).numpy())
        print('training = False -->', model(x, training = False).numpy())
        print()
    
    Input: [[ 2.50317905 12.44406219]]
    Moving mean: [0. 0.]
    training = True --> [[ 2.5019286 12.437845 ]]
    training = False --> [[ 2.5019286 12.437845 ]]
    
    Input: [[ 2.50317905 12.44406219]]
    Moving mean: [0. 0.]
    training = True --> [[ 2.5019286 12.437845 ]]
    training = False --> [[ 2.5019286 12.437845 ]]
    

    model.trainable = True, training = True (with a single-sample batch, the layer normalizes the batch to zeros, while each call nudges the moving statistics toward the batch statistics):

    model.trainable = True
    for i in range(2):
        print('Input:', x)
        print('Moving mean:', model.layers[1].moving_mean.numpy())
        print('training = True -->', model(x, training = True).numpy())
        print()
    
    Input: [[ 2.50317905 12.44406219]]
    Moving mean: [0. 0.]
    training = True --> [[0. 0.]]
    
    Input: [[ 2.50317905 12.44406219]]
    Moving mean: [0.02503179 0.12444062]
    training = True --> [[0. 0.]]
    

    model.trainable = True, training = False (the layer normalizes with the stored moving statistics and no longer updates them):

    model.trainable = True
    for i in range(2):
        print('Input:', x)
        print('Moving mean:', model.layers[1].moving_mean.numpy())
        print('training = False -->', model(x, training = False).numpy())
        print()
    
    Input: [[ 2.50317905 12.44406219]]
    Moving mean: [0.04981326 0.24763682]
    training = False --> [[ 2.476884 12.313342]]
    
    Input: [[ 2.50317905 12.44406219]]
    Moving mean: [0.04981326 0.24763682]
    training = False --> [[ 2.476884 12.313342]]
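
    Applying this back to the question's check: as long as the wrapped model is non-trainable (and the wrapper always forwards training=False), the BatchNormalization statistics never change, so the comparison stays True even when the wrapper is called with training=True. A small sketch, with the np.concatenate results actually assigned this time (bn_stats is just an illustrative helper, reusing base_model_wrapped and random_vector from above):

    def bn_stats(model):
        # Collect moving_mean / moving_variance from every BatchNormalization layer.
        values = []
        for layer in model.layers:
            if hasattr(layer, 'moving_mean'):
                values.append(layer.moving_mean.numpy())
                values.append(layer.moving_variance.numpy())
        return np.concatenate(values)

    before = bn_stats(base_model_wrapped.model)
    _ = base_model_wrapped(random_vector, training=True)
    after = bn_stats(base_model_wrapped.model)
    print(np.allclose(before, after))  # True: the statistics were not updated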