Loss function for a Siamese neural network

I'm trying to train Siamese neural networks for face recognition. Many resources use this function as a loss function:

def contrastive_loss(y_true, y_pred):
    """Contrastive loss over a batch of predicted pair distances.

    Entries with ``y_true == 1`` contribute the squared distance; entries
    with ``y_true == 0`` contribute the squared shortfall of the distance
    below a fixed margin of 1. The result is the mean over all entries.
    """
    margin = 1
    # Term that grows with the distance itself.
    pull_term = y_true * K.square(y_pred)
    # Term that is non-zero only while the distance is still inside the margin.
    shortfall = K.maximum(margin - y_pred, 0)
    push_term = (1 - y_true) * K.square(shortfall)
    return K.mean(pull_term + push_term)

I train several neural networks with different architectures, and for some of them this loss function does not work correctly (it returns NaN). Because of this, the network does not train at all.

My code:
from keras.models import Sequential, Model
from keras.layers import Input, Conv2D, MaxPooling2D, Dense, Dropout, Flatten, Lambda, BatchNormalization, Activation
from keras.optimizers import RMSprop
from keras import backend as K

def euclidean_distance(vects):
    """Return the Euclidean distance between two batches of vectors.

    Args:
        vects: A pair ``(x, y)`` of tensors. Squared differences are summed
            over axis 1.

    Returns:
        A tensor of distances in which axis 1 is kept with size 1.
    """
    x, y = vects
    sum_square = K.sum(K.square(x - y), axis=1, keepdims=True)
    # The derivative of sqrt is infinite at 0, so two identical embeddings
    # would inject inf/NaN into the gradients. Clamping to epsilon keeps the
    # distance (and its gradient) finite.
    return K.sqrt(K.maximum(sum_square, K.epsilon()))

def eucl_dist_output_shape(shapes):
    """Compute the output shape of the distance layer.

    Args:
        shapes: A pair of input shapes; only the first entry of the first
            shape is used.

    Returns:
        A 2-tuple ``(first_dim, 1)``.
    """
    first_shape, _second_shape = shapes
    leading_dim = first_shape[0]
    return (leading_dim, 1)

def accuracy(y_true, y_pred):
    """Fraction of entries whose thresholded prediction equals the label.

    A predicted value below 0.5 is mapped to 1 and anything else to 0
    (cast to the dtype of ``y_true``) before comparing with ``y_true``.
    """
    thresholded = K.cast(y_pred < 0.5, y_true.dtype)
    hits = K.equal(y_true, thresholded)
    return K.mean(hits)

def TestModel(input_shape):
    """Build a small convolutional embedding network.

    Args:
        input_shape: Shape of a single input image, passed to the first
            convolution layer.

    Returns:
        A ``Sequential`` model ending in a 128-unit dense layer.
    """
    model = Sequential()
    model.add(Conv2D(filters=96, kernel_size=3, strides=3, activation='relu', input_shape=input_shape, padding='valid'))
    model.add(Conv2D(filters=256, kernel_size=3, strides=3, activation='relu', padding='valid'))
    # Dense only acts on the last axis, so without flattening the output would
    # stay 4-D and the distance layer (which sums over axis 1) would not
    # produce one distance per pair.
    model.add(Flatten())
    model.add(Dense(512, activation='relu'))
    model.add(Dense(128, activation='relu'))
    return model

def Net_Definition(input_shape):
    """Build the larger convolutional embedding network.

    Three convolution + max-pooling stages are followed by two 512-unit dense
    layers and a 128-unit output layer.

    Args:
        input_shape: Shape of a single input image, passed to the first
            convolution layer.

    Returns:
        A ``Sequential`` model producing a 128-dimensional vector per image.
    """
    model = Sequential()
    model.add(Conv2D(filters=96, kernel_size=7, strides=4, activation='relu', padding='valid', input_shape=input_shape))
    model.add(MaxPooling2D(pool_size=3, strides=2, padding='valid'))
    model.add(Conv2D(filters=256, kernel_size=5, strides=1, activation='relu', padding='same'))
    model.add(MaxPooling2D(pool_size=3, strides=2, padding='valid'))
    model.add(Conv2D(filters=384, kernel_size=3, strides=1, activation='relu', padding='same'))
    model.add(MaxPooling2D(pool_size=3, strides=2, padding='valid'))
    # Dense only acts on the last axis, so the feature maps must be flattened
    # first; otherwise the output stays 4-D and the distance layer (which sums
    # over axis 1) would not produce one distance per pair.
    model.add(Flatten())
    model.add(Dense(512, activation='relu'))
    model.add(Dense(512, activation='relu'))
    # NOTE(review): softmax constrains the embedding to sum to 1, which is
    # unusual for a distance-based embedding -- confirm this is intended.
    model.add(Dense(128, activation='softmax'))
    return model

def CreateModel(name, input_shape):
    """Build and compile a Siamese model around the selected base network.

    Both inputs are passed through the same base network (shared weights) and
    the model outputs the Euclidean distance between the two embeddings. The
    model is compiled with ``contrastive_loss``, RMSprop and the ``accuracy``
    metric.

    Args:
        name: ``'test'`` for ``TestModel`` or ``'net_definition'`` for
            ``Net_Definition``.
        input_shape: Shape of a single input image.

    Returns:
        The compiled two-input Keras ``Model``.

    Raises:
        ValueError: If ``name`` is not one of the supported model names.
    """
    # The shared base network is still published as a module-level global so
    # that any code reading ``network`` after this call keeps working.
    global network
    if name == 'test':
        network = TestModel(input_shape)
    elif name == 'net_definition':
        network = Net_Definition(input_shape)
    else:
        # Previously the selection was silently overwritten with
        # Net_Definition; fail loudly instead of training the wrong model.
        raise ValueError(f'Invalid model name! Got {name!r}')

    input_a = Input(shape=input_shape)
    input_b = Input(shape=input_shape)
    # Calling the same model object twice shares its weights between branches.
    processed_a = network(input_a)
    processed_b = network(input_b)

    distance = Lambda(euclidean_distance, output_shape=eucl_dist_output_shape)([processed_a, processed_b])
    model = Model(inputs=[input_a, input_b], outputs=distance)

    opt = RMSprop()
    model.compile(loss=contrastive_loss, optimizer=opt, metrics=[accuracy])
    return model
from keras.utils import Sequence
import numpy as np
import Models
from keras.callbacks import CSVLogger

class MyGenerator(Sequence):
    """Keras ``Sequence`` that loads image pairs from disk one batch at a time."""

    def __init__(self, filenames, labels, batch_size):
        """Store the file list, the matching labels and the batch size.

        Args:
            filenames: Sequence of paths readable by ``np.load``.
            labels: Sequence of labels aligned with ``filenames``.
            batch_size: Number of pairs per batch.
        """
        self.filenames = filenames
        self.labels = labels
        self.batch_size = batch_size

    def __len__(self):
        """Return the number of batches, counting a final partial batch."""
        # Return a plain int rather than a NumPy scalar.
        return int(np.ceil(len(self.filenames) / float(self.batch_size)))

    def __getitem__(self, item):
        """Load batch number ``item``.

        Returns:
            ``((x1, x2), y)`` where ``x1``/``x2`` hold the two members of each
            pair and ``y`` holds the labels as float32.
        """
        start = item * self.batch_size
        stop = (item + 1) * self.batch_size
        batch_x = self.filenames[start:stop]
        batch_y = self.labels[start:stop]
        x1 = []
        x2 = []
        for path in batch_x:
            pair = np.load(path).astype(np.float32)
            # Assumes each file stores the two images of a pair stacked along
            # the first axis -- TODO confirm against the dataset writer.
            x1.append(pair[0])
            x2.append(pair[1])
        x1 = np.asarray(x1)
        x2 = np.asarray(x2)
        return (x1, x2), np.array(batch_y).astype(np.float32)

# path_to_folder = 'Datasets/test/pairs/224/'
path_to_folder = 'Datasets/6. Pairs/224/'
input_shape = (224, 224, 3)
batch_size = 128

# NOTE(review): the lines that populated x_train / y_train / x_val / y_val were
# missing. The reads below assume one pair-file path per line in the X files
# and one numeric label per line in the Y files -- confirm against the data.
# ``with`` guarantees the files are closed once they have been read.
with open(path_to_folder + 'X_Train.txt', 'r') as x_train_file:
    x_train = x_train_file.read().splitlines()
with open(path_to_folder + 'Y_Train.txt', 'r') as y_train_file:
    y_train = [float(value) for value in y_train_file.read().split()]
with open(path_to_folder + 'X_Val.txt', 'r') as x_val_file:
    x_val = x_val_file.read().splitlines()
with open(path_to_folder + 'Y_Val.txt', 'r') as y_val_file:
    y_val = [float(value) for value in y_val_file.read().split()]

# Writes the per-epoch metrics to 'logs.log'.
csv_logger = CSVLogger('logs.log')

train_generator = MyGenerator(x_train, y_train, batch_size)
val_generator = MyGenerator(x_val, y_val, batch_size)

model = Models.CreateModel('test', input_shape)
# ``Model.fit`` accepts a ``Sequence`` directly in Keras >= 2.3; on older
# releases use ``model.fit_generator`` with the same arguments.
history = model.fit(train_generator, epochs=10, verbose=1, validation_data=val_generator, callbacks=[csv_logger])

With TestModel everything works fine, but with Net_Definition the loss is NaN. (Linked results: TestModel, Net_definition.) How can this problem be solved? Are there other loss functions that could be used instead?


  • I can see a couple of errors here -

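    1. The y_true and 1 - y_true terms in the contrastive loss function should be exchanged (this assumes a label of 1 marks a dissimilar pair and 0 a similar pair, as in the snippet below).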

    You can draw inspiration from here -

    def loss(margin=1):
        """Provides 'contrastive_loss' an enclosing scope with variable 'margin'.

        Arguments:
            margin: Integer, defines the baseline for distance for which pairs
                    should be classified as dissimilar. - (default is 1).

        Returns:
            'contrastive_loss' function with data ('margin') attached.
        """

        # Contrastive loss = mean( (1-true_value) * square(prediction) +
        #                         true_value * square( max(margin-prediction, 0) ))
        def contrastive_loss(y_true, y_pred):
            """Calculates the contrastive loss.

            Arguments:
                y_true: List of labels, each label is of type float32.
                y_pred: List of predictions of same length as of y_true,
                        each label is of type float32.

            Returns:
                A tensor containing contrastive loss as floating point value.
            """
            square_pred = tf.math.square(y_pred)
            # Non-zero only while the predicted distance is below the margin.
            margin_square = tf.math.square(tf.math.maximum(margin - (y_pred), 0))
            return tf.math.reduce_mean(
                (1 - y_true) * square_pred + (y_true) * margin_square
            )

        return contrastive_loss


    2. The output of the Siamese network in this example should be a probability (a value between 0 and 1), because y_true is either 0 or 1. Here, the CreateModel function builds the Siamese network and its output is the Euclidean distance between two vectors, which is not a probability: a Euclidean distance can be greater than 1. It is better to add an activation such as sigmoid in the final layer of the Siamese model.