I'm trying to build an NLP classifier on top of BERT, but I'm struggling with data imbalance. I'm looking for an implementation of a weighted CategoricalCrossentropy loss. I've already seen a solution that uses the class_weight parameter of the fit function, but it doesn't "fit" well with my data: my labels are one-hot encoded, and it throws an error because the dict elements do not match.

Can someone please give me a from-scratch implementation of a WeightedCategoricalCrossEntropy function that allows me to add weights manually to TensorFlow's native CategoricalCrossentropy?


  • The __call__ method of tf.losses.CategoricalCrossentropy accepts three arguments: y_true, y_pred and sample_weight.


    The sample_weight argument acts as a coefficient for the loss. If a scalar is provided, the loss is simply scaled by that value. If sample_weight is a tensor of size [batch_size], the loss of each sample in the batch is rescaled by the corresponding element of the sample_weight vector. You can use it like this:

    def compute_loss(model, x, y, training):
        """Return the per-sample-weighted loss of `model` on the batch `(x, y)`.

        Args:
            model: Callable model, invoked as `model(inputs=x, training=training)`.
            x: Batch of inputs; its first dimension is used as the batch size.
            y: Targets, passed to the loss as `y_true`.
            training: Forwarded unchanged to the model call.

        Returns:
            The value returned by `loss_object` for this batch.
        """
        out = model(inputs=x, training=training)
        # One random weight in [0, 1) per sample, for demonstration only.
        # Replace this with weights derived from `y` to obtain class weighting.
        sample_weight = tf.random.uniform(
            (tf.shape(x)[0], 1), minval=0, maxval=1, dtype=tf.float32
        )
        # `loss_object` is the loss instance (e.g. tf.losses.CategoricalCrossentropy())
        # defined at module level.
        loss = loss_object(y_true=y, y_pred=out, sample_weight=sample_weight)
        return loss

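    These are random values, but you can compute the weights from y instead, so that they act as class weights rather than arbitrary sample weights. For example, with one-hot targets and a vector class_weights holding one weight per class, sample_weight = tf.reduce_sum(class_weights * y, axis=-1) gives every sample the weight of its class. Here's a full example of a running training loop with custom sample weights: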

    import os
    os.environ['TF_CPP_MIN_LOG_LEVEL'] = '3'
    import tensorflow as tf
    from tensorflow import keras as K
    from tensorflow.keras.layers import Conv2D, Flatten, Dense, MaxPooling2D, Dropout
    from tensorflow import nn as nn
    from functools import partial
    # Load MNIST and wrap each split in a tf.data.Dataset of (image, label)
    # pairs so that it can be sliced, batched and mapped further down.
    (xtrain, ytrain), (xtest, ytest) = tf.keras.datasets.mnist.load_data()
    train = tf.data.Dataset.from_tensor_slices((xtrain, ytrain))
    test = tf.data.Dataset.from_tensor_slices((xtest, ytest))
    def prepare(inputs, outputs):
        """Preprocess one (inputs, outputs) element of the dataset.

        The inputs are divided by 255 and given a trailing axis of size 1;
        the outputs are one-hot encoded over 10 classes.

        Returns:
            A `(inputs, targets)` tuple of the transformed tensors.
        """
        scaled = tf.divide(inputs, 255)
        # Append a trailing axis of size 1 (presumably the channel axis the
        # convolution layers expect).
        with_channel = tf.expand_dims(scaled, axis=-1)
        one_hot_targets = tf.one_hot(outputs, depth=10)
        return with_channel, one_hot_targets
    # Keep only a small subset of each split, group it into batches of 4,
    # then preprocess each batch with `prepare`.
    train = train.take(5_000)
    train = train.batch(4)
    train = train.map(prepare)
    test = test.take(1_000)
    test = test.batch(4)
    test = test.map(prepare)
    class MyCNN(K.Model):
        """Small convolutional classifier.

        Three Conv2D + MaxPooling2D stages (8, 16 and 32 filters) are followed
        by Flatten, a 64-unit ReLU layer, Dropout(0.5) and a 10-unit softmax
        layer, so the model outputs class probabilities rather than logits.
        """

        def __init__(self):
            super().__init__()
            # Every convolution uses a 3x3 kernel with ReLU; every pooling
            # layer uses a 2x2 window.
            self.conv1 = Conv2D(filters=8, kernel_size=(3, 3), activation=nn.relu)
            self.maxp1 = MaxPooling2D(pool_size=(2, 2))
            self.conv2 = Conv2D(filters=16, kernel_size=(3, 3), activation=nn.relu)
            self.maxp2 = MaxPooling2D(pool_size=(2, 2))
            self.conv3 = Conv2D(filters=32, kernel_size=(3, 3), activation=nn.relu)
            self.maxp3 = MaxPooling2D(pool_size=(2, 2))
            self.flatt = Flatten()
            self.dens1 = Dense(64, activation=nn.relu)
            self.drop1 = Dropout(.5)
            self.dens2 = Dense(10, activation=nn.softmax)

        def call(self, x, training=None, **kwargs):
            """Apply the layers to `x` in order and return the softmax output.

            `training` is accepted but not forwarded explicitly to the
            sub-layers. NOTE(review): Keras is expected to propagate the flag
            to nested layers such as Dropout through its call context;
            confirm for the Keras version in use.
            """
            layers_in_order = (
                self.conv1, self.maxp1,
                self.conv2, self.maxp2,
                self.conv3, self.maxp3,
                self.flatt,
                self.dens1,
                self.drop1,
                self.dens2,
            )
            for layer in layers_in_order:
                x = layer(x)
            return x
    # Instantiate the network.
    model = MyCNN()
    # Loss built with default arguments; the per-sample weights are supplied
    # later, when the object is called inside compute_loss.
    loss_object = tf.losses.CategoricalCrossentropy()
    def compute_loss(model, x, y, training):
        """Return the per-sample-weighted loss of `model` on the batch `(x, y)`.

        Args:
            model: Callable model, invoked as `model(inputs=x, training=training)`.
            x: Batch of inputs; its first dimension is used as the batch size.
            y: Targets, passed to the loss as `y_true`.
            training: Forwarded unchanged to the model call.

        Returns:
            The value returned by `loss_object` for this batch.
        """
        out = model(inputs=x, training=training)
        # One random weight in [0, 1) per sample, for demonstration only.
        # Replace this with weights derived from `y` to obtain class weighting.
        sample_weight = tf.random.uniform(
            (tf.shape(x)[0], 1), minval=0, maxval=1, dtype=tf.float32
        )
        loss = loss_object(y_true=y, y_pred=out, sample_weight=sample_weight)
        return loss
    def get_grad(model, x, y, training=False):
        """Compute the loss on a batch and its gradients.

        Args:
            model: Model whose `trainable_variables` are differentiated.
            x: Batch of inputs, forwarded to `compute_loss`.
            y: Batch of targets, forwarded to `compute_loss`.
            training: Forwarded to `compute_loss`. Defaults to `False`, the
                value that used to be hard-coded here, so existing callers are
                unaffected. Pass `True` in the optimisation step if
                training-only layers (e.g. Dropout) should be active while
                the gradients are computed.

        Returns:
            A `(loss, gradients)` tuple; `gradients` is what
            `tape.gradient(loss, model.trainable_variables)` returns.
        """
        # The forward pass must run inside the tape so it can be differentiated.
        with tf.GradientTape() as tape:
            loss = compute_loss(model, x, y, training=training)
        gradients = tape.gradient(loss, model.trainable_variables)
        return loss, gradients
    optimizer = tf.optimizers.Adam()
    # Template for a per-epoch report: epoch number, train/test loss and
    # train/test accuracy.
    verbose = "Epoch {:2d}" \
              " Loss: {:.3f} TLoss: {:.3f} Acc: {:.3%} TAcc: {:.3%}"
    for epoch in range(1, 10 + 1):
        # Fresh metric objects every epoch, so each epoch's figures are
        # accumulated independently of the previous ones.
        train_loss = tf.metrics.Mean()
        train_acc = tf.metrics.CategoricalAccuracy()
        test_loss = tf.metrics.Mean()
        test_acc = tf.metrics.CategoricalAccuracy()
        for x, y in train:
            loss_value, grads = get_grad(model, x, y)
            optimizer.apply_gradients(zip(grads, model.trainable_variables))
            # Accumulate the batch loss so the epoch mean is available.
            train_loss.update_state(loss_value)
            train_acc.update_state(y, model(x, training=True))
        for x, y in test:
            # Evaluation only needs the loss value, so no gradients are
            # computed here.
            loss_value = compute_loss(model, x, y, training=False)
            test_loss.update_state(loss_value)
            test_acc.update_state(y, model(x, training=False))