Search code examples

Problem completing BERT model for sentiment classification

I am trying to figure out sentiment classification on movie reviews using BERT, transformers and tensorflow. This is the code I currently have:

def read_dataset(filename, model_name="bert-base-uncased"):
    """Reads a dataset from the specified path and returns sentences and labels"""

    tokenizer = BertTokenizer.from_pretrained(model_name)
    with open(filename, "r", encoding="utf-8") as f:
        lines = f.readlines()
        # preallocate memory for the data
        sents, labels = list(), np.empty((len(lines), 1), dtype=int)

        for i, line in enumerate(lines):
            text, str_label, _ = line.split("\t")
            labels[i] = int(str_label.split("=")[1] == "POS")
    return dict(tokenizer(sents, padding=True, truncation=True, return_tensors="tf")), labels

class BertMLP(tf.keras.Model):
    def __init__(self, embed_batch_size=100, model_name="bert-base-cased"):
        super(BertMLP, self).__init__() = embed_batch_size
        self.model = TFBertModel.from_pretrained(model_name)
        self.classification_head = tf.keras.models.Sequential(
            layers = [
                tf.keras.layers.Dense(350, activation="tanh"),
                tf.keras.layers.Dense(200, activation="tanh"),
                tf.keras.layers.Dense(50, activation="tanh"),
                tf.keras.layers.Dense(1, activation="sigmoid", use_bias=False)

    def call(self, inputs):
        outputs = self.model(inputs)
        return outputs

def evaluate(model, inputs, labels, loss_func):
    mean_loss = tf.keras.metrics.Mean(name="train_loss")
    accuracy = tf.keras.metrics.BinaryAccuracy(name="train_accuracy")

    predictions = model(inputs)
    mean_loss(loss_func(labels, predictions))
    accuracy(labels, predictions)

    return mean_loss.result(), accuracy.result() * 100

if __name__ == "__main__":
    train = read_dataset("datasets/rt-polarity.train.vecs")
    dev = read_dataset("datasets/")
    test = read_dataset("datasets/rt-polarity.test.vecs")

    mlp = BertMLP()
    mlp.compile(tf.keras.optimizers.SGD(learning_rate=0.01), loss='mse')
    dev_loss, dev_acc = evaluate(mlp, *dev, tf.keras.losses.MeanSquaredError())
    print("Before training:", f"Dev Loss: {dev_loss}, Dev Acc: {dev_acc}")*train, epochs=10, batch_size=10)
    dev_loss, dev_acc = evaluate(mlp, *dev, tf.keras.losses.MeanSquaredError())
    print("After training:", f"Dev Loss: {dev_loss}, Dev Acc: {dev_acc}")

However, when I run this code, I get an error:

Traceback (most recent call last):

  File "C:\Users\home\anaconda3\lib\site-packages\spyder_kernels\", line 356, in compat_exec
    exec(code, globals, locals)

  File "c:\users\home\downloads\", line 60, in <module>
    dev_loss, dev_acc = evaluate(mlp, *dev, tf.keras.losses.MeanSquaredError())

  File "c:\users\home\downloads\", line 46, in evaluate
    predictions = model(inputs)

  File "C:\Users\home\anaconda3\lib\site-packages\keras\utils\", line 67, in error_handler
    raise e.with_traceback(filtered_tb) from None

  File "c:\users\home\downloads\", line 39, in call
    outputs = self.model(inputs)

  File "C:\Users\home\anaconda3\lib\site-packages\transformers\", line 409, in run_call_with_unpacked_inputs
    return func(self, **unpacked_inputs)

  File "C:\Users\home\anaconda3\lib\site-packages\transformers\models\bert\", line 1108, in call
    outputs = self.bert(

  File "C:\Users\home\anaconda3\lib\site-packages\transformers\", line 409, in run_call_with_unpacked_inputs
    return func(self, **unpacked_inputs)

  File "C:\Users\home\anaconda3\lib\site-packages\transformers\models\bert\", line 781, in call
    embedding_output = self.embeddings(

  File "C:\Users\home\anaconda3\lib\site-packages\transformers\models\bert\", line 203, in call
    inputs_embeds = tf.gather(params=self.weight, indices=input_ids)

InvalidArgumentError: Exception encountered when calling layer "embeddings" (type TFBertEmbeddings).

indices[1174,8] = 29550 is not in [0, 28996) [Op:ResourceGather]

Call arguments received:
  • input_ids=tf.Tensor(shape=(1599, 73), dtype=int32)
  • position_ids=None
  • token_type_ids=tf.Tensor(shape=(1599, 73), dtype=int32)
  • inputs_embeds=None
  • past_key_values_length=0
  • training=False

I googled for a while, and I can't find anything conclusive. I am pretty sure it has something to do with this part:

def call(self, inputs):
        outputs = self.model(inputs)
        return outputs

But again, I have tried a lot of different things, including limiting dataset size and installing different versions of transformers and tensorflow, but to no avail. Please let me know what I'm doing wrong. Thank you!


  • OP was using bert-base-cased for their model, and bert-base-uncased for their tokenizer, causing issues during training when the vocab size of the model and the tokenized data differed.