I want to use the BERT language model to train a multi-class text classification task. I previously trained an LSTM without any error, but BERT gives me the error below and I really don't know how to solve it. Can anyone help me, please?
Unfortunately, there is very little documentation on using BERT with the Keras library.
!wget --quiet https://raw.githubusercontent.com/tensorflow/models/master/official/nlp/bert/tokenization.py
import numpy as np
import tensorflow as tf
import tensorflow_hub as hub
from bert import tokenization
# Load the pretrained BERT layer from TF Hub and build the matching
# WordPiece tokenizer from its vocabulary.
module_url = 'https://tfhub.dev/tensorflow/bert_en_uncased_L-12_H-768_A-12/2'
bert_layer = hub.KerasLayer(module_url, trainable=True)

vocab_file = bert_layer.resolved_object.vocab_file.asset_path.numpy()
do_lower_case = bert_layer.resolved_object.do_lower_case.numpy()
tokenizer = tokenization.FullTokenizer(vocab_file, do_lower_case)
def bert_encode(texts, tokenizer, max_len=512):
    all_tokens = []
    all_masks = []
    all_segments = []

    for text in texts:
        # Tokenize and truncate, leaving room for [CLS] and [SEP].
        text = tokenizer.tokenize(text)
        text = text[:max_len - 2]
        input_sequence = ["[CLS]"] + text + ["[SEP]"]
        pad_len = max_len - len(input_sequence)

        # Pad the token ids, build the attention mask; single segment only.
        tokens = tokenizer.convert_tokens_to_ids(input_sequence) + [0] * pad_len
        pad_masks = [1] * len(input_sequence) + [0] * pad_len
        segment_ids = [0] * max_len

        all_tokens.append(tokens)
        all_masks.append(pad_masks)
        all_segments.append(segment_ids)

    return np.array(all_tokens), np.array(all_masks), np.array(all_segments)
def build_model(bert_layer, max_len=512):
    input_word_ids = tf.keras.Input(shape=(max_len,), dtype=tf.int32, name="input_word_ids")
    input_mask = tf.keras.Input(shape=(max_len,), dtype=tf.int32, name="input_mask")
    segment_ids = tf.keras.Input(shape=(max_len,), dtype=tf.int32, name="segment_ids")

    pooled_output, sequence_output = bert_layer([input_word_ids, input_mask, segment_ids])
    clf_output = sequence_output[:, 0, :]  # representation of the [CLS] token

    # Hidden layers use ReLU; softmax belongs only on the output layer.
    net = tf.keras.layers.Dense(64, activation='relu')(clf_output)
    net = tf.keras.layers.Dropout(0.2)(net)
    net = tf.keras.layers.Dense(32, activation='relu')(net)
    net = tf.keras.layers.Dropout(0.2)(net)
    out = tf.keras.layers.Dense(3, activation='softmax')(net)

    model = tf.keras.models.Model(inputs=[input_word_ids, input_mask, segment_ids], outputs=out)
    model.compile(tf.keras.optimizers.Adam(learning_rate=1e-5), loss='categorical_crossentropy', metrics=['accuracy'])
    return model
max_len = 150
train_input = bert_encode(data.text_cleaned, tokenizer, max_len=max_len)
The error is as follows:
UnparsedFlagAccessError Traceback (most recent call last)
<ipython-input-175-fd64df42591d> in <module>()
1 import sys
2 max_len = 150
----> 3 train_input = bert_encode(o.text_cleaned, tokenizer, max_len=max_len)
4 frames
/usr/local/lib/python3.7/dist-packages/absl/flags/_flagvalues.py in __getattr__(self, name)
496 # get too much noise.
497 logging.error(error_message)
--> 498 raise _exceptions.UnparsedFlagAccessError(error_message)
499
500 def __setattr__(self, name, value):
UnparsedFlagAccessError: Trying to access flag --preserve_unused_tokens before flags were parsed.
Based on this issue, you have to downgrade bert-tensorflow to 1.0.1; check this answer for the solution. If you are following this tutorial, downgrade bert-tensorflow and use the
!wget --quiet https://raw.githubusercontent.com/tensorflow/models/master/official/nlp/bert/tokenization.py
as suggested, because inside that script the author has changed tf.gfile.GFile(vocab_file, "r") to tf.io.gfile.GFile(vocab_file, "r"). After that, the code runs successfully. Ping me if you need anything else.
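For reference, here is a minimal sketch of that fix as notebook cells (a sketch under assumptions: the 1.0.1 pin comes from the linked issue, and the runtime restart afterwards is my own suggestion, since an already-imported bert module stays cached):

# Per the linked issue, 1.0.1 predates the tokenization code path that
# reads the --preserve_unused_tokens absl flag before flags are parsed.
!pip install bert-tensorflow==1.0.1

# Fetch the tokenization.py that already uses tf.io.gfile.GFile instead
# of the removed tf.gfile.GFile.
!wget --quiet https://raw.githubusercontent.com/tensorflow/models/master/official/nlp/bert/tokenization.py

After installing, restart the runtime, then re-run the imports and the bert_encode call.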