I need to build a transformer-based architecture in TensorFlow following the encoder-decoder approach, where the encoder is a pre-existing Hugging Face DistilBERT model and the decoder is a CNN.
Inputs: texts, each containing several phrases in a row. Outputs: codes assigned according to taxonomic criteria. My data file has 7387 text-label pairs in TSV format:
text \t code
This is example text number one. It might contain some other phrases. \t C21
This is example text number two. It might contain some other phrases. \t J45.1
This is example text number three. It might contain some other phrases. \t A27
The remainder of the code is this:
# Fixed sequence length fed to the model.  It is passed to build_model below so
# that the tokenizer padding and the model's Input layer always agree.
MAX_SEQUENCE_LENGTH = 512
BATCH_SIZE = 64

# Load the tab-separated "text<TAB>code" pairs.  The encoding is stated
# explicitly because the texts are multilingual and the platform default
# encoding is not guaranteed to be UTF-8.
text_file = "data/datafile.tsv"
with open(text_file, encoding="utf-8") as f:
    # splitlines() keeps the final record even when the file has no trailing
    # newline (slicing off the last element would silently drop it); blank
    # lines are skipped.
    lines = [line for line in f.read().splitlines() if line.strip()]

text_and_code_pairs = []
for line in lines:
    # Split on the LAST tab only, so a stray tab inside the text cannot break
    # the two-value unpacking.
    text, code = line.rsplit("\t", 1)
    text_and_code_pairs.append((text, code))

random.shuffle(text_and_code_pairs)

# 10 % validation, 70 % training; everything that remains (about 20 %) is test.
num_val_samples = int(0.10 * len(text_and_code_pairs))
num_train_samples = len(text_and_code_pairs) - 3 * num_val_samples
train_pairs = text_and_code_pairs[:num_train_samples]
val_pairs = text_and_code_pairs[num_train_samples : num_train_samples + num_val_samples]
test_pairs = text_and_code_pairs[num_train_samples + num_val_samples :]

train_texts = [text for text, _ in train_pairs]
train_labels = [code for _, code in train_pairs]
val_texts = [text for text, _ in val_pairs]
val_labels = [code for _, code in val_pairs]
test_texts = [text for text, _ in test_pairs]
test_labels = [code for _, code in test_pairs]

# NOTE(review): the labels are still raw code strings (e.g. "C21").  A model
# compiled with "categorical_crossentropy" needs one-hot vectors whose width
# equals the model's output layer, so the codes must be mapped to class
# indices and one-hot encoded before training.  Not done here because the
# authoritative list of classes is not visible in this script.

distilbert_encoder = TFDistilBertModel.from_pretrained("distilbert-base-multilingual-cased")
tokenizer = DistilBertTokenizerFast.from_pretrained("distilbert-base-multilingual-cased")

# Pad every split to one fixed length.  padding=True pads only to the longest
# text of each call, so the three splits would get different widths and none
# would be guaranteed to match the fixed-length Input layer of the model.
train_encodings = tokenizer(
    train_texts, truncation=True, padding="max_length", max_length=MAX_SEQUENCE_LENGTH
)
val_encodings = tokenizer(
    val_texts, truncation=True, padding="max_length", max_length=MAX_SEQUENCE_LENGTH
)
test_encodings = tokenizer(
    test_texts, truncation=True, padding="max_length", max_length=MAX_SEQUENCE_LENGTH
)

# NOTE(review): the features are the tokenizer's full dict (input_ids and
# attention_mask), while build_model declares a single token-id Input.
# Confirm that the model receives the tensor(s) it expects.
train_dataset = tf.data.Dataset.from_tensor_slices((dict(train_encodings), train_labels))
val_dataset = tf.data.Dataset.from_tensor_slices((dict(val_encodings), val_labels))
test_dataset = tf.data.Dataset.from_tensor_slices((dict(test_encodings), test_labels))

model = build_model(distilbert_encoder, max_len=MAX_SEQUENCE_LENGTH)

# With tf.data.Dataset inputs the dataset itself defines the batches:
# `batch_size` must not be passed to fit(), and every split (validation and
# test included) has to be batched explicitly.
model.fit(
    train_dataset.batch(BATCH_SIZE),
    validation_data=val_dataset.batch(BATCH_SIZE),
    epochs=3,
)
model.predict(test_dataset.batch(BATCH_SIZE), verbose=1)
Lastly, the `build_model` function:
# Build and compile a Keras functional model: token-id input -> transformer
# encoder -> Conv1D -> MaxPooling1D -> Flatten -> Dense(relu) -> Dense(softmax).
#   transformer: the encoder, called directly on the Input tensor.
#   max_len:     length of the fixed-size int32 token-id input (default 512).
# Returns the compiled model after printing its summary.
def build_model(transformer, max_len=512):
# Leftover from the Sequential-API version: `model` is reassigned by the
# tf.keras.models.Model(...) call further down, so this instance is never used.
model = tf.keras.models.Sequential()
# Encoder
# `layers` is not defined in the visible code - presumably tf.keras.layers.
inputs = layers.Input(shape=(max_len,), dtype=tf.int32)
# `distilbert` holds whatever the transformer call returns; it is passed
# unchanged to Conv1D below.
distilbert = transformer(inputs)
# LAYER - something missing here?
# Decoder
# This is the line reported in the traceback: Conv1D fails while building its
# input shape, with "Dimension value must be integer or None ... got
# 'last_hidden_state'".
conv1D = tf.keras.layers.Conv1D(filters=5, kernel_size=10)(distilbert)
pooling = tf.keras.layers.MaxPooling1D(pool_size=2)(conv1D)
flat = tf.keras.layers.Flatten()(pooling)
fc = tf.keras.layers.Dense(1255, activation='relu')(flat)
# Output layer: softmax over 1255 units.
softmax = tf.keras.layers.Dense(1255, activation='softmax')(fc)
model = tf.keras.models.Model(inputs = inputs, outputs = softmax)
# Adam optimizer (learning rate 5e-5), categorical cross-entropy loss,
# accuracy metric.
model.compile(tf.keras.optimizers.Adam(learning_rate=5e-5), loss="categorical_crossentropy", metrics=['accuracy'])
print(model.summary())
return model
I managed to narrow down the possible locations of my problem. After changing from sequential to functional Keras API, I get the following error:
Traceback (most recent call last):
File "keras_transformer.py", line 99, in <module>
main()
File "keras_transformer.py", line 94, in main
model = build_model(distilbert_encoder)
File "keras_transformer.py", line 23, in build_model
conv1D = tf.keras.layers.Conv1D(filters=5, kernel_size=10)(distilbert)
File "/home/users/user/.local/lib/python3.6/site-packages/tensorflow/python/keras/engine/base_layer.py", line 897, in __call__
self._maybe_build(inputs)
File "/home/users/user/.local/lib/python3.6/site-packages/tensorflow/python/keras/engine/base_layer.py", line 2416, in _maybe_build
self.build(input_shapes) # pylint:disable=not-callable
File "/home/users/user/.local/lib/python3.6/site-packages/tensorflow/python/keras/layers/convolutional.py", line 152, in build
input_shape = tensor_shape.TensorShape(input_shape)
File "/home/users/user/.local/lib/python3.6/site-packages/tensorflow/python/framework/tensor_shape.py", line 771, in __init__
self._dims = [as_dimension(d) for d in dims_iter]
File "/home/users/user/.local/lib/python3.6/site-packages/tensorflow/python/framework/tensor_shape.py", line 771, in <listcomp>
self._dims = [as_dimension(d) for d in dims_iter]
File "/home/users/user/.local/lib/python3.6/site-packages/tensorflow/python/framework/tensor_shape.py", line 716, in as_dimension
return Dimension(value)
File "/home/users/user/.local/lib/python3.6/site-packages/tensorflow/python/framework/tensor_shape.py", line 200, in __init__
None)
File "<string>", line 3, in raise_from
TypeError: Dimension value must be integer or None or have an __index__ method, got 'last_hidden_state'
It seems that the error lies in the connection between the output of the transformer and the input of the convolutional layer. Am I supposed to include another layer between them so as to adapt the output of the transformer? If so, what would be the best option? I'm using tensorflow==2.2.0, transformers==4.5.1 and Python 3.6.9.
I think the problem is passing the right tensor to the TensorFlow layer that follows the DistilBERT model. `distilbert = transformer(inputs)` returns a model-output object (a `TFBaseModelOutput`) rather than a tensor, unlike an ordinary Keras layer call such as `pooling = tf.keras.layers.MaxPooling1D(pool_size=2)(conv1D)`, where `pooling` is the output tensor of the `MaxPooling1D` layer.
I fixed your problem by accessing the `last_hidden_state` attribute of the `distilbert` object (i.e. the output of the DistilBERT model); this tensor becomes the input to the following `Conv1D` layer.
import os
os.environ['TF_CPP_MIN_LOG_LEVEL'] = '3' # suppress Tensorflow messages
from transformers import TFDistilBertModel, DistilBertModel
import tensorflow as tf
# Load the pretrained multilingual DistilBERT checkpoint; it is passed to
# build_model below as the encoder.
distilbert_encoder = TFDistilBertModel.from_pretrained("distilbert-base-multilingual-cased")
def build_model(transformer, max_len=512, num_classes=1255):
    """Build and compile a transformer-encoder / CNN-decoder classifier.

    Args:
        transformer: Encoder called on the token-id input.  Its return value
            must expose a ``last_hidden_state`` tensor (here a
            ``TFDistilBertModel``).
        max_len: Fixed number of token ids per input sequence.
        num_classes: Width of the final softmax layer, i.e. the number of
            classes to predict.  Defaults to the previously hard-coded 1255.

    Returns:
        The compiled ``tf.keras`` functional model.  Its summary is printed
        as a side effect.
    """
    # Encoder
    inputs = tf.keras.layers.Input(shape=(max_len,), dtype=tf.int32)
    distilbert = transformer(inputs)

    # Decoder
    # The encoder call returns a model-output object, not a tensor.  Conv1D
    # needs the tensor itself, so take `last_hidden_state`; passing the
    # object directly is what raised the "Dimension value must be integer"
    # TypeError.
    conv1D = tf.keras.layers.Conv1D(filters=5, kernel_size=10)(distilbert.last_hidden_state)
    pooling = tf.keras.layers.MaxPooling1D(pool_size=2)(conv1D)
    flat = tf.keras.layers.Flatten()(pooling)
    fc = tf.keras.layers.Dense(1255, activation='relu')(flat)
    softmax = tf.keras.layers.Dense(num_classes, activation='softmax')(fc)

    model = tf.keras.models.Model(inputs=inputs, outputs=softmax)
    model.compile(
        optimizer=tf.keras.optimizers.Adam(learning_rate=5e-5),
        loss="categorical_crossentropy",
        metrics=['accuracy'],
    )
    # summary() prints the table itself and returns None, so wrapping it in
    # print() would add a stray "None" line after the table.
    model.summary()
    return model
# Build and compile the model; build_model also prints the Keras model summary.
model = build_model(distilbert_encoder)
This prints the following model summary:
Layer (type) Output Shape Param #
=================================================================
input_1 (InputLayer) [(None, 512)] 0
_________________________________________________________________
tf_distil_bert_model (TFDist TFBaseModelOutput(last_hi 134734080
_________________________________________________________________
conv1d (Conv1D) (None, 503, 5) 38405
_________________________________________________________________
max_pooling1d (MaxPooling1D) (None, 251, 5) 0
_________________________________________________________________
flatten (Flatten) (None, 1255) 0
_________________________________________________________________
dense (Dense) (None, 1255) 1576280
_________________________________________________________________
dense_1 (Dense) (None, 1255) 1576280
=================================================================
Total params: 137,925,045
Trainable params: 137,925,045
Non-trainable params: 0
Note: I assume that by `layers.Input` in your `build_model` function you mean `tf.keras.layers.Input`.