Tags: python, tensorflow, keras, text-processing, word-embedding

TensorFlow 2.x Keras Embedding layer error when processing a tf.data.Dataset


This question is a follow-up to tensorflow 2 TextVectorization process tensor and dataset error.

I would like to create a word embedding for the processed text with TensorFlow 2.8 on Jupyter.

import re
import string

import tensorflow as tf
from tensorflow.keras import layers

def standardize(input_data):
    input_data = tf.strings.lower(input_data)
    input_data = tf.strings.regex_replace(input_data, f"[{re.escape(string.punctuation)}]", " ")
    return input_data
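
For example, on a sample string it lowercases the text and replaces punctuation (including the hyphen) with spaces:

print(standardize(tf.constant(["SWIM 2008-07 Baseball"])))
# tf.Tensor([b'swim 2008 07 baseball'], shape=(1,), dtype=string)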

# the input data is loaded from text files by TFRecordDataset(file_paths, "GZIP")
# each file can be 200+ MB, about 300 files in total
# each file holds data with multiple columns
# some columns are text
# after loading, the dataset is accessed by column name,
# e.g. one column is "sports", so input_dataset["sports"]
# returns a tensor like the following example

input_data = tf.constant([["SWIM 2008-07 Baseball"], ["Football"]], shape=(2, 1), dtype=tf.string)
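
For context, the real loading step described in the comments above might look roughly like this (the feature spec here is an assumption for illustration; the actual files have more columns):

feature_spec = {"sports": tf.io.FixedLenFeature([1], tf.string)}  # hypothetical spec

def parse_example(serialized):
    return tf.io.parse_single_example(serialized, feature_spec)

raw_dataset = tf.data.TFRecordDataset(file_paths, compression_type="GZIP")
input_dataset = raw_dataset.map(parse_example)  # elements are dicts keyed by column name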

text_layer = tf.keras.layers.TextVectorization(standardize=standardize, max_tokens=10, output_mode='int', output_sequence_length=10)

dataset = tf.data.Dataset.from_tensors(input_data)

dataset = dataset.batch(2)

text_layer.adapt(dataset)

process_text = dataset.map(text_layer)

emb_layer = layers.Embedding(10, 10)

emb_layer(process_text)  # error

error:

 AttributeError: Exception encountered when calling layer "embedding_7" (type Embedding).

'MapDataset' object has no attribute 'dtype'

Call arguments received:

 • inputs=<MapDataset element_spec=TensorSpec(shape=(None, 2, 10), dtype=tf.int64, name=None)>

How can I convert a tf.data.Dataset to a tf.Tensor?

The existing question TensorFlow: convert tf.Dataset to tf.Tensor does not help me.

The above layers will be used in a neural network model with this pipeline:

loading data --> processing features (multiple text columns) --> tokens --> embedding --> average pooling --> some dense layers --> output layer

thanks


Solution

  • You cannot feed a tf.data.Dataset directly to an Embedding layer. You can either apply the layers with .map(...):

    import re
    import string

    import tensorflow as tf
    from tensorflow.keras import layers

    def standardize(input_data):
        input_data = tf.strings.lower(input_data)
        input_data = tf.strings.regex_replace(input_data, f"[{re.escape(string.punctuation)}]", " ")
        return input_data

    input_data = tf.constant([["SWIM 2008-07 Baseball"], ["Football"]], shape=(2, 1), dtype=tf.string)

    text_layer = tf.keras.layers.TextVectorization(standardize=standardize, max_tokens=10, output_mode='int', output_sequence_length=10)

    dataset = tf.data.Dataset.from_tensors(input_data)

    # from_tensors yields a single (2, 1) element; batch(2) adds a leading
    # dimension, so squeeze it away again to get (2, 1) string elements
    dataset = dataset.batch(2).map(lambda x: tf.squeeze(x, axis=0))

    text_layer.adapt(dataset)

    process_text = dataset.map(text_layer)  # elements: int sequences of shape (2, 10)

    emb_layer = layers.Embedding(10, 10)
    process_text = process_text.map(emb_layer)  # elements: embeddings of shape (2, 10, 10)
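
    If you then need a concrete tf.Tensor (the question's original ask), iterating the mapped dataset yields eager tensors:

    embedded = next(iter(process_text))
    print(embedded.shape)  # (2, 10, 10): 2 rows, sequence length 10, embedding dim 10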
    

    Or define your model and feed your dataset through model.fit(...):

    import re
    import string

    import tensorflow as tf

    def standardize(input_data):
        input_data = tf.strings.lower(input_data)
        input_data = tf.strings.regex_replace(input_data, f"[{re.escape(string.punctuation)}]", " ")
        return input_data

    input_data = tf.constant([["SWIM 2008-07 Baseball"], ["Football"]], shape=(2, 1), dtype=tf.string)

    text_layer = tf.keras.layers.TextVectorization(standardize=standardize, max_tokens=10, output_mode='int', output_sequence_length=10)

    dataset = tf.data.Dataset.from_tensors(input_data)

    dataset = dataset.batch(2)

    text_layer.adapt(dataset)

    # vectorize the text and attach a random binary label to each entry
    process_text = dataset.map(lambda x: (text_layer(tf.squeeze(x, axis=0)), tf.random.uniform((2,), maxval=2, dtype=tf.int32)))

    inputs = tf.keras.layers.Input((10,))
    emb_layer = tf.keras.layers.Embedding(10, 10)
    x = emb_layer(inputs)
    x = tf.keras.layers.GlobalAveragePooling1D()(x)
    outputs = tf.keras.layers.Dense(1, activation='sigmoid')(x)
    model = tf.keras.Model(inputs, outputs)
    model.compile(optimizer='adam', loss='binary_crossentropy')
    model.fit(process_text)
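
    As a quick sanity check after training (a minimal sketch), drop the random labels and run predict:

    preds = model.predict(process_text.map(lambda x, y: x))
    print(preds.shape)  # (2, 1): one sigmoid probability per text row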