python · tensorflow · keras · deep-learning · nlp

How to format Ragged Tensor for Encoder-Decoder model?


I'm working on building a seq2seq model with an encoder-decoder architecture. I have built a tf.data.Dataset pipeline that reads the text from the directories, vectorizes it using tf.keras.layers.TextVectorization, and preprocesses it to be fed into model training. I'm not able to format my labels so that they have the shape (None, seq_len, target_vocab_size). I tried mapping tf.keras.utils.to_categorical over the labels, but it won't work on the tensors (the failing attempt is sketched after the pipeline code below). Strangely, I couldn't find any material where a similar problem was discussed. Below is my implementation:

import tensorflow as tf
from tensorflow.keras.layers import Input, Embedding, GRU, Dense, TimeDistributed
from tensorflow.keras.models import Model

BUFFER_SIZE = len(articles)
BATCH_SIZE = 64

train_raw = (tf.data.Dataset
             .from_tensor_slices((articles[is_train], summaries[is_train]))
             .shuffle(BUFFER_SIZE)
             .batch(BATCH_SIZE))

val_raw = (tf.data.Dataset
           .from_tensor_slices((articles[~is_train], summaries[~is_train]))
           .shuffle(BUFFER_SIZE)
           .batch(BATCH_SIZE))

context_vectorizer = tf.keras.layers.TextVectorization(
    standardize = tf_lower_and_split_punct,
    max_tokens = MAX_VOCAB_SIZE,
    ragged=True)

target_vectorizer = tf.keras.layers.TextVectorization(
    standardize=tf_lower_and_split_punct,
    max_tokens=MAX_VOCAB_SIZE,
    ragged=True)

context_vectorizer.adapt(train_raw.map(lambda context, target: context))
target_vectorizer.adapt(train_raw.map(lambda context, target: target))

def preprocess_text(context, target):
    context = context_vectorizer(context).to_tensor()
    target = target_vectorizer(target)

    # Teacher forcing: decoder input drops the last token, label drops the first
    target_in = target[:,:-1].to_tensor()
    target_out = target[:,1:].to_tensor()
    # target_out = target[:,:-1]
    return (context, target_in), target_out

train_ds = train_raw.map(preprocess_text, tf.data.AUTOTUNE)
val_ds = val_raw.map(preprocess_text, tf.data.AUTOTUNE)
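
For reference, this is roughly the to_categorical attempt mentioned above (a sketch of the failing variant, using the same vectorizers as above): tf.keras.utils.to_categorical converts its input to a NumPy array, so it cannot run on the symbolic tensors that Dataset.map passes in graph mode.

# Sketch of the failing attempt: to_categorical converts its input to a
# NumPy array, which is not possible for symbolic tensors inside Dataset.map.
def preprocess_text_one_hot_attempt(context, target):
    context = context_vectorizer(context).to_tensor()
    target = target_vectorizer(target)
    target_in = target[:, :-1].to_tensor()
    # Raises an error: a symbolic Tensor cannot be converted to a NumPy array
    target_out = tf.keras.utils.to_categorical(
        target[:, 1:].to_tensor(),
        num_classes=target_vectorizer.vocabulary_size())
    return (context, target_in), target_out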

def encoder(hsize, embed_dim=200):
    en_input_layer = Input(shape=(None,), name='encoder_input_layer', ragged=True)
    en_embed = Embedding(context_vectorizer.vocabulary_size()+1, output_dim=embed_dim, name='encoder_embedding_layer')
    en_embed_out = en_embed(en_input_layer)
    en_gru_1 = GRU(hsize, return_sequences=True, return_state=True, name='encoder_gru_layer_1')
    en_gru_1_out, en_gru_states = en_gru_1(en_embed_out)
    return en_input_layer, en_gru_1_out, en_gru_states

def decoder(hsize, encoder_states, embed_dim=200):
    de_input_layer = Input(shape=(None,), name='decoder_input_layer', ragged=True)
    de_embed = Embedding(target_vectorizer.vocabulary_size()+1, output_dim=embed_dim, name='decode_embedding_layer')
    de_embed_out = de_embed(de_input_layer)
    de_gru_1 = GRU(hsize, return_sequences=True, name='decoder_gru_layer_1')
    de_gru_1_out = de_gru_1(de_embed_out, initial_state=encoder_states)
    de_dense = TimeDistributed(Dense(target_vectorizer.vocabulary_size(), activation='softmax'), name='time_distributed_output_layer')
    de_preds = de_dense(de_gru_1_out)
    return de_input_layer, de_preds

hsize = 256

def create_model(hsize):
    en_input_layer, enc_out, enc_states = encoder(hsize)
    de_input_layer, de_preds = decoder(hsize, enc_states)
    model = Model(inputs=[en_input_layer, de_input_layer], outputs=de_preds)
    model.compile(optimizer='adam', loss='categorical_crossentropy',
                    metrics=["acc"])
    return model

### Model training

m = create_model(hsize)

history = m.fit(
        train_ds.repeat(),
        steps_per_epoch=100,
        epochs=100,
        validation_data=val_ds,
        callbacks=[
            tf.keras.callbacks.ModelCheckpoint('./checkpoints_trial_1',
                                                save_weights_only=True),
            tf.keras.callbacks.EarlyStopping(patience=3)])

The model summary is below:


 Layer (type)                                      Output Shape                        Param #   Connected to
==================================================================================================
 encoder_input_layer (InputLayer)                  [(None, None)]                      0         []
 decoder_input_layer (InputLayer)                  [(None, None)]                      0         []
 encoder_embedding_layer (Embedding)               (None, None, 200)                   437200    ['encoder_input_layer[0][0]']
 decode_embedding_layer (Embedding)                (None, None, 200)                   244200    ['decoder_input_layer[0][0]']
 encoder_gru_layer_1 (GRU)                         [(None, None, 256), (None, 256)]    351744    ['encoder_embedding_layer[0][0]']
 decoder_gru_layer_1 (GRU)                         (None, None, 256)                   351744    ['decode_embedding_layer[0][0]', 'encoder_gru_layer_1[0][1]']
 time_distributed_output_layer (TimeDistributed)   (None, None, 1220)                  313540    ['decoder_gru_layer_1[0][0]']
==================================================================================================
Total params: 1698428 (6.48 MB)
Trainable params: 1698428 (6.48 MB)
Non-trainable params: 0 (0.00 Byte)
__________________________________________________________________________________________________

The model compiles fine, but when I run the fit method I get the following error:

ValueError: Shapes (None, None) and (None, None, 1220) are incompatible

I'm struggling to define the model's Input layers correctly, or to shape the preprocess_text output so that it works with the model definition.


Solution

  • Reposting from the comment above: to fix this issue, we can either switch to a loss function that works on sparse (integer) labels, i.e. sparse_categorical_crossentropy, or transform the target labels into one-hot vectors and keep categorical_crossentropy. Below is the complete working code with some dummy data; the make_one_hot flag toggles between the two approaches.

    import numpy as np
    import tensorflow as tf

    make_one_hot = False  # params: True, False
    
    num_articles = 1000
    num_summaries = 1000
    MAX_VOCAB_SIZE = 5000
    articles = np.array([f"Article {i}" for i in range(num_articles)])
    summaries = np.array([f"Summary {i}" for i in range(num_summaries)])
    is_train = np.random.rand(len(articles)) < 0.8
    
    def tf_lower_and_split_punct(text):
        text = tf.strings.lower(text)
        text = tf.strings.regex_replace(text, '[.?!,¿]', ' ')
        text = tf.strings.strip(text)
        text = tf.strings.join([' ', text, ' '])
        return text
    
    BUFFER_SIZE = len(articles)
    BATCH_SIZE = 64
    
    train_raw = (tf.data.Dataset
                 .from_tensor_slices((articles[is_train], summaries[is_train]))
                 .shuffle(BUFFER_SIZE)
                 .batch(BATCH_SIZE))
    
    val_raw = (tf.data.Dataset
               .from_tensor_slices((articles[~is_train], summaries[~is_train]))
               .shuffle(BUFFER_SIZE)
               .batch(BATCH_SIZE))
    
    context_vectorizer = tf.keras.layers.TextVectorization(
        standardize = tf_lower_and_split_punct,
        max_tokens = MAX_VOCAB_SIZE,
        ragged=True)
    
    target_vectorizer = tf.keras.layers.TextVectorization(
        standardize=tf_lower_and_split_punct,
        max_tokens=MAX_VOCAB_SIZE,
        ragged=True)
    
    context_vectorizer.adapt(train_raw.map(lambda context, target: context))
    target_vectorizer.adapt(train_raw.map(lambda context, target: target))
    
    def preprocess_text(context, target):
        context = context_vectorizer(context).to_tensor()
        target = target_vectorizer(target)
    
        target_in = target[:,:-1].to_tensor()
        target_out = target[:,1:].to_tensor()
        
        if make_one_hot:
            target_out = tf.one_hot(
                target_out, 
                depth=tf.cast(
                    target_vectorizer.vocabulary_size(), dtype='int32'
                )
            )
        return (context, target_in), target_out
    
    train_ds = train_raw.map(preprocess_text, tf.data.AUTOTUNE)
    val_ds = val_raw.map(preprocess_text, tf.data.AUTOTUNE)
    
    def create_model(hsize):
        # encoder() and decoder() are the same functions defined in the question above
        en_input_layer, enc_out, enc_states = encoder(hsize)
        de_input_layer, de_preds = decoder(hsize, enc_states)
        model = Model(inputs=[en_input_layer, de_input_layer], outputs=de_preds)
        
        if make_one_hot:
            loss_fn = 'categorical_crossentropy'
        else:
            loss_fn = 'sparse_categorical_crossentropy'
        
        model.compile(
            optimizer='adam', 
            loss=loss_fn,
            metrics=["acc"]
        )
        return model
    
    
    model = create_model(hsize=256)
    model.fit(train_ds)
    # 5s 24ms/step - loss: 6.7114 - acc: 0.003
    # <keras.callbacks.History at 0x7bfef0423f40>
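
    To see why either option works, here is a minimal sketch with toy numbers (my own illustration, not from the answer above): sparse_categorical_crossentropy consumes integer labels of shape (batch, seq_len), while categorical_crossentropy needs one-hot labels of shape (batch, seq_len, vocab), and both compute the same per-token loss.

    # Toy comparison of the two loss formulations (illustrative values only)
    vocab = 5
    preds = tf.random.uniform((2, 3, vocab))                 # (batch, seq_len, vocab)
    preds = preds / tf.reduce_sum(preds, -1, keepdims=True)  # normalize like softmax output
    labels = tf.constant([[1, 2, 0], [3, 4, 0]])             # (batch, seq_len) integer token ids

    sparse_loss = tf.keras.losses.sparse_categorical_crossentropy(labels, preds)
    onehot_loss = tf.keras.losses.categorical_crossentropy(
        tf.one_hot(labels, depth=vocab), preds)
    # The per-token losses agree, so either loss choice resolves the shape error
    print(tf.reduce_all(tf.abs(sparse_loss - onehot_loss) < 1e-5).numpy())  # True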
    

    Reference

    Selecting loss and metrics for Tensorflow model