python, tensorflow, translation, transformer-model, opennmt

Tensorflow "Transformer model for language understanding" with another Dataset?


I have been reading the official guide here (https://www.tensorflow.org/text/tutorials/transformer) to try to recreate the vanilla Transformer in TensorFlow. I notice the dataset used is quite specific, and at the end of the guide it says to try it with a different dataset.

But that is where I have been stuck for a long time! I am trying to use the WMT14 dataset (as used in the original paper by Vaswani et al.): https://www.tensorflow.org/datasets/catalog/wmt14_translate#wmt14_translatede-en

I have also tried the Multi30k and IWSLT datasets (loaded via torchtext and tokenized with spaCy), but are there any guides on how I can fit a dataset to what the model requires, specifically how to tokenize it? The official TF guide uses a pretrained tokenizer, which is specific to the PT-EN dataset it is built on:

model_name = "ted_hrlr_translate_pt_en_converter"

I am wondering how I can use the TF (BERT) tokenizer to tokenize one of these datasets instead. I have the code for PyTorch below, but unfortunately I do not know how to adapt it for TensorFlow. Any help would be greatly appreciated!

import spacy
from torchtext import data, datasets  # torchtext.legacy.data / .datasets on newer torchtext

# Newer spaCy releases use the full pipeline names, e.g. 'de_core_news_sm' / 'en_core_web_sm'
spacy_de = spacy.load('de')
spacy_en = spacy.load('en')

def tokenize_de(text):
    return [tok.text for tok in spacy_de.tokenizer(text)]

def tokenize_en(text):
    return [tok.text for tok in spacy_en.tokenizer(text)]

BOS_WORD = '<s>'
EOS_WORD = '</s>'
BLANK_WORD = "<blank>"
SRC = data.Field(tokenize=tokenize_de, pad_token=BLANK_WORD)
TGT = data.Field(tokenize=tokenize_en, init_token = BOS_WORD, 
                 eos_token = EOS_WORD, pad_token=BLANK_WORD)

MAX_LEN = 100
train, val, test = datasets.IWSLT.splits(
    exts=('.de', '.en'), fields=(SRC, TGT), 
    filter_pred=lambda x: len(vars(x)['src']) <= MAX_LEN and 
        len(vars(x)['trg']) <= MAX_LEN)
MIN_FREQ = 2
SRC.build_vocab(train.src, min_freq=MIN_FREQ)
TGT.build_vocab(train.trg, min_freq=MIN_FREQ)

Solution

  • You can build your own tokenizer following this tutorial https://www.tensorflow.org/text/guide/subwords_tokenizer

    It is exactly how they build the ted_hrlr_translate_pt_en_converter tokenizer in the Transformer tutorial; you just need to adjust it to your language pair.

    I rewrote it for your case but didn't test it:

    import pathlib
    import re
    
    import tensorflow_datasets as tfds
    import tensorflow_text as text
    import tensorflow as tf
    from tensorflow_text.tools.wordpiece_vocab import bert_vocab_from_dataset as bert_vocab
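    # NOTE: requires the `tensorflow-text` and `tensorflow-datasets` pip packages
    # (tensorflow-text must match your TensorFlow version).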
    
    
    
    examples, metadata = tfds.load('wmt14_translate/de-en', with_info=True,
                                   as_supervised=True)
    train_examples, val_examples = examples['train'], examples['validation']
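    # Note: the WMT14 de-en corpus is far larger than the TED talks dataset used in the
    # tutorial, so downloading and preparing it here can take a while and a fair bit of disk.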
    
    for de_examples, en_examples in train_examples.batch(3).take(1):
      for de in de_examples.numpy():
        print(de.decode('utf-8'))
    
      print()
    
      for en in en_examples.numpy():
        print(en.decode('utf-8'))
    
    train_en = train_examples.map(lambda de, en: en)
    train_de = train_examples.map(lambda de, en: de)
    
    bert_tokenizer_params=dict(lower_case=True)
    reserved_tokens=["[PAD]", "[UNK]", "[START]", "[END]"]
    
    bert_vocab_args = dict(
        # The target vocabulary size
        vocab_size = 8000,
        # Reserved tokens that must be included in the vocabulary
        reserved_tokens=reserved_tokens,
        # Arguments for `text.BertTokenizer`
        bert_tokenizer_params=bert_tokenizer_params,
        # Arguments for `wordpiece_vocab.wordpiece_tokenizer_learner_lib.learn`
        learn_params={},
    )
    
    de_vocab = bert_vocab.bert_vocab_from_dataset(
        train_de.batch(1000).prefetch(2),
        **bert_vocab_args
    )
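    # Heads-up: bert_vocab_from_dataset scans the whole dataset, so on the full WMT14 train
    # split each of these vocab-generation calls can take a very long time. For a quick first
    # pass you could learn the vocab on a subset, e.g. train_de.take(200000).batch(1000).prefetch(2).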
    
    print(de_vocab[:10])
    print(de_vocab[100:110])
    print(de_vocab[1000:1010])
    print(de_vocab[-10:])
    
    def write_vocab_file(filepath, vocab):
      with open(filepath, 'w') as f:
        for token in vocab:
          print(token, file=f)
    
    write_vocab_file('de_vocab.txt', de_vocab)
    
    en_vocab = bert_vocab.bert_vocab_from_dataset(
        train_en.batch(1000).prefetch(2),
        **bert_vocab_args
    )
    
    print(en_vocab[:10])
    print(en_vocab[100:110])
    print(en_vocab[1000:1010])
    print(en_vocab[-10:])
    
    write_vocab_file('en_vocab.txt', en_vocab)
    
    de_tokenizer = text.BertTokenizer('de_vocab.txt', **bert_tokenizer_params)
    en_tokenizer = text.BertTokenizer('en_vocab.txt', **bert_tokenizer_params)
    
    # Tokenize the examples -> (batch, word, word-piece)
    token_batch = en_tokenizer.tokenize(en_examples)
    # Merge the word and word-piece axes -> (batch, tokens)
    token_batch = token_batch.merge_dims(-2,-1)
    
    for ex in token_batch.to_list():
      print(ex)
    
    # Lookup each token id in the vocabulary.
    txt_tokens = tf.gather(en_vocab, token_batch)
    # Join with spaces.
    tf.strings.reduce_join(txt_tokens, separator=' ', axis=-1)
    
    words = en_tokenizer.detokenize(token_batch)
    tf.strings.reduce_join(words, separator=' ', axis=-1)
    
    START = tf.argmax(tf.constant(reserved_tokens) == "[START]")
    END = tf.argmax(tf.constant(reserved_tokens) == "[END]")
    
    def add_start_end(ragged):
      count = ragged.bounding_shape()[0]
      starts = tf.fill([count,1], START)
      ends = tf.fill([count,1], END)
      return tf.concat([starts, ragged, ends], axis=1)
    
    words = en_tokenizer.detokenize(add_start_end(token_batch))
    tf.strings.reduce_join(words, separator=' ', axis=-1)
    
    def cleanup_text(reserved_tokens, token_txt):
      # Drop the reserved tokens, except for "[UNK]".
      bad_tokens = [re.escape(tok) for tok in reserved_tokens if tok != "[UNK]"]
      bad_token_re = "|".join(bad_tokens)
    
      bad_cells = tf.strings.regex_full_match(token_txt, bad_token_re)
      result = tf.ragged.boolean_mask(token_txt, ~bad_cells)
    
      # Join them into strings.
      result = tf.strings.reduce_join(result, separator=' ', axis=-1)
    
      return result
    
    token_batch = en_tokenizer.tokenize(en_examples).merge_dims(-2,-1)
    words = en_tokenizer.detokenize(token_batch)
    
    cleanup_text(reserved_tokens, words).numpy()
    
    class CustomTokenizer(tf.Module):
      def __init__(self, reserved_tokens, vocab_path):
        self.tokenizer = text.BertTokenizer(vocab_path, lower_case=True)
        self._reserved_tokens = reserved_tokens
        self._vocab_path = tf.saved_model.Asset(vocab_path)
    
        vocab = pathlib.Path(vocab_path).read_text().splitlines()
        self.vocab = tf.Variable(vocab)
    
        ## Create the signatures for export:
    
        # Include a tokenize signature for a batch of strings.
        self.tokenize.get_concrete_function(
            tf.TensorSpec(shape=[None], dtype=tf.string))
    
        # Include `detokenize` and `lookup` signatures for:
        #   * `Tensors` with shapes [tokens] and [batch, tokens]
        #   * `RaggedTensors` with shape [batch, tokens]
        self.detokenize.get_concrete_function(
            tf.TensorSpec(shape=[None, None], dtype=tf.int64))
        self.detokenize.get_concrete_function(
              tf.RaggedTensorSpec(shape=[None, None], dtype=tf.int64))
    
        self.lookup.get_concrete_function(
            tf.TensorSpec(shape=[None, None], dtype=tf.int64))
        self.lookup.get_concrete_function(
              tf.RaggedTensorSpec(shape=[None, None], dtype=tf.int64))
    
        # These `get_*` methods take no arguments
        self.get_vocab_size.get_concrete_function()
        self.get_vocab_path.get_concrete_function()
        self.get_reserved_tokens.get_concrete_function()
    
      @tf.function
      def tokenize(self, strings):
        enc = self.tokenizer.tokenize(strings)
        # Merge the `word` and `word-piece` axes.
        enc = enc.merge_dims(-2,-1)
        enc = add_start_end(enc)
        return enc
    
      @tf.function
      def detokenize(self, tokenized):
        words = self.tokenizer.detokenize(tokenized)
        return cleanup_text(self._reserved_tokens, words)
    
      @tf.function
      def lookup(self, token_ids):
        return tf.gather(self.vocab, token_ids)
    
      @tf.function
      def get_vocab_size(self):
        return tf.shape(self.vocab)[0]
    
      @tf.function
      def get_vocab_path(self):
        return self._vocab_path
    
      @tf.function
      def get_reserved_tokens(self):
        return tf.constant(self._reserved_tokens)
    
    tokenizers = tf.Module()
    # The tutorial's training code refers to `tokenizers.pt`; either rename that to
    # `tokenizers.de` there, or keep the attribute name `.pt` here to reuse it unchanged.
    tokenizers.de = CustomTokenizer(reserved_tokens, 'de_vocab.txt')
    tokenizers.en = CustomTokenizer(reserved_tokens, 'en_vocab.txt')
    
    model_name = 'wmt14_translate_de_en_converter'
    tf.saved_model.save(tokenizers, model_name)
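
    Once it is saved you can reload the exported tokenizers the same way the tutorial loads its pretrained converter and plug them into its data pipeline. A quick (equally untested) sanity check:

    reloaded = tf.saved_model.load(model_name)

    print(reloaded.en.get_vocab_size().numpy())
    tokens = reloaded.en.tokenize(['Hello TensorFlow!'])
    print(tokens.to_list())
    print(reloaded.en.detokenize(tokens).numpy())

    # A tokenize_pairs-style helper for the tutorial's tf.data pipeline,
    # with (de, en) in place of (pt, en):
    def tokenize_pairs(de, en):
      de = reloaded.de.tokenize(de).to_tensor()  # pad the ragged batch to a dense tensor
      en = reloaded.en.tokenize(en).to_tensor()
      return de, en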