Tags: spacy, dependency-parsing

How do I train a pseudo-projective parser on spaCy?


I am trying to train a parser for custom semantics following the sample code from https://raw.githubusercontent.com/explosion/spaCy/master/examples/training/train_intent_parser.py. The idea is to get a non-projective parse, so that when I pass a text like ROOT AAAA BBBB 12 21, 12 becomes a child of AAAA and 21 becomes a child of BBBB. To test this, I am training on only this case and testing on this same case, but it doesn't seem to work; what I get as a response is:

[('ROOT', 'ROOT', 'ROOT'), ('AAAA', 'LETTERS', 'ROOT'), ('BBBB', 'LETTERS', 'ROOT'), ('12', 'NUMBERS', 'BBBB'), ('21', 'NUMBERS', 'BBBB')]

As you can see both numbers are dependent on BBBB when 12 should be dependent on AAAA.
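
The output I would expect is:

[('ROOT', 'ROOT', 'ROOT'), ('AAAA', 'LETTERS', 'ROOT'), ('BBBB', 'LETTERS', 'ROOT'), ('12', 'NUMBERS', 'AAAA'), ('21', 'NUMBERS', 'BBBB')]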

The code I am using to train and test is:

import plac
import random
import spacy
from spacy.util import minibatch, compounding

TRAIN_DATA = list()

samples = 1000
for _ in range(samples):
    sample = (
        'ROOT AAAA BBBB 12 21',
        {
            'heads': [0, 0, 0, 1, 2],
            'deps': ['ROOT', 'LETTERS', 'LETTERS', 'NUMBERS', 'NUMBERS']
        }
    )
    TRAIN_DATA.append(sample)

def test_model(nlp):
    texts = ['ROOT AAAA BBBB 12 21']
    docs = nlp.pipe(texts)
    for doc in docs:
        print(doc.text)
        print([(t.text, t.dep_, t.head.text) for t in doc if t.dep_ != "-"])

@plac.annotations(
    model=("Model name. Defaults to blank 'en' model.", "option", "m", str),
    n_iter=("Number of training iterations", "option", "n", int),
)

# Just in case, I am using the German model, since it supports pseudo-projective parsing (https://explosion.ai/blog/german-model#word-order)
def main(model='de_core_news_sm', n_iter=15):
    """Load the model, set up the pipeline and train the parser."""
    if model is not None:
        nlp = spacy.load(model)  # load existing spaCy model
        print("Loaded model '%s'" % model)
    else:
        nlp = spacy.blank("en")  # create blank Language class
        print("Created blank 'en' model")

    # We'll use the built-in dependency parser class, but we want to create a
    # fresh instance – just in case.
    if "parser" in nlp.pipe_names:
        nlp.remove_pipe("parser")
    parser = nlp.create_pipe("parser")
    nlp.add_pipe(parser, first=True)

    for text, annotations in TRAIN_DATA:
        for dep in annotations.get("deps", []):
            parser.add_label(dep)

    pipe_exceptions = ["parser", "trf_wordpiecer", "trf_tok2vec"]
    other_pipes = [pipe for pipe in nlp.pipe_names if pipe not in pipe_exceptions]
    with nlp.disable_pipes(*other_pipes):  # only train parser
        optimizer = nlp.begin_training()
        for itn in range(n_iter):
            random.shuffle(TRAIN_DATA)
            losses = {}
            # batch up the examples using spaCy's minibatch
            batches = minibatch(TRAIN_DATA, size=compounding(4.0, 32.0, 1.001))
            for batch in batches:
                texts, annotations = zip(*batch)
                nlp.update(texts, annotations, sgd=optimizer, losses=losses)
            print("Losses", losses)

    # test the trained model
    test_model(nlp)

if __name__ == "__main__":
    plac.call(main)

So, what am I doing wrong?

Thank you in advance for any help on this!


Solution

  • The problem is that the simple training example script isn't projectivizing the training instances when initializing and training the model. The parsing algorithm itself can only handle projective parses, but if the parser component finds projectivized labels in its output, they're deprojectivized in a postprocessing step. You don't need to modify any parser settings (so starting with a German model makes no difference): just provide projectivized input in the right format.
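
    If you want to see what projectivization does to this example, you can call the helpers in spacy.syntax.nonproj directly. Note that this is an internal module, so treat the following as a sketch that may break in other versions:

    from spacy.syntax.nonproj import is_nonproj_tree, projectivize

    heads = [0, 0, 0, 1, 2]
    labels = ['ROOT', 'LETTERS', 'LETTERS', 'NUMBERS', 'NUMBERS']

    # the arcs AAAA -> 12 and BBBB -> 21 cross, so this tree is non-projective
    print(is_nonproj_tree(heads))  # True

    # projectivize() re-attaches crossing arcs and records the original
    # attachment in decorated labels (they contain '||'); the parser is
    # trained on these decorated labels, and the original arcs are restored
    # in the deprojectivization postprocessing step
    proj_heads, proj_labels = projectivize(heads, labels)
    print(proj_heads)
    print(proj_labels)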

    The initial projectivization is handled automatically by the train CLI, which uses GoldCorpus.train_docs() to prepare the training examples for nlp.update() and sets make_projective=True when creating the GoldParses. In general, I'd recommend switching to the train CLI (which also requires switching to the internal JSON training format, admittedly a minor hassle), because it sets a lot of better defaults.
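
    For reference, a single training example in the internal JSON format would look roughly like the sketch below, written here as a Python literal so it can be dumped with json. This is only a sketch based on the v2 docs ("JSON input format for training"): the "head" values are offsets relative to the current token, not absolute indices, and fields like "tag" or "ner" may also be expected depending on your pipeline, so double-check the schema before using it.

    import json

    train_json = [{
        "id": 0,
        "paragraphs": [{
            "raw": "ROOT AAAA BBBB 12 21",
            "sentences": [{
                "tokens": [
                    {"id": 0, "orth": "ROOT", "head": 0, "dep": "ROOT"},
                    {"id": 1, "orth": "AAAA", "head": -1, "dep": "LETTERS"},
                    {"id": 2, "orth": "BBBB", "head": -2, "dep": "LETTERS"},
                    {"id": 3, "orth": "12", "head": -2, "dep": "NUMBERS"},
                    {"id": 4, "orth": "21", "head": -2, "dep": "NUMBERS"},
                ]
            }]
        }]
    }]

    with open("train.json", "w") as f:
        json.dump(train_json, f)

    # then train with something like:
    #   python -m spacy train xx ./output train.json dev.json --pipeline parser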

    However, a toy example also works fine as long as you create projectivized training examples (with GoldParse(make_projective=True)), add all the projectivized dependency labels to the parser, and train with the Doc and projectivized GoldParse pairs instead of the text/annotation input:

    # tested with spaCy v2.2.4
    import plac
    import random
    import spacy
    from spacy.util import minibatch, compounding
    from spacy.gold import GoldParse
    
    TRAIN_DATA = [
        (
            'ROOT AAAA BBBB 12 21',
            {
                'heads': [0, 0, 0, 1, 2],
                'deps': ['ROOT', 'LETTERS', 'LETTERS', 'NUMBERS', 'NUMBERS']
            }
        )
    ]
    
    samples = 200
    
    def test_model(nlp):
        texts = ["ROOT AAAA BBBB 12 21"]
        for doc in nlp.pipe(texts):
            print(doc.text)
            print([(t.text, t.dep_, t.head.text) for t in doc if t.dep_ != "-"])
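            # note: serve() starts a local web server to render the parse
            # with displacy and blocks until you stop it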
            spacy.displacy.serve(doc)
    
    @plac.annotations(
        n_iter=("Number of training iterations", "option", "n", int),
    )
    
    def main(n_iter=10):
        """Load the model, set up the pipeline and train the parser."""
        nlp = spacy.blank("xx")
        parser = nlp.create_pipe("parser")
        nlp.add_pipe(parser)
    
        docs_golds = []
        for text, annotation in TRAIN_DATA:
            doc = nlp.make_doc(text)
            gold = GoldParse(doc, **annotation, make_projective=True)
            # add the projectivized labels
            for dep in gold.labels:
                parser.add_label(dep)
            docs_golds.append((doc, gold))
        # duplicate the training instances
        docs_golds = docs_golds * samples
    
        pipe_exceptions = ["parser", "trf_wordpiecer", "trf_tok2vec"]
        other_pipes = [pipe for pipe in nlp.pipe_names if pipe not in pipe_exceptions]
        with nlp.disable_pipes(*other_pipes):  # only train parser
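            # min_action_freq=1 keeps low-frequency labels from being pruned
            # as parser actions, which matters for a tiny toy dataset like this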
            optimizer = nlp.begin_training(min_action_freq=1)
            for itn in range(n_iter):
                random.shuffle(docs_golds)
                losses = {}
                # batch up the examples using spaCy's minibatch
                batches = minibatch(docs_golds, size=compounding(4.0, 32.0, 1.001))
                for batch in batches:
                    docs, golds = zip(*batch)
                    nlp.update(docs, golds, sgd=optimizer, losses=losses)
                print("Losses", losses)
    
        # test the trained model
        test_model(nlp)
    
    if __name__ == "__main__":
        plac.call(main)
    

    [Image: displacy visualization of the resulting non-projective parse]