I am trying to train a parser for custom semantics following the sample code from https://raw.githubusercontent.com/explosion/spaCy/master/examples/training/train_intent_parser.py
The idea is to get a non-projective parse, so that when I pass a text like: ROOT AAAA BBBB 12 21
12 becomes a child of AAAA and 21 becomes a child of BBBB. To test this, I am training on only this case and testing on this same case, but it doesn't seem to work; what I get as a response is:
[('ROOT', 'ROOT', 'ROOT'), ('AAAA', 'LETTERS', 'ROOT'), ('BBBB', 'LETTERS', 'ROOT'), ('12', 'NUMBERS', 'BBBB'), ('21', 'NUMBERS', 'BBBB')]
As you can see, both numbers are dependent on BBBB, when 12 should be dependent on AAAA.
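To make the structure I'm after concrete, here is the target tree sketched with displacy's manual mode (illustration only; the arcs just mirror the heads/deps in the training data below):

# Illustration only: render the *intended* non-projective tree with
# displacy's manual mode.
from spacy import displacy

tree = {
    "words": [{"text": w, "tag": ""} for w in "ROOT AAAA BBBB 12 21".split()],
    "arcs": [
        {"start": 0, "end": 1, "label": "LETTERS", "dir": "right"},
        {"start": 0, "end": 2, "label": "LETTERS", "dir": "right"},
        {"start": 1, "end": 3, "label": "NUMBERS", "dir": "right"},
        {"start": 2, "end": 4, "label": "NUMBERS", "dir": "right"},
    ],
}
html = displacy.render(tree, style="dep", manual=True)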
The code I am using to train and test is:
import plac
import random
import spacy
from spacy.util import minibatch, compounding

TRAIN_DATA = list()
samples = 1000
for _ in range(samples):
    sample = (
        'ROOT AAAA BBBB 12 21',
        {
            'heads': [0, 0, 0, 1, 2],
            'deps': ['ROOT', 'LETTERS', 'LETTERS', 'NUMBERS', 'NUMBERS']
        }
    )
    TRAIN_DATA.append(sample)

def test_model(nlp):
    texts = ['ROOT AAAA BBBB 12 21']
    docs = nlp.pipe(texts)
    for doc in docs:
        print(doc.text)
        print([(t.text, t.dep_, t.head.text) for t in doc if t.dep_ != "-"])

@plac.annotations(
    model=("Model name. Defaults to blank 'en' model.", "option", "m", str),
    n_iter=("Number of training iterations", "option", "n", int),
)
# Just in case, I am using the German model since it supports pseudo-projective parsing (https://explosion.ai/blog/german-model#word-order)
def main(model='de_core_news_sm', n_iter=15):
    """Load the model, set up the pipeline and train the parser."""
    if model is not None:
        nlp = spacy.load(model)  # load existing spaCy model
        print("Loaded model '%s'" % model)
    else:
        nlp = spacy.blank("en")  # create blank Language class
        print("Created blank 'en' model")
    # We'll use the built-in dependency parser class, but we want to create a
    # fresh instance – just in case.
    if "parser" in nlp.pipe_names:
        nlp.remove_pipe("parser")
    parser = nlp.create_pipe("parser")
    nlp.add_pipe(parser, first=True)
    for text, annotations in TRAIN_DATA:
        for dep in annotations.get("deps", []):
            parser.add_label(dep)
    pipe_exceptions = ["parser", "trf_wordpiecer", "trf_tok2vec"]
    other_pipes = [pipe for pipe in nlp.pipe_names if pipe not in pipe_exceptions]
    with nlp.disable_pipes(*other_pipes):  # only train parser
        optimizer = nlp.begin_training()
        for itn in range(n_iter):
            random.shuffle(TRAIN_DATA)
            losses = {}
            # batch up the examples using spaCy's minibatch
            batches = minibatch(TRAIN_DATA, size=compounding(4.0, 32.0, 1.001))
            for batch in batches:
                texts, annotations = zip(*batch)
                nlp.update(texts, annotations, sgd=optimizer, losses=losses)
            print("Losses", losses)
    # test the trained model
    test_model(nlp)

if __name__ == "__main__":
    plac.call(main)
So, what am I doing wrong?
Thank you in advance for any help on this!
The problem is that the simple training example script isn't projectivizing the training instances when initializing and training the model. The parsing algorithm itself can only handle projective parses, but if the parser component finds projectivized labels in its output, they're deprojectivized in a postprocessing step. You don't need to modify any parser settings (so starting with a German model makes no difference); just provide projectivized input in the right format.
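To see what the projectivization does to this particular example, you can build a GoldParse with make_projective=True and inspect its heads and labels (a minimal sketch against spaCy v2.x; the exact augmented label strings it prints are an implementation detail):

# Minimal sketch (spaCy v2.x): inspect how projectivization rewrites the
# non-projective gold parse into a projective one with augmented labels.
import spacy
from spacy.gold import GoldParse

nlp = spacy.blank("xx")
doc = nlp.make_doc("ROOT AAAA BBBB 12 21")
gold = GoldParse(
    doc,
    heads=[0, 0, 0, 1, 2],
    deps=["ROOT", "LETTERS", "LETTERS", "NUMBERS", "NUMBERS"],
    make_projective=True,
)
# One of the crossing arcs is lifted to a projective head, and its label
# is augmented so that the lift can be undone in postprocessing.
print(gold.heads)
print(gold.labels)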
The initial projectivization is handled automatically by the train CLI, which uses GoldCorpus.train_docs() to prepare the training examples for nlp.update() and sets make_projective=True when creating the GoldParses. In general, I'd recommend switching to the train CLI (which also requires switching to the internal JSON training format, which is admittedly a minor hassle), because the train CLI sets a lot of better defaults.
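For reference, a single training doc in that JSON format would look roughly like the sketch below. This is hand-written for illustration, not output from spaCy: note that "head" here is an offset relative to the token (not an absolute index), and real training files usually also carry "tag" and "ner" fields.

# Rough, hand-written sketch of one doc in spaCy v2's JSON training format.
# "head" values are relative offsets (head index minus token index).
import json

train_doc = {
    "id": 0,
    "paragraphs": [{
        "raw": "ROOT AAAA BBBB 12 21",
        "sentences": [{
            "tokens": [
                {"id": 0, "orth": "ROOT", "head": 0, "dep": "ROOT"},
                {"id": 1, "orth": "AAAA", "head": -1, "dep": "LETTERS"},
                {"id": 2, "orth": "BBBB", "head": -2, "dep": "LETTERS"},
                {"id": 3, "orth": "12", "head": -2, "dep": "NUMBERS"},
                {"id": 4, "orth": "21", "head": -2, "dep": "NUMBERS"},
            ]
        }]
    }]
}

with open("train.json", "w") as f:
    json.dump([train_doc], f, indent=2)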
However, a toy example also works fine as long as you create projectivized training examples (with GoldParse(make_projective=True)), add all the projectivized dependency labels to the parser, and train with Doc and the projectivized GoldParse input instead of the text/annotation input:
# tested with spaCy v2.2.4
import plac   # needed for @plac.annotations / plac.call below
import random  # needed for random.shuffle below
import spacy
from spacy.util import minibatch, compounding
from spacy.gold import GoldParse

TRAIN_DATA = [
    (
        'ROOT AAAA BBBB 12 21',
        {
            'heads': [0, 0, 0, 1, 2],
            'deps': ['ROOT', 'LETTERS', 'LETTERS', 'NUMBERS', 'NUMBERS']
        }
    )
]
samples = 200

def test_model(nlp):
    texts = ["ROOT AAAA BBBB 12 21"]
    for doc in nlp.pipe(texts):
        print(doc.text)
        print([(t.text, t.dep_, t.head.text) for t in doc if t.dep_ != "-"])
        spacy.displacy.serve(doc)

@plac.annotations(
    n_iter=("Number of training iterations", "option", "n", int),
)
def main(n_iter=10):
    """Load the model, set up the pipeline and train the parser."""
    nlp = spacy.blank("xx")
    parser = nlp.create_pipe("parser")
    nlp.add_pipe(parser)
    docs_golds = []
    for text, annotation in TRAIN_DATA:
        doc = nlp.make_doc(text)
        gold = GoldParse(doc, **annotation, make_projective=True)
        # add the projectivized labels
        for dep in gold.labels:
            parser.add_label(dep)
        docs_golds.append((doc, gold))
    # duplicate the training instances
    docs_golds = docs_golds * samples
    pipe_exceptions = ["parser", "trf_wordpiecer", "trf_tok2vec"]
    other_pipes = [pipe for pipe in nlp.pipe_names if pipe not in pipe_exceptions]
    with nlp.disable_pipes(*other_pipes):  # only train parser
        optimizer = nlp.begin_training(min_action_freq=1)
        for itn in range(n_iter):
            random.shuffle(docs_golds)
            losses = {}
            # batch up the examples using spaCy's minibatch
            batches = minibatch(docs_golds, size=compounding(4.0, 32.0, 1.001))
            for batch in batches:
                docs, golds = zip(*batch)
                nlp.update(docs, golds, sgd=optimizer, losses=losses)
            print("Losses", losses)
    # test the trained model
    test_model(nlp)

if __name__ == "__main__":
    plac.call(main)
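If you'd rather verify the result programmatically than eyeball the displacy output, a small sanity check along these lines (my sketch, not part of the original script) can run at the end of test_model or after training:

# Sketch of an extra sanity check: compare the predicted heads against
# the gold heads from TRAIN_DATA.
doc = nlp("ROOT AAAA BBBB 12 21")
predicted_heads = [t.head.i for t in doc]
expected_heads = TRAIN_DATA[0][1]["heads"]
print(predicted_heads, expected_heads, predicted_heads == expected_heads)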