My input data for the spaCy NER model is in the BILUO tagging scheme, and I need to keep it that way as part of a requirement. When I train the model without a minibatch (the commented-out part below), it works fine, but I cannot figure out how to use minibatch and GoldParse together to improve the model's accuracy. Are my expectations valid here? I could not find a single example of this combination. I have also already trained the model using the (start, end, label) offset format. Please help me figure out this section.
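For reference, the offset-format annotations I used earlier looked like this (character offsets computed by hand for the first sentence; shown only to contrast with the BILUO data):

data_offsets = [
    ('I am Shah Khan, I work in MS Co',
     {'entities': [(5, 14, 'Name'), (26, 31, 'ORG')]}),
]

My code is as below: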
import spacy
from spacy.gold import offsets_from_biluo_tags
from spacy.gold import biluo_tags_from_offsets
import random
from spacy.util import minibatch, compounding
from os import path
from tqdm import tqdm
def train_spacy(data, iterations, model=None):
TRAIN_DATA = data
print(f"downloads = {model}")
if model is not None and path.exists(model):
print(f"training existing model")
nlp = spacy.load(model)
print("Model is Loaded '%s'" % model)
else:
print(f"Creating new model")
nlp = spacy.blank('en') # create blank Language class
if 'ner' not in nlp.pipe_names:
ner = nlp.create_pipe('ner')
nlp.add_pipe(ner, last=True)
else:
ner = nlp.get_pipe('ner')
# Based on template, get labels and save those for further training
LABEL = ["Name", "ORG"]
for i in LABEL:
# print(i)
ner.add_label(i)
# get names of other pipes to disable them during training
other_pipes = [pipe for pipe in nlp.pipe_names if pipe != 'ner']
with nlp.disable_pipes(*other_pipes): # only train NER
if model is None:
optimizer = nlp.begin_training()
else:
optimizer = nlp.entity.create_optimizer()
tags = dict()
for itn in range(iterations):
print("Starting iteration " + str(itn))
random.shuffle(TRAIN_DATA)
losses = {}
# for text, annotations in tqdm(TRAIN_DATA):
# print(f"text={text}, an={annotations}")
# tags['entities'] = offsets_from_biluo_tags(nlp(text), annotations)
# print(f"a={tags}")
# nlp.update([text], # batch of texts
# [tags], # batch of annotations
# drop=0.5, # dropout - make it harder to memorise data
# sgd=optimizer, # callable to update weights
# losses=losses)
# print(losses)
batches = minibatch(TRAIN_DATA, size=compounding(4.0, 16.0, 1.001))
# type 2 with mini batch
for batch in batches:
texts, annotations = zip(*batch)
print(texts)
tags = {'entities': annotations}
nlp.update(
texts, # batch of texts
[tags], # batch of annotations
drop=0.4, # dropout - make it harder to memorise data
losses=losses,
sgd=optimizer
)
print(losses)
return nlp
data_biluo = [
('I am Shah Khan, I work in MS Co', ['O', 'O', 'B-Name', 'L-Name', 'O', 'O', 'O', 'B-ORG', 'L-ORG']),
('I am Tom Tomb, I work in Telecom Networks', ['O', 'O', 'B-Name', 'L-Name', 'O', 'O', 'O', 'B-ORG', 'L-ORG'])
]
model = train_spacy(data_biluo, 10)
model.to_disk('./Vectors/')
You have 2 problems with your minibatch:

1. tags should be an iterable of per-text annotations aligned with texts (one GoldParse per text works), not a single dict wrapped in a list.
2. data_biluo doesn't account for the , in the middle of the sentences, so each tag list is one token shorter than the tokenized text. You can verify this directly, as shown below.
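A quick check of the token/tag mismatch, using the same spaCy v2 API as the fix that follows:

import spacy
from spacy.gold import GoldParse

nlp = spacy.blank('en')
doc = nlp.make_doc('I am Shah Khan, I work in MS Co')
print([t.text for t in doc])
# ['I', 'am', 'Shah', 'Khan', ',', 'I', 'work', 'in', 'MS', 'Co'] -> 10 tokens,
# while the original tag list has only 9 entries: the ',' token needs its own 'O'
tags = ['O', 'O', 'B-Name', 'L-Name', 'O', 'O', 'O', 'O', 'B-ORG', 'L-ORG']
gold = GoldParse(doc, entities=tags)  # GoldParse accepts token-level BILUO tags

As soon as you correct for those you're fine to go: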
import spacy
from spacy.gold import GoldParse
from spacy.util import minibatch, compounding
from os import path
import random
def train_spacy(data, iterations, model=None):
TRAIN_DATA = data
print(f"downloads = {model}")
if model is not None and path.exists(model):
print(f"training existing model")
nlp = spacy.load(model)
print("Model is Loaded '%s'" % model)
else:
print(f"Creating new model")
nlp = spacy.blank('en') # create blank Language class
if 'ner' not in nlp.pipe_names:
ner = nlp.create_pipe('ner')
nlp.add_pipe(ner, last=True)
else:
ner = nlp.get_pipe('ner')
# Based on template, get labels and save those for further training
LABEL = ["Name", "ORG"]
for i in LABEL:
# print(i)
ner.add_label(i)
# get names of other pipes to disable them during training
other_pipes = [pipe for pipe in nlp.pipe_names if pipe != 'ner']
with nlp.disable_pipes(*other_pipes): # only train NER
if model is None:
optimizer = nlp.begin_training()
else:
optimizer = nlp.entity.create_optimizer()
for itn in range(iterations):
print("Starting iteration " + str(itn))
random.shuffle(TRAIN_DATA)
losses = {}
batches = minibatch(TRAIN_DATA, size=compounding(4.0, 16.0, 1.001))
# type 2 with mini batch
for batch in batches:
texts, _ = zip(*batch)
                golds = [GoldParse(nlp.make_doc(t), entities=a) for t, a in batch]  # one GoldParse per text; entities takes the BILUO tag list directly
nlp.update(
texts, # batch of texts
golds, # batch of annotations
drop=0.4, # dropout - make it harder to memorise data
losses=losses,
sgd=optimizer
)
print(losses)
return nlp
data_biluo = [
('I am Shah Khan, I work in MS Co', ['O', 'O', 'B-Name', 'L-Name', 'O', 'O', 'O', 'O', 'B-ORG', 'L-ORG']),
('I am Tom Tomb, I work in Telecom Networks', ['O', 'O', 'B-Name', 'L-Name', 'O', 'O', 'O', 'O', 'B-ORG', 'L-ORG'])
]
model = train_spacy(data_biluo, 10)

Output:
Starting iteration 0
{'ner': 17.999998331069946}
Starting iteration 1
{'ner': 16.6766300201416}
Starting iteration 2
{'ner': 16.997647166252136}
Starting iteration 3
{'ner': 16.486496448516846}
Starting iteration 4
{'ner': 15.695325374603271}
Starting iteration 5
{'ner': 14.312554001808167}
Starting iteration 6
{'ner': 12.099276185035706}
Starting iteration 7
{'ner': 11.473928153514862}
Starting iteration 8
{'ner': 8.814643770456314}
Starting iteration 9
{'ner': 7.233813941478729}
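With only two training sentences the loss won't reach zero, but you can spot-check the trained model on the training data itself (a minimal sanity check, not a proper evaluation; the commented line is what the model should produce once it has converged):

doc = model('I am Shah Khan, I work in MS Co')
print([(ent.text, ent.label_) for ent in doc.ents])
# e.g. [('Shah Khan', 'Name'), ('MS Co', 'ORG')]

And if you need character offsets elsewhere, spacy.gold.offsets_from_biluo_tags(nlp.make_doc(text), tags) converts a corrected BILUO tag list into (start, end, label) triples.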