My input data for the spaCy NER model is in the BILUO tagging scheme, and I need to keep it that way as part of a requirement. When I train the model without a minibatch (the commented-out part below), it works fine, but I cannot figure out how to use minibatch and GoldParse together to improve the model's accuracy. Are my expectations valid here? I could not find a single example of this combination. I have also already trained the model using the (start, end, label) offset format. Please help me figure out this section.
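For reference, the offset-format annotations I used earlier looked like this (character offsets computed by hand for the first sentence; shown only to contrast with the BILUO data):

data_offsets = [
    ('I am Shah Khan, I work in MS Co',
     {'entities': [(5, 14, 'Name'), (26, 31, 'ORG')]}),
]

My code is as below: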
import spacy
from spacy.gold import offsets_from_biluo_tags
from spacy.gold import biluo_tags_from_offsets
import random
from spacy.util import minibatch, compounding
from os import path
from tqdm import tqdm
def train_spacy(data, iterations, model=None):
TRAIN_DATA = data
print(f"downloads = {model}")
if model is not None and path.exists(model):
print(f"training existing model")
nlp = spacy.load(model)
print("Model is Loaded '%s'" % model)
else:
print(f"Creating new model")
nlp = spacy.blank('en') # create blank Language class
if 'ner' not in nlp.pipe_names:
ner = nlp.create_pipe('ner')
nlp.add_pipe(ner, last=True)
else:
ner = nlp.get_pipe('ner')
# Based on template, get labels and save those for further training
LABEL = ["Name", "ORG"]
for i in LABEL:
# print(i)
ner.add_label(i)
# get names of other pipes to disable them during training
other_pipes = [pipe for pipe in nlp.pipe_names if pipe != 'ner']
with nlp.disable_pipes(*other_pipes): # only train NER
if model is None:
optimizer = nlp.begin_training()
else:
optimizer = nlp.entity.create_optimizer()
tags = dict()
for itn in range(iterations):
print("Starting iteration " + str(itn))
random.shuffle(TRAIN_DATA)
losses = {}
# for text, annotations in tqdm(TRAIN_DATA):
# print(f"text={text}, an={annotations}")
# tags['entities'] = offsets_from_biluo_tags(nlp(text), annotations)
# print(f"a={tags}")
# nlp.update([text], # batch of texts
# [tags], # batch of annotations
# drop=0.5, # dropout - make it harder to memorise data
# sgd=optimizer, # callable to update weights
# losses=losses)
# print(losses)
batches = minibatch(TRAIN_DATA, size=compounding(4.0, 16.0, 1.001))
# type 2 with mini batch
for batch in batches:
texts, annotations = zip(*batch)
print(texts)
tags = {'entities': annotations}
nlp.update(
texts, # batch of texts
[tags], # batch of annotations
drop=0.4, # dropout - make it harder to memorise data
losses=losses,
sgd=optimizer
)
print(losses)
return nlp
data_biluo = [
('I am Shah Khan, I work in MS Co', ['O', 'O', 'B-Name', 'L-Name', 'O', 'O', 'O', 'B-ORG', 'L-ORG']),
('I am Tom Tomb, I work in Telecom Networks', ['O', 'O', 'B-Name', 'L-Name', 'O', 'O', 'O', 'B-ORG', 'L-ORG'])
]
model = train_spacy(data_biluo, 10)
model.to_disk('./Vectors/')
You have 2 problems with your minibatch:

1. tags should be an iterable of per-text annotations aligned with texts (one GoldParse per text works), not a single dict wrapped in a list.
2. data_biluo doesn't account for the , in the middle of the sentences, so each tag list is one token shorter than the tokenized text. You can verify this directly, as shown below.
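A quick check of the token/tag mismatch, using the same spaCy v2 API as the fix that follows:

import spacy
from spacy.gold import GoldParse

nlp = spacy.blank('en')
doc = nlp.make_doc('I am Shah Khan, I work in MS Co')
print([t.text for t in doc])
# ['I', 'am', 'Shah', 'Khan', ',', 'I', 'work', 'in', 'MS', 'Co'] -> 10 tokens,
# while the original tag list has only 9 entries: the ',' token needs its own 'O'
tags = ['O', 'O', 'B-Name', 'L-Name', 'O', 'O', 'O', 'O', 'B-ORG', 'L-ORG']
gold = GoldParse(doc, entities=tags)  # GoldParse accepts token-level BILUO tags

As soon as you correct for those you're fine to go: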
import spacy
from spacy.gold import GoldParse
from spacy.util import minibatch, compounding
from os import path
import random
def train_spacy(data, iterations, model=None):
TRAIN_DATA = data
print(f"downloads = {model}")
if model is not None and path.exists(model):
print(f"training existing model")
nlp = spacy.load(model)
print("Model is Loaded '%s'" % model)
else:
print(f"Creating new model")
nlp = spacy.blank('en') # create blank Language class
if 'ner' not in nlp.pipe_names:
ner = nlp.create_pipe('ner')
nlp.add_pipe(ner, last=True)
else:
ner = nlp.get_pipe('ner')
# Based on template, get labels and save those for further training
LABEL = ["Name", "ORG"]
for i in LABEL:
# print(i)
ner.add_label(i)
# get names of other pipes to disable them during training
other_pipes = [pipe for pipe in nlp.pipe_names if pipe != 'ner']
with nlp.disable_pipes(*other_pipes): # only train NER
if model is None:
optimizer = nlp.begin_training()
else:
optimizer = nlp.entity.create_optimizer()
for itn in range(iterations):
print("Starting iteration " + str(itn))
random.shuffle(TRAIN_DATA)
losses = {}
batches = minibatch(TRAIN_DATA, size=compounding(4.0, 16.0, 1.001))
# type 2 with mini batch
for batch in batches:
texts, _ = zip(*batch)
                golds = [GoldParse(nlp.make_doc(t), entities=a) for t, a in batch]  # one GoldParse per text; entities takes the BILUO tag list directly
nlp.update(
texts, # batch of texts
golds, # batch of annotations
drop=0.4, # dropout - make it harder to memorise data
losses=losses,
sgd=optimizer
)
print(losses)
return nlp
data_biluo = [
('I am Shah Khan, I work in MS Co', ['O', 'O', 'B-Name', 'L-Name', 'O', 'O', 'O', 'O', 'B-ORG', 'L-ORG']),
('I am Tom Tomb, I work in Telecom Networks', ['O', 'O', 'B-Name', 'L-Name', 'O', 'O', 'O', 'O', 'B-ORG', 'L-ORG'])
]
model = train_spacy(data_biluo, 10)

Output:
Starting iteration 0
{'ner': 17.999998331069946}
Starting iteration 1
{'ner': 16.6766300201416}
Starting iteration 2
{'ner': 16.997647166252136}
Starting iteration 3
{'ner': 16.486496448516846}
Starting iteration 4
{'ner': 15.695325374603271}
Starting iteration 5
{'ner': 14.312554001808167}
Starting iteration 6
{'ner': 12.099276185035706}
Starting iteration 7
{'ner': 11.473928153514862}
Starting iteration 8
{'ner': 8.814643770456314}
Starting iteration 9
{'ner': 7.233813941478729}
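With only two training sentences the loss won't reach zero, but you can spot-check the trained model on the training data itself (a minimal sanity check, not a proper evaluation; the commented line is what the model should produce once it has converged):

doc = model('I am Shah Khan, I work in MS Co')
print([(ent.text, ent.label_) for ent in doc.ents])
# e.g. [('Shah Khan', 'Name'), ('MS Co', 'ORG')]

And if you need character offsets elsewhere, spacy.gold.offsets_from_biluo_tags(nlp.make_doc(text), tags) converts a corrected BILUO tag list into (start, end, label) triples.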