Tags: machine-learning, spacy, training-data, named-entity-recognition

Python code for training an Arabic spaCy NER model not giving results or errors


This is the code to train a spaCy NER model. My dataset is a JSON file of Arabic tweets. I manually labeled the locations in my dataset with the https://dataturks.com annotation tool, but the code is not running.

I used the code from this link: https://dataturks.com/help/dataturks-ner-json-to-spacy-train.php
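Each line of the Dataturks export is one JSON object. Based on the fields the converter below reads (content, annotation, points, start, end, label), a line looks roughly like this (a made-up example, not taken from my file):

{"content": "سيول وادي رخيه", "annotation": [{"label": ["loc"], "points": [{"start": 5, "end": 13, "text": "وادي رخيه"}]}]}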

import json

############################################  NOTE  #######################################################
#
#           Creates NER training data in spaCy format from JSON downloaded from Dataturks.
#
#           Outputs the spaCy training data which can be used for spaCy training.
#
############################################################################################################
def convert_dataturks_to_spacy(dataturks_JSON_FilePath):
    training_data = []
    lines=[]
    with open(dataturks_JSON_FilePath, 'r') as f:
        lines = f.readlines()

        for line in lines:
            data = json.loads(line)
            text = data['content']
            entities = []
            annotations = data['annotation']
            if annotations:
                for annotation in annotations:
                    #only a single point in text annotation.
                    point = annotation['points'][0]
                    labels = annotation['label']

                    # handle both list of labels or a single label.
                    if not isinstance(labels, list):
                        labels = [labels]
                    #print(labels)
                    for label in labels:
                        #dataturks indices are both inclusive [start, end] but spacy is not [start, end)
                        entities.append((point['start'], point['end'] + 1, label))
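                        # e.g. a Dataturks span [4, 9] becomes the spaCy span (4, 10)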


                training_data.append((text, {"entities" : entities}))

    return training_data

Training data:

TRAIN_DATA = convert_dataturks_to_spacy("/content/drive/My Drive/Colab Notebooks/Name Entity Recognition/NERTweets.json")
TRAIN_DATA

Output of the first three tweets:

[('طقس حضرموت صور اوليه سيول وادي رخيه',
  {'entities': [(26, 35, 'loc'), (4, 10, 'city')]}),
 ('سيول وادي العف قرية هدى بمديرية حبان بمحافظة شبوة جنوب اليمن اليوم الاحد مايو م تصوير عدنان القميشي',
  {'entities': [(55, 60, 'country'),
    (50, 54, 'pre'),
    (45, 49, 'city'),
    (32, 36, 'loc'),
    (20, 23, 'loc'),
    (5, 14, 'loc')]}),
 ('اول مرة قابلته جدة جاها سيول', {'entities': [(15, 18, 'city')]})]
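Before training, I can slice each tweet with these offsets to confirm the spans line up with the intended words (a quick sanity check, not part of the tutorial code):

for text, annotations in TRAIN_DATA[:3]:
    # print each annotated span as recovered from the character offsets;
    # misaligned or overlapping spans would show up here before training
    for start, end, label in annotations["entities"]:
        print(label, "->", repr(text[start:end]))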

Then I train the spaCy NER model:

import spacy
import random
################### Train Spacy NER.###########
def train_spacy():
    TRAIN_DATA = convert_dataturks_to_spacy("/content/drive/My Drive/Colab Notebooks/Name Entity Recognition/NERTweets.json")
    nlp = spacy.blank('ar')  # create blank Language class
    # create the built-in pipeline components and add them to the pipeline
    # nlp.create_pipe works for built-ins that are registered with spaCy
    if 'ner' not in nlp.pipe_names:
        ner = nlp.create_pipe('ner')
        nlp.add_pipe(ner, last=True)

    # add labels
    for _, annotations in TRAIN_DATA:
        for ent in annotations.get('entities'):
            ner.add_label(ent[2])

    # get names of other pipes to disable them during training
    other_pipes = [pipe for pipe in nlp.pipe_names if pipe != 'ner']
    with nlp.disable_pipes(*other_pipes):  # only train NER
        optimizer = nlp.begin_training()
        for itn in range(1):
            print("Statring iteration " + str(itn))
            random.shuffle(TRAIN_DATA)
            losses = {}
            for text, annotations in TRAIN_DATA:
                nlp.update(
                    [text],  # batch of texts
                    [annotations],  # batch of annotations
                    drop=0.2,  # dropout - make it harder to memorise data
                    sgd=optimizer,  # callable to update weights
                    losses=losses)
            print(losses)

    # do prediction (note: the model is trained on Arabic, so an English
    # test sentence is unlikely to yield any entities)
    doc = nlp("Samsung mobiles below $100")
    print("Entities = " + str([str(ent.text) + "_" + str(ent.label_) for ent in doc.ents]))

train_spacy()

Output error:

Starting iteration 0
---------------------------------------------------------------------------
ValueError                                Traceback (most recent call last)
<ipython-input-8-6b61c2d740cf> in <module>()
----> 1 train_spacy()

2 frames
/usr/local/lib/python3.6/dist-packages/spacy/language.py in _format_docs_and_golds(self, docs, golds)
    470                     err = Errors.E151.format(unexp=unexpected, exp=expected_keys)
    471                     raise ValueError(err)
--> 472                 gold = GoldParse(doc, **gold)
    473             doc_objs.append(doc)
    474             gold_objs.append(gold)

gold.pyx in spacy.gold.GoldParse.__init__()

gold.pyx in spacy.gold.biluo_tags_from_offsets()

ValueError: [E103] Trying to set conflicting doc.ents: '(42, 47, 'loc')' and '(34, 47, 'loc')'. A token can only be part of one entity, so make sure the entities you're setting don't overlap.
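The E103 error means two annotated spans claim the same token. To find which tweets are affected, the offsets-to-BILUO conversion that spaCy runs internally can be replayed per example (a debugging sketch, assuming the spaCy v2 spacy.gold module that appears in the traceback):

import spacy
from spacy.gold import biluo_tags_from_offsets

nlp = spacy.blank('ar')
for i, (text, annotations) in enumerate(TRAIN_DATA):
    doc = nlp.make_doc(text)
    try:
        # raises the same E103 ValueError when entity spans overlap
        biluo_tags_from_offsets(doc, annotations["entities"])
    except ValueError as err:
        print(i, text)
        print(err)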

My results are uploaded to Google Drive at the link below. Where is the problem?

https://drive.google.com/drive/folders/19t33kW4Dwtbv6s4vfMpa2kNwoVNzSu5I


Solution

  • spaCy doesn't allow overlapping entities, so you should remove the overlapping ones; your code will be:

def convert_dataturks_to_spacy(dataturks_JSON_FilePath):
    training_data = []
    with open(dataturks_JSON_FilePath, 'r') as f:
        lines = f.readlines()
        for line in lines:
            data = json.loads(line)
            text = data['content']
            entities = []
            annotations = []
            for annotation in data['annotation'] or []:  # guard against untagged tweets
                # only a single point in text annotation.
                point = annotation['points'][0]
                label = annotation['label']
                # keep the span length so longer entities can be preferred
                annotations.append((point['start'], point['end'], label,
                                    point['end'] - point['start']))

            # sort by span length, longest first, so that when two spans
            # overlap the longer one wins and the shorter one is dropped
            annotations = sorted(annotations, key=lambda ann: ann[3], reverse=True)

            seen_tokens = set()
            for start, end, labels, _ in annotations:
                # dataturks 'end' is inclusive, so check and reserve the
                # characters up to and including it to rule out any overlap
                if start not in seen_tokens and end not in seen_tokens:
                    seen_tokens.update(range(start, end + 1))

                    # handle both a list of labels or a single label.
                    if not isinstance(labels, list):
                        labels = [labels]

                    for label in labels:
                        # dataturks indices are both inclusive [start, end],
                        # but spacy's are not: [start, end)
                        entities.append((start, end + 1, label))

            training_data.append((text, {"entities": entities}))

    return training_data
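With the overlapping spans filtered out, training should complete. A minimal sketch for saving and reloading the trained pipeline afterwards (assuming train_spacy() is changed to end with return nlp; "ar_ner_model" is a hypothetical output directory):

import spacy

nlp = train_spacy()          # assumes the function now ends with `return nlp`
nlp.to_disk("ar_ner_model")  # persist the trained pipeline

nlp2 = spacy.load("ar_ner_model")
doc = nlp2("سيول وادي رخيه")
print([(ent.text, ent.label_) for ent in doc.ents])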