This is the code to train the spacy model for NER. My dataset is Arabic tweets JSON file. I tagged location manually labeled in my dataset by https://dataturks.com machine learning tools but the code is not running.
I used code from this link https://dataturks.com/help/dataturks-ner-json-to-spacy-train.php
############################################ NOTE ########################################################
#
# Creates NER training data in Spacy format from JSON downloaded from Dataturks.
#
# Outputs the Spacy training data which can be used for Spacy training.
#
############################################################################################################
############################################################################################################
def convert_dataturks_to_spacy(dataturks_JSON_FilePath):
training_data = []
lines=[]
with open(dataturks_JSON_FilePath, 'r') as f:
lines = f.readlines()
for line in lines:
data = json.loads(line)
text = data['content']
entities = []
annotations = data['annotation']
if annotations:
for annotation in annotations:
#only a single point in text annotation.
point = annotation['points'][0]
labels = annotation['label']
# handle both list of labels or a single label.
if not isinstance(labels, list):
labels = [labels]
#print(labels)
for label in labels:
#dataturks indices are both inclusive [start, end] but spacy is not [start, end)
entities.append((point['start'], point['end'] + 1 ,label))
training_data.append((text, {"entities" : entities}))
return training_data
train data
TRAIN_DATA = convert_dataturks_to_spacy("/content/drive/My Drive/Colab Notebooks/Name Entity Recognition/NERTweets.json")
TRAIN_DATA
output of the first three tweets
[('طقس حضرموت صور اوليه سيول وادي رخيه',
{'entities': [(26, 35, 'loc'), (4, 10, 'city')]}),
('سيول وادي العف قرية هدى بمديرية حبان بمحافظة شبوة جنوب اليمن اليوم الاحد مايو م تصوير عدنان القميشي',
{'entities': [(55, 60, 'country'),
(50, 54, 'pre'),
(45, 49, 'city'),
(32, 36, 'loc'),
(20, 23, 'loc'),
(5, 14, 'loc')]}),
('اول مرة قابلته جدة جاها سيول', {'entities': [(15, 18, 'city')]})]
then the train spacey NER model
import spacy
import random
################### Train Spacy NER.###########
def train_spacy():
TRAIN_DATA = convert_dataturks_to_spacy("/content/drive/My Drive/Colab Notebooks/Name Entity Recognition/NERTweets.json");
nlp = spacy.blank('ar') # create blank Language class
# create the built-in pipeline components and add them to the pipeline
# nlp.create_pipe works for built-ins that are registered with spaCy
if 'ner' not in nlp.pipe_names:
ner = nlp.create_pipe('ner')
nlp.add_pipe(ner, last=True)
# add labels
for _, annotations in TRAIN_DATA:
for ent in annotations.get('entities'):
ner.add_label(ent[2])
# get names of other pipes to disable them during training
other_pipes = [pipe for pipe in nlp.pipe_names if pipe != 'ner']
with nlp.disable_pipes(*other_pipes): # only train NER
optimizer = nlp.begin_training()
for itn in range(1):
print("Statring iteration " + str(itn))
random.shuffle(TRAIN_DATA)
losses = {}
for text, annotations in TRAIN_DATA:
nlp.update(
[text], # batch of texts
[annotations], # batch of annotations
drop=0.2, # dropout - make it harder to memorise data
sgd=optimizer, # callable to update weights
losses=losses)
print(losses)
#do prediction
doc = nlp("Samsing mobiles below $100")
print ("Entities= " + str(["" + str(ent.text) + "_" + str(ent.label_) for ent in doc.ents]))
train_spacy
output error
Statring iteration 0
---------------------------------------------------------------------------
ValueError Traceback (most recent call last)
<ipython-input-8-6b61c2d740cf> in <module>()
----> 1 train_spacy()
2 frames
/usr/local/lib/python3.6/dist-packages/spacy/language.py in _format_docs_and_golds(self, docs, golds)
470 err = Errors.E151.format(unexp=unexpected, exp=expected_keys)
471 raise ValueError(err)
--> 472 gold = GoldParse(doc, **gold)
473 doc_objs.append(doc)
474 gold_objs.append(gold)
gold.pyx in spacy.gold.GoldParse.__init__()
gold.pyx in spacy.gold.biluo_tags_from_offsets()
ValueError: [E103] Trying to set conflicting doc.ents: '(42, 47, 'loc')' and '(34, 47, 'loc')'. A token can only be part of one entity, so make sure the entities you're setting don't overlap.
my results upload on colab google in the link below. where is the problem?
https://drive.google.com/drive/folders/19t33kW4Dwtbv6s4vfMpa2kNwoVNzSu5I
spacy doesn't allow overlapping entities,you should remove the overlapping entities your code it will be:
def convert_dataturks_to_spacy(dataturks_JSON_FilePath):
training_data = []
lines=[]
with open(dataturks_JSON_FilePath, 'r') as f:
lines = f.readlines()
for line in lines:
#line=lines[0]
data = json.loads(line)
text = data['content']
entities = []
annotations=[]
for annotation in data['annotation']:
point = annotation['points'][0]
label = annotation['label']
annotations.append((point['start'], point['end'] ,label,point['end']-point['start']))
annotations=sorted(annotations, key=lambda student: student[3],reverse=True)
seen_tokens = set()
for annotation in annotations:
start=annotation[0]
end=annotation[1]
labels=annotation[2]
if start not in seen_tokens and end - 1 not in seen_tokens:
seen_tokens.update(range(start, end))
if not isinstance(labels, list):
labels = [labels]
for label in labels:
#dataturks indices are both inclusive [start, end] but spacy is not [start, end)
entities.append((start, end+1 ,label))
training_data.append((text, {"entities" : entities})