This is the code I use to train a spaCy model for NER. My dataset is a JSON file of Arabic tweets. I manually labeled the locations in my dataset using a machine-learning annotation tool, but the code does not run.
I used the code from this link:
############################################ NOTE ########################################################
# Creates NER training data in Spacy format from JSON downloaded from Dataturks.
# Outputs the Spacy training data which can be used for Spacy training.
def convert_dataturks_to_spacy(dataturks_JSON_FilePath):
    """Convert a Dataturks NER export into spaCy-style training tuples.

    The file is read as JSON Lines: every non-blank line is parsed as one
    JSON object with a ``content`` string and an ``annotation`` list (or
    null). Only the first entry of each annotation's ``points`` is used.

    Args:
        dataturks_JSON_FilePath: Path to the Dataturks JSON export.

    Returns:
        A list of ``(text, {"entities": [(start, end, label), ...]})``
        tuples, one per input line, with ``end`` exclusive.

    Note:
        Overlapping annotations are passed through unchanged; callers that
        need non-overlapping spans must filter them.
    """
    # Local import: the surrounding snippet has no module-level `import json`.
    import json

    training_data = []
    # Explicit UTF-8 so non-ASCII (e.g. Arabic) text decodes the same way
    # regardless of the platform's default locale encoding.
    with open(dataturks_JSON_FilePath, 'r', encoding='utf-8') as f:
        for line in f:
            # A trailing or stray blank line is not a JSON document; skip it
            # instead of letting json.loads raise.
            if not line.strip():
                continue
            data = json.loads(line)
            text = data['content']
            entities = []
            # `annotation` may be null for untagged items.
            for annotation in data['annotation'] or []:
                # only a single point in text annotation.
                point = annotation['points'][0]
                labels = annotation['label']
                # handle both list of labels or a single label.
                if not isinstance(labels, list):
                    labels = [labels]
                for label in labels:
                    # dataturks indices are both inclusive [start, end]
                    # but spacy is not [start, end)
                    entities.append((point['start'], point['end'] + 1, label))
            training_data.append((text, {"entities": entities}))
    return training_data
Training data:
# Build the spaCy-format training set from the Dataturks JSON export.
TRAIN_DATA = convert_dataturks_to_spacy("/content/drive/My Drive/Colab Notebooks/Name Entity Recognition/NERTweets.json")
Output for the first three tweets:
[('طقس حضرموت صور اوليه سيول وادي رخيه',
{'entities': [(26, 35, 'loc'), (4, 10, 'city')]}),
('سيول وادي العف قرية هدى بمديرية حبان بمحافظة شبوة جنوب اليمن اليوم الاحد مايو م تصوير عدنان القميشي',
{'entities': [(55, 60, 'country'),
(50, 54, 'pre'),
(45, 49, 'city'),
(32, 36, 'loc'),
(20, 23, 'loc'),
(5, 14, 'loc')]}),
('اول مرة قابلته جدة جاها سيول', {'entities': [(15, 18, 'city')]})]
Then I train the spaCy NER model:
import spacy
import random
################### Train Spacy NER.###########
def train_spacy():
    """Train a blank Arabic spaCy (v2 API) pipeline with a single NER component.

    Loads the training tuples via ``convert_dataturks_to_spacy``, registers
    every entity label found in them, runs one training iteration with one
    example per update, then prints the entities predicted for a fixed
    sample sentence.

    Raises:
        ValueError: Propagated from spaCy when an example contains
            overlapping entity spans (error E103).
    """
    TRAIN_DATA = convert_dataturks_to_spacy("/content/drive/My Drive/Colab Notebooks/Name Entity Recognition/NERTweets.json")
    nlp = spacy.blank('ar')  # create blank Language class
    # create the built-in pipeline components and add them to the pipeline
    # nlp.create_pipe works for built-ins that are registered with spaCy
    if 'ner' not in nlp.pipe_names:
        ner = nlp.create_pipe('ner')
        nlp.add_pipe(ner, last=True)
    else:
        # Keep `ner` bound even when the pipeline already has the component.
        ner = nlp.get_pipe('ner')

    # add labels
    for _, annotations in TRAIN_DATA:
        for ent in annotations.get('entities'):
            # Each entity is (start, end, label); register the label.
            ner.add_label(ent[2])

    # get names of other pipes to disable them during training
    other_pipes = [pipe for pipe in nlp.pipe_names if pipe != 'ner']
    with nlp.disable_pipes(*other_pipes):  # only train NER
        optimizer = nlp.begin_training()
        for itn in range(1):
            print("Statring iteration " + str(itn))
            # Shuffle so the updates do not always see examples in file order.
            random.shuffle(TRAIN_DATA)
            losses = {}
            for text, annotations in TRAIN_DATA:
                nlp.update(
                    [text],  # batch of texts
                    [annotations],  # batch of annotations
                    drop=0.2,  # dropout - make it harder to memorise data
                    sgd=optimizer,  # callable to update weights
                    losses=losses,  # accumulates the per-component loss
                )
            print(losses)

    # do prediction
    doc = nlp("Samsing mobiles below $100")
    print("Entities= " + str(["" + str(ent.text) + "_" + str(ent.label_) for ent in doc.ents]))
Error output:
Statring iteration 0
ValueError Traceback (most recent call last)
<ipython-input-8-6b61c2d740cf> in <module>()
----> 1 train_spacy()
2 frames
/usr/local/lib/python3.6/dist-packages/spacy/ in _format_docs_and_golds(self, docs, golds)
470 err = Errors.E151.format(unexp=unexpected, exp=expected_keys)
471 raise ValueError(err)
--> 472 gold = GoldParse(doc, **gold)
473 doc_objs.append(doc)
474 gold_objs.append(gold)
gold.pyx in
gold.pyx in
ValueError: [E103] Trying to set conflicting doc.ents: '(42, 47, 'loc')' and '(34, 47, 'loc')'. A token can only be part of one entity, so make sure the entities you're setting don't overlap.
My results are uploaded to Google Colab at the link below. Where is the problem?
spaCy doesn't allow overlapping entities, so you should remove the overlapping entities. Your code would then be:
def convert_dataturks_to_spacy(dataturks_JSON_FilePath):
    """Convert a Dataturks NER export into non-overlapping spaCy training data.

    The file is read as JSON Lines: every non-blank line is parsed as one
    JSON object with a ``content`` string and an ``annotation`` list (or
    null). Only the first entry of each annotation's ``points`` is used.

    spaCy rejects examples in which a token belongs to more than one entity
    (error E103), so when annotated spans overlap only the longest one is
    kept; among equally long spans the one listed first wins. When an
    annotation carries several labels, only its first label is used, because
    emitting the same span once per label would itself be an overlap.

    Args:
        dataturks_JSON_FilePath: Path to the Dataturks JSON export.

    Returns:
        A list of ``(text, {"entities": [(start, end, label), ...]})``
        tuples, one per input line, with ``end`` exclusive and the entities
        ordered by start offset.
    """
    # Local import: the surrounding snippet has no module-level `import json`.
    import json

    training_data = []
    # Explicit UTF-8 so non-ASCII (e.g. Arabic) text decodes the same way
    # regardless of the platform's default locale encoding.
    with open(dataturks_JSON_FilePath, 'r', encoding='utf-8') as f:
        for line in f:
            if not line.strip():
                continue
            data = json.loads(line)
            text = data['content']

            candidates = []
            # `annotation` may be null for untagged items.
            for annotation in data['annotation'] or []:
                point = annotation['points'][0]
                labels = annotation['label']
                if not isinstance(labels, list):
                    labels = [labels]
                if not labels:
                    continue
                # dataturks indices are both inclusive [start, end] but
                # spacy is not [start, end); convert once, up front, so the
                # overlap check below works on the final offsets.
                candidates.append((point['start'], point['end'] + 1, labels[0]))

            # Longest spans first; the sort is stable, so ties keep file order.
            candidates.sort(key=lambda span: span[1] - span[0], reverse=True)

            seen_chars = set()
            entities = []
            for start, end, label in candidates:
                span_chars = range(start, end)
                # Accept a span only if none of its characters is already
                # covered by a previously accepted (longer) span.
                if seen_chars.isdisjoint(span_chars):
                    seen_chars.update(span_chars)
                    entities.append((start, end, label))

            entities.sort(key=lambda ent: ent[0])
            training_data.append((text, {"entities": entities}))
    return training_data