I'm trying to solve a Name Entity Recognision(NER) Problem using SpaCy of the PDF files. I want to get the modal verbs(will, shall, should, must, etc..) from the pdf files.
I trained the data in spaCy. When predicting using the trained modal, the ent.sent.text
attribute of the modal usualy returns the text or can say the sentence from which the label extracted. But in my case it returns the label itself not the sentence. Anyone help me please.
The codes are giving below:
def load_training_data_from_csv(file_path):
nlp = spacy.load('en_core_web_md')
train_data = []
with open(file_path, 'r', encoding='cp1252') as f:
reader = csv.DictReader(f)
for row in reader:
sentence = row['text']
start, end = int(row['start']), int(row['end'])
label = row['label']
train_data.append((sentence, {"entities": [(start, end, label)]}))
# Check the alignment
from spacy.training import offsets_to_biluo_tags
doc = nlp.make_doc(sentence)
tags = offsets_to_biluo_tags(doc, [(start, end, label)])
if '-' in tags:
print(f"Warning: Misaligned entities in '{sentence}' with entities {[(start, end, label)]}")
return train_data
def train_spacy_ner(train_data, n_iter=10):
# Load the existing model
nlp = spacy.load('en_core_web_md')
# Add the NER pipeline if it doesn't exist
if "ner" not in nlp.pipe_names:
ner = nlp.create_pipe("ner")
nlp.add_pipe(ner, last=True)
else:
ner = nlp.get_pipe("ner")
# Add the new label "CURRENCY" to the NER model
ner.add_label("WILL")
ner.add_label("SHALL")
ner.add_label("MUST")
# Train the NER model
optimizer = nlp.begin_training()
for i in range(n_iter):
print("Epoch - ", i) if i % 2 == 0 or i == n_iter else None
random.shuffle(train_data)
losses = {}
for text, annotations in train_data:
doc = nlp.make_doc(text)
example = spacy.training.Example.from_dict(doc, annotations)
nlp.update([example], sgd=optimizer, losses=losses)
print("loss : ", losses) if i % 2 == 0 or i == n_iter else None
return nlp
# nlp = spacy.load("en_core_web_md")
file_path = "/content/trainData.csv"
TRAIN_DATA = load_training_data_from_csv(file_path)
# Train the model
nlp = train_spacy_ner(TRAIN_DATA)
nlp.to_disk('custom_NER')
import spacy
nlp = spacy.load('custom_NER')
text = "The language will be in english"
doc = nlp(text)
# print(doc.ents)
for ent in doc.ents:
print(ent.sent.text, ent.start_char, ent.end_char, ent.label_)
ent.sent.text
should return the sentence used above. But here the label itself is returing.
will 13 17 WILL
The language will be in english 13 17 WILL
Text | start | end | label |
---|---|---|---|
I will do the procedures | 2 | 6 | will |
You should send the letters | 4 | 10 | should |
The reason is that calling the below code. So remove it from the train function
#optimizer = nlp.begin_training()
which will also reinitialize all models. As a result, the parser (which performs the sentence splitting), will predict the sentence boundaries using a zeroed-out softmax layer and will start detecting a boundary after every token.
So, should remove the line that calls begin_training
. Then later when you update the pipe, you can remove the sgd
parameter and the pipe will create an optimizer internally:
nlp.update([example], losses=losses)