I'm trying to use spaCy to extract specific information from a text. To do that, I need to configure a custom tokenizer to identify the relevant terms and a custom tagger to label all the words that appear in an external dictionary in JSON format.
The tokenizer worked in several attempts, but the tagger has been having problems when processing even simple text. I want the label added to those words to be a custom POS tag "UNM", and I would like to be able to assign it to token.pos_ like all the other labels ("NOUN", "VERB", etc.).
import requests
# Keywords dictionary, downloaded as JSON (the file name suggests units of
# measurement — confirm against the actual file contents).
_response = requests.get(
    "https://github.com/dglopes/NBR15575/raw/main/unidades_medidas.json",
    timeout=30,  # without a timeout, requests can block forever on a stalled connection
)
# Fail loudly on an HTTP error instead of trying to parse an error page as JSON.
_response.raise_for_status()
dictionary = _response.json()
# Creating the custom tagger: register the custom 'pos_tag' extension attribute.
# NOTE(review): this registers the attribute on Doc, but the component assigns
# token._.pos_tag (a Token attribute); that matches the E047 "unregistered
# extension attribute" error reported further down.
Doc.set_extension('pos_tag', default=None, force=True)
@Language.factory("keyword_pos_tagger")
class KeywordPosTagger:
    """Pipeline component that writes a fixed label to ``token._.pos_tag``
    for every token whose text is one of the configured keywords."""

    def __init__(self, name, nlp, keywords, pos_tag):
        # ``name`` and ``nlp`` are accepted but not stored.
        self.pos_tag = pos_tag
        self.keywords = keywords

    def __call__(self, doc):
        """Label the matching tokens of ``doc`` and hand the same ``doc`` back."""
        label = self.pos_tag
        for tok in doc:
            if tok.text not in self.keywords:
                continue
            tok._.pos_tag = label
        return doc
# Load the Portuguese pipeline and define the keywords and the label to assign.
nlp = spacy.load('pt_core_news_md')
keywords = ('m²', 'm2', '(W/K)', 'ºC')
pos_tag = 'UNM' # replace with your POS label
# NOTE(review): this instance is built by hand and is not what add_pipe below
# inserts (add_pipe is given the factory name, not this object). Its first two
# positional arguments are also in the opposite order to
# __init__(self, name, nlp, ...).
keyword_pos_tagger = KeywordPosTagger(nlp, 'keyword_pos_tagger', keywords, pos_tag)
# NOTE(review): the nlp object is included in the config here; presumably it
# does not belong there — confirm against spaCy's add_pipe documentation.
config = {"nlp": nlp, "keywords": keywords, "pos_tag": pos_tag}
nlp.add_pipe('keyword_pos_tagger', config = config)
<__main__.KeywordPosTagger at 0x78d568e4cee0>
And when I use the custom tagger:
# Run the pipeline on a sample sentence, then show each token's text next to
# the value of its custom 'pos_tag' extension.
doc = nlp(
    'A temperatura tem 159ºC ou 20 ºC. Também precisa ter 20m de largura e 14 m² de área, caso contrário terá 1 Kelvin (W/K)'
)
for token in doc:
    print(token.text, token._.pos_tag)
it returns this error
---------------------------------------------------------------------------
AttributeError Traceback (most recent call last)
<ipython-input-5-3c241e1c89fd> in <cell line: 1>()
----> 1 doc = nlp('A temperatura tem 159ºC ou 20 ºC. Também precisa ter 20m de largura e 14 m² de área, caso contrário terá 1 Kelvin (W/K)')
2 for token in doc:
3 print(token.text, token._.pos_tag)
4 frames
/usr/local/lib/python3.10/dist-packages/spacy/tokens/underscore.py in __setattr__(self, name, value)
74 def __setattr__(self, name: str, value: Any):
75 if name not in self._extensions:
---> 76 raise AttributeError(Errors.E047.format(name=name))
77 default, method, getter, setter = self._extensions[name]
78 if setter is not None:
AttributeError: [E047] Can't assign a value to unregistered extension attribute 'pos_tag'. Did you forget to call the `set_extension` method?
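You need to provide the config settings in the add_pipe
method through a config dict, and that dict should only contain the component's own settings (keywords and pos_tag), not the nlp object. In your code, the keyword_pos_tagger
variable is a stranded component that's not actually added to the nlp
pipeline. It shares the same vocab and you could use it for unit testing, but otherwise you can't add it to a pipeline when it's created like this.
In addition, the component assigns token._.pos_tag, so the extension has to be registered on Token (Token.set_extension) rather than on Doc; registering it only on Doc is what triggers the E047 error.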
# Add the component by its registered factory name, passing only its own
# settings in the config (no nlp object, no hand-built instance).
nlp.add_pipe("keyword_pos_tagger", config={"keywords": keywords, "pos_tag": pos_tag})
Edited to expand answer:
# tested with spacy==3.7.2
import spacy
from spacy.language import Language
from spacy.tokens import Token
# Creating the custom tagger: register the 'pos_tag' extension on Token,
# because the component assigns token._.pos_tag. default=None is the value
# read back for tokens the component never labels; force=True allows
# re-running this line when the attribute is already registered.
Token.set_extension("pos_tag", default=None, force=True)
@Language.factory("keyword_pos_tagger")
class KeywordPosTagger:
    """Pipeline component that labels keyword tokens with a custom tag.

    Every token whose text exactly equals one of ``keywords`` gets
    ``token._.pos_tag`` set to ``pos_tag``; all other tokens are left
    untouched. The ``pos_tag`` extension must already be registered on
    ``Token`` before the component runs.

    Args:
        name: Name of the component instance (accepted, not used).
        nlp: Pipeline the component is created for (accepted, not used).
        keywords: Iterable of token texts (strings) to label.
        pos_tag: Label written to ``token._.pos_tag`` on a match.
    """

    def __init__(self, name, nlp, keywords, pos_tag):
        self.keywords = keywords
        self.pos_tag = pos_tag
        # Build the lookup set once: membership is then O(1) per token
        # instead of a scan over the whole keyword sequence, which matters
        # when the keywords come from a large external dictionary.
        self._keyword_set = frozenset(keywords)

    def __call__(self, doc):
        """Label the matching tokens of ``doc`` in place and return ``doc``."""
        for token in doc:
            if token.text in self._keyword_set:
                token._.pos_tag = self.pos_tag
        return doc
# Load the Portuguese pipeline and define the keywords and the label to assign.
nlp = spacy.load("pt_core_news_md")
keywords = ("m²", "m2", "(W/K)", "ºC")
pos_tag = "UNM" # replace with your POS label
# Only the component's own settings go in the config; the component is added
# by its registered factory name.
config = {"keywords": keywords, "pos_tag": pos_tag}
nlp.add_pipe("keyword_pos_tagger", config=config)
doc = nlp(
"A temperatura tem 159ºC ou 20 ºC. Também precisa ter 20m de largura e 14 m² de área, caso contrário terá 1 Kelvin (W/K)"
)
# Sanity check on one token that should have been labelled (index 16 is
# presumably "m²" — depends on how the tokenizer splits the sentence).
assert doc[16]._.pos_tag == "UNM"