Search code examples
python · nlp · spacy · pos-tagger

Custom spaCy tagger to tag all words that are in a dictionary


I'm trying to use spaCy to extract specific information from a text. To do that, I need to configure a custom tokenizer to identify the relevant terms and a custom tagger to label all the words that appear in an external dictionary in JSON format.

The tokenizer worked after several attempts, but the tagger fails even when processing simple text. I want the label added to those words to be a custom POS tag, "UNM", and I would like to be able to assign it to token.pos_ like the other labels "NOUN", "VERB", etc.

import requests

# Keyword dictionary: download the JSON file and parse the response body.
# NOTE(review): `dictionary` is not referenced again in this snippet; the
# component below is configured with the hard-coded `keywords` tuple instead.
dictionary = requests.get(
    "https://github.com/dglopes/NBR15575/raw/main/unidades_medidas.json").json()

    
# Creating the custom tagger.
# NOTE(review): this registers `pos_tag` on Doc, but __call__ below assigns
# `token._.pos_tag`, i.e. it writes the attribute on individual tokens.
Doc.set_extension('pos_tag', default=None, force=True)

@Language.factory("keyword_pos_tagger")
class KeywordPosTagger:
   """Pipeline component that labels tokens whose text is in a keyword collection."""
   def __init__(self, name, nlp, keywords, pos_tag):
       """Store the keyword collection and the label to assign.

       `name` and `nlp` are accepted but not used by this class.
       """
       self.keywords = keywords
       self.pos_tag = pos_tag
       #Doc.set_extension('pos_tag', default=None, force=True)

   def __call__(self, doc):
       """Set `token._.pos_tag` on every token whose text is in `keywords`; return `doc`."""
       for token in doc:
           if token.text in self.keywords:
               token._.pos_tag = self.pos_tag
       return doc

nlp = spacy.load('pt_core_news_md')


keywords = ('m²', 'm2', '(W/K)', 'ºC')
pos_tag = 'UNM' # replace with your own POS label

# NOTE(review): this instance is built by hand and is never added to `nlp`;
# the positional arguments are also passed as (nlp, name) while __init__
# declares them as (name, nlp).
keyword_pos_tagger = KeywordPosTagger(nlp, 'keyword_pos_tagger', keywords, pos_tag)

# NOTE(review): the config carries an "nlp" entry in addition to the
# component's own settings ("keywords", "pos_tag") — confirm it is needed.
config = {"nlp": nlp, "keywords": keywords, "pos_tag": pos_tag}

# Add the component to the pipeline by its registered factory name.
nlp.add_pipe('keyword_pos_tagger', config = config)

<__main__.KeywordPosTagger at 0x78d568e4cee0>

And when I use the custom tagger:

# Run the pipeline on a sample sentence and print each token's text with its custom label.
doc = nlp('A temperatura tem 159ºC ou 20 ºC. Também precisa ter 20m de largura e 14 m² de área, caso contrário terá 1 Kelvin (W/K)')
for token in doc:
   print(token.text, token._.pos_tag)

it raises this error:

---------------------------------------------------------------------------
AttributeError                            Traceback (most recent call last)
<ipython-input-5-3c241e1c89fd> in <cell line: 1>()
----> 1 doc = nlp('A temperatura tem 159ºC ou 20 ºC. Também precisa ter 20m de largura e 14 m² de área, caso contrário terá 1 Kelvin (W/K)')
      2 for token in doc:
      3    print(token.text, token._.pos_tag)

4 frames
/usr/local/lib/python3.10/dist-packages/spacy/tokens/underscore.py in __setattr__(self, name, value)
     74     def __setattr__(self, name: str, value: Any):
     75         if name not in self._extensions:
---> 76             raise AttributeError(Errors.E047.format(name=name))
     77         default, method, getter, setter = self._extensions[name]
     78         if setter is not None:

AttributeError: [E047] Can't assign a value to unregistered extension attribute 'pos_tag'. Did you forget to call the `set_extension` method?

Solution

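  • You need to provide the config settings in the add_pipe method through a config dict, and that dict should contain only the component's own settings (keywords and pos_tag), not nlp. In your code, the keyword_pos_tagger variable is a stranded component that's not actually added to the nlp pipeline. It shares the same vocab and you could use it for unit testing, but otherwise you can't add it to a pipeline when it's created like this. The E047 error itself occurs because the extension is registered on Doc while the component assigns token._.pos_tag; register it with Token.set_extension instead, as in the expanded example below.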

    nlp.add_pipe("keyword_pos_tagger", config={"keywords": keywords, "pos_tag": pos_tag})
    

    Edited to expand answer:

    # tested with spacy==3.7.2
    import spacy
    from spacy.language import Language
    from spacy.tokens import Token
    
    # Creating the custom tagger.
    # The extension is registered on Token (not Doc) because the component
    # below writes `token._.pos_tag` on individual tokens.
    Token.set_extension("pos_tag", default=None, force=True)


    @Language.factory("keyword_pos_tagger")
    class KeywordPosTagger:
        """Pipeline component that labels tokens found in a keyword collection.

        Every token whose exact text is one of ``keywords`` gets ``pos_tag``
        stored in the custom attribute ``token._.pos_tag``; all other tokens
        keep the extension's default value (``None``).
        """

        def __init__(self, name, nlp, keywords, pos_tag):
            """Store the keyword collection and the label to assign.

            Args:
                name: Component name (accepted but not used by this class).
                nlp: Pipeline object (accepted but not used by this class).
                keywords: Iterable of exact token texts (strings) to label.
                pos_tag: Value written to ``token._.pos_tag`` on a match.
            """
            # A frozenset makes the per-token membership test O(1), which
            # matters when the keywords come from a large external dictionary.
            self.keywords = frozenset(keywords)
            self.pos_tag = pos_tag

        def __call__(self, doc):
            """Label matching tokens in ``doc`` in place and return ``doc``."""
            for token in doc:
                if token.text in self.keywords:
                    token._.pos_tag = self.pos_tag
            return doc
    
    
    # Settings for the component: the exact token texts to label and the label itself.
    keywords = ("m²", "m2", "(W/K)", "ºC")
    pos_tag = "UNM"  # replace with your own POS label
    config = {
        "keywords": keywords,
        "pos_tag": pos_tag,
    }

    # Load the pt_core_news_md pipeline and add the component by its factory
    # name, passing the settings through `config`.
    nlp = spacy.load("pt_core_news_md")
    nlp.add_pipe("keyword_pos_tagger", config=config)

    doc = nlp(
        "A temperatura tem 159ºC ou 20 ºC. Também precisa ter 20m de largura e 14 m² de área, caso contrário terá 1 Kelvin (W/K)"
    )
    # Sanity check: token 16 of this sentence is expected to carry the custom label.
    assert doc[16]._.pos_tag == "UNM"