Search code examples
tags nltk parentheses

NLTK parses parentheses incorrectly


I'm tagging text to search for nouns and adjectives:

# Sample sentence; note the parenthesised acronym "(OHSU)".
text = u"""Developed at the Vaccine and Gene Therapy Institute at the Oregon Health and Science University (OHSU), the vaccine proved successful in about fifty percent of the subjects tested and could lead to a human vaccine preventing the onset of HIV/AIDS and even cure patients currently on anti-retroviral drugs."""
# Split the sentence into tokens, then part-of-speech tag each token
# (assumes `import nltk` has already been run).
nltk.pos_tag(nltk.word_tokenize(text))

This results in:

[('Developed', 'NNP'), ('at', 'IN'), ('the', 'DT'), ('Vaccine', 'NNP'), ('and', 'CC'), ('Gene', 'NNP'), ('Therapy', 'NNP'), ('Institute', 'NNP'), ('at', 'IN'), ('the', 'DT'), ('Oregon', 'NNP'), ('Health', 'NNP'), ('and', 'CC'), ('Science', 'NNP'), ('University', 'NNP'), ('(', 'NNP'), ('OHSU', 'NNP'), (')', 'NNP'), (',', ','), ('the', 'DT'), ('vaccine', 'NN'), ('proved', 'VBD'), ('successful', 'JJ'), ('in', 'IN'), ('about', 'IN'), ('fifty', 'JJ'), ('percent', 'NN'), ('of', 'IN'), ('the', 'DT'), ('subjects', 'NNS'), ('tested', 'VBD'), ('and', 'CC'), ('could', 'MD'), ('lead', 'VB'), ('to', 'TO'), ('a', 'DT'), ('human', 'NN'), ('vaccine', 'NN'), ('preventing', 'VBG'), ('the', 'DT'), ('onset', 'NN'), ('of', 'IN'), ('HIV/AIDS', 'NNS'), ('and', 'CC'), ('even', 'RB'), ('cure', 'NN'), ('patients', 'NNS'), ('currently', 'RB'), ('on', 'IN'), ('anti-retroviral', 'JJ'), ('drugs', 'NNS'), ('.', '.')]

Is there a built-in way to correctly detect parentheses when tagging sentences?


Solution

  • If you know which tag you want returned for the parentheses, you can use a RegexpTagger to match them and fall back to your preferred tagger for everything else.

    import nltk
    from nltk.data import load

    # Load the pickled tagger directly so it can serve as the backoff below.
    # NOTE(review): this resource path is version-dependent; confirm which
    # tagger nltk.pos_tag uses in your installed NLTK release.
    _POS_TAGGER = 'taggers/maxent_treebank_pos_tagger/english.pickle'
    tagger = load(_POS_TAGGER)  # same tagger as using nltk.pos_tag

    # Tag a token that is exactly "(" or ")" as '--'; every other token falls
    # through to the backoff tagger. The pattern is anchored so that only
    # whole-token parentheses match, not tokens that merely start with one.
    regexp_tagger = nltk.tag.RegexpTagger([(r'^[()]$', '--')], backoff=tagger)

    # `text` is the sentence defined in the question.
    regexp_tagger.tag(nltk.word_tokenize(text))
    

    Result:

    [(u'Developed', 'NNP'), (u'at', 'IN'), (u'the', 'DT'), (u'Vaccine', 'NNP'), (u'and', 'CC'), (u'Gene', 'NNP'), (u'Therapy', 'NNP'), (u'Institute', 'NNP'), (u'at', 'IN'), (u'the', 'DT'), (u'Oregon', 'NNP'), (u'Health', 'NNP'), (u'and', 'CC'), (u'Science', 'NNP'), (u'University', 'NNP'), (u'(', '--'), (u'OHSU', 'NNP'), (u')', '--'), (u',', ','), (u'the', 'DT'), (u'vaccine', 'NN'), (u'proved', 'VBD'), (u'successful', 'JJ'), (u'in', 'IN'), (u'about', 'IN'), (u'fifty', 'JJ'), (u'percent', 'NN'), (u'of', 'IN'), (u'the', 'DT'), (u'subjects', 'NNS'), (u'tested', 'VBD'), (u'and', 'CC'), (u'could', 'MD'), (u'lead', 'VB'), (u'to', 'TO'), (u'a', 'DT'), (u'human', 'NN'), (u'vaccine', 'NN'), (u'preventing', 'VBG'), (u'the', 'DT'), (u'onset', 'NN'), (u'of', 'IN'), (u'HIV/AIDS', 'NNS'), (u'and', 'CC'), (u'even', 'RB'), (u'cure', 'NN'), (u'patients', 'NNS'), (u'currently', 'RB'), (u'on', 'IN'), (u'anti-retroviral', 'JJ'), (u'drugs', 'NNS'), (u'.', '.')]