Search code examples
python nlp nltk

Applying NLTK's WordNetLemmatizer to a whole sentence raises an error about an unknown pos


I want to apply NLTK's WordNetLemmatizer to a whole sentence. The problem is that I get an error:

KeyError: 'NNP'

It looks like I'm passing an unknown 'pos' value, but I do not know why. I want to get the base form of each word, but without 'pos' it does not work. Can you tell me what I am doing wrong?

import nltk

from nltk.tokenize import PunktSentenceTokenizer
from nltk.tokenize import word_tokenize
from nltk.tokenize import RegexpTokenizer
from nltk.stem import WordNetLemmatizer 

# Download the 'averaged_perceptron_tagger' resource (needed by nltk.pos_tag).
nltk.download('averaged_perceptron_tagger')

sentence = "I want to find the best way to lemmantize this sentence so that I can see better results of it"

# NOTE(review): `sentence` is passed as one raw string rather than a list of
# tokens; pos_tag presumably walks it character by character, so the sentence
# should be split/tokenized before tagging.
taged_words = nltk.pos_tag(sentence)
print(taged_words)


# Lemmas are collected here in order and joined into one string at the end.
lemmantised_sentence = []

lemmatizer = WordNetLemmatizer()
# Each `word` is indexed as a pair: word[0] is the token, word[1] its tag.
for word in taged_words:

     # NOTE(review): the tagger's tag (e.g. 'NNP') is passed straight through
     # as `pos`; this is presumably the source of the reported
     # KeyError: 'NNP', since the lemmatizer expects WordNet POS values.
     filtered_text_lemmantised =  lemmatizer.lemmatize(word[0], pos=word[1])
     print(filtered_text_lemmantised)

     lemmantised_sentence.append(filtered_text_lemmantised)

# Rebind the name from the list of lemmas to the space-joined sentence.
lemmantised_sentence = ' '.join(lemmantised_sentence)
print(lemmantised_sentence)

Solution

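  • The sentence must be split into tokens before it is passed to the pos_tag function. Also, the pos argument of lemmatize does not accept the Penn Treebank tags that pos_tag returns (such as 'NNP'); it only accepts WordNet part-of-speech constants such as 'n' (noun), 'v' (verb), 'a' (adjective) and 'r' (adverb). I have updated your code using the mapping from this answer: https://stackoverflow.com/a/15590384/7349991.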

    import nltk
    
    from nltk.tokenize import PunktSentenceTokenizer
    from nltk.tokenize import word_tokenize
    from nltk.tokenize import RegexpTokenizer
    from nltk.stem import WordNetLemmatizer
    from nltk.corpus import wordnet
    
    def main():
        """Tag a sample sentence, lemmatize each token and print the results.

        Prints the tagged tokens, then every lemma on its own line, and
        finally the lemmas joined back into a single space-separated sentence.
        """
        # Fetch the tagger model and the WordNet data used further down.
        for resource in ('averaged_perceptron_tagger', 'wordnet'):
            nltk.download(resource)

        sentence = "I want to find the best way to lemmantize this sentence so that I can see better results of it"

        # pos_tag is given a list of tokens, hence the whitespace split.
        tagged_tokens = nltk.pos_tag(sentence.split())
        print(tagged_tokens)

        lemmatizer = WordNetLemmatizer()
        lemmas = []
        for token, tag in tagged_tokens:
            # Tokens that come back with an empty tag are left out entirely.
            if tag != '':
                # Translate the tagger's tag into a WordNet POS constant first.
                lemma = lemmatizer.lemmatize(token, pos=get_wordnet_pos(tag))
                print(lemma)
                lemmas.append(lemma)

        print(' '.join(lemmas))
    
    def get_wordnet_pos(treebank_tag):
        """Map a tagger tag onto a WordNet part-of-speech constant.

        Tags starting with 'J', 'V' or 'N' map to ``wordnet.ADJ``,
        ``wordnet.VERB`` and ``wordnet.NOUN`` respectively; every other tag
        (including an empty one) falls back to ``wordnet.ADV``.
        """
        # Checked in order; the first matching prefix wins.
        prefix_to_pos = (
            ('J', wordnet.ADJ),
            ('V', wordnet.VERB),
            ('N', wordnet.NOUN),
        )
        for prefix, wordnet_pos in prefix_to_pos:
            if treebank_tag.startswith(prefix):
                return wordnet_pos
        # No prefix matched: treat the token as an adverb.
        return wordnet.ADV
    
    
    # Run the demo only when this file is executed as a script, not on import.
    if __name__ == '__main__':
        main()