Search code examples
Tags: python, python-3.x, nltk, tokenize, wordnet

How do I find the lemma and frequency count of each word in a list of tokenized sentences?


I want to find the lemma of each word using the WordNet lemmatizer, and I also need to compute the frequency of each word.

I am getting the following error.

The trace is as follows:

[Image: screenshot of the error traceback]

TypeError: unhashable type: 'list'

Note: The corpus is available on the nltk package itself.

What I have tried so far is as follows:

import nltk, re
import string
from collections import Counter
from string import punctuation
from nltk.tokenize import TweetTokenizer, sent_tokenize, word_tokenize
from nltk.corpus import gutenberg, stopwords
from nltk.stem import WordNetLemmatizer

def remove_punctuation(from_text):
    """Return a new list with punctuation removed from each item of *from_text*.

    Every character listed in ``string.punctuation`` is deleted. Items that
    consist only of punctuation become empty strings and are kept in place.
    """
    # The third argument of maketrans lists the characters to delete.
    delete_punct = str.maketrans('', '', string.punctuation)
    cleaned = []
    for word in from_text:
        cleaned.append(word.translate(delete_punct))
    return cleaned

def preprocessing():
    """Tokenize, clean and (attempt to) lemmatize the Gutenberg Hamlet text.

    Steps: load the raw text, split it into sentences, lowercase the
    whitespace-separated words of each sentence, strip punctuation, drop
    English stop words and non-alphanumeric tokens, then lemmatize.

    Results are published through the module-level globals
    ``stripped_tokens`` and ``lem``; nothing is returned.

    NOTE: as posted, the final loop hands a whole sentence (a list) to
    ``lemmatize``; this is the line behind the reported
    ``TypeError: unhashable type: 'list'``.
    """
    raw_data = (gutenberg.raw('shakespeare-hamlet.txt'))
    tokens_sentences = sent_tokenize(raw_data)
    # One inner list of lowercased, whitespace-split words per sentence.
    tokens = [[word.lower() for word in line.split()] for line in tokens_sentences]
    print(len(tokens))  # number of sentences found
    global stripped_tokens
    stripped_tokens = [remove_punctuation(i) for i in tokens]
    sw = (stopwords.words('english'))
    # Keep only alphanumeric tokens that are not stop words; still a list of lists.
    filter_set = [[token for token in sentence if (token.lower() not in sw and token.isalnum())] for sentence in stripped_tokens]
    lemma= WordNetLemmatizer()
    global lem
    lem = []
    for w in filter_set:
        # `w` is a list of tokens here, not a single string.
        lem.append(lemma.lemmatize(w))

# Run the pipeline; the function stores its output in module-level globals instead of returning it.
preprocessing()

Please help me in resolving the issue.


Solution

  • The problem is that lemma.lemmatize expects a string and you are passing a list. The elements of filter_set are lists. You need to change the line:

    lem.append(lemma.lemmatize(w))
    

    to something like this:

    lem.append([wi for wi in map(lemma.lemmatize, w)])
    

    The above code applies lemma.lemmatize to each token (wi) in w. Full code:

    import nltk, re
    import string
    from collections import Counter
    from string import punctuation
    from nltk.tokenize import TweetTokenizer, sent_tokenize, word_tokenize
    from nltk.corpus import gutenberg, stopwords
    from nltk.stem import WordNetLemmatizer
    
    
    def remove_punctuation(from_text):
        """Strip all ``string.punctuation`` characters from each item of *from_text*.

        Returns a new list of the same length; an item made up entirely of
        punctuation is returned as an empty string.
        """
        # Deletion-only translation table: no character is mapped, all punctuation is dropped.
        punct_remover = str.maketrans('', '', string.punctuation)
        without_punct = []
        for token in from_text:
            without_punct.append(token.translate(punct_remover))
        return without_punct
    
    
    def preprocessing():
        """Return the lemmatized content words of Hamlet, one list per sentence.

        The Gutenberg text is split into sentences; each sentence is split on
        whitespace and lowercased, punctuation is stripped, English stop words
        and non-alphanumeric tokens are dropped, and every remaining token is
        lemmatized with the WordNet lemmatizer.

        Side effect: prints the number of sentences found.

        Returns:
            A list of lists of strings. Sentences whose tokens are all
            filtered out are kept as empty lists.
        """
        raw_data = gutenberg.raw('shakespeare-hamlet.txt')
        tokens = [[word.lower() for word in sentence.split()]
                  for sentence in sent_tokenize(raw_data)]
        print(len(tokens))
        stripped_tokens = [remove_punctuation(sentence) for sentence in tokens]
        # Use a set for O(1) membership tests; the stop-word list would
        # otherwise be scanned linearly once per token.
        stop_words = set(stopwords.words('english'))
        filter_set = [
            [token for token in sentence
             if token.lower() not in stop_words and token.isalnum()]
            for sentence in stripped_tokens
        ]
        lemmatizer = WordNetLemmatizer()
        # lemmatize() takes a single word, so apply it token by token.
        return [[lemmatizer.lemmatize(token) for token in sentence]
                for sentence in filter_set]
    
    result = preprocessing()
    # Show only the first ten sentences, one list of lemmas per line.
    first_ten = result[:10]
    for lemmas in first_ten:
        print(lemmas)
    

    Output

    ['tragedie', 'hamlet', 'william', 'shakespeare', '1599', 'actus', 'primus']
    ['scoena', 'prima']
    ['enter', 'barnardo', 'francisco', 'two', 'centinels']
    ['barnardo']
    ['who']
    ['fran']
    ['nay', 'answer', 'stand', 'vnfold', 'selfe', 'bar']
    ['long', 'liue', 'king', 'fran']
    ['barnardo']
    ['bar']
    

    UPDATE

    To get the frequencies you can use Counter:

    result = preprocessing()
    # Tally every lemma across all sentences.
    frequencies = Counter()
    for sentence in result:
        frequencies.update(sentence)
    # Report the ten most frequent lemmas with their counts.
    for word, frequency in frequencies.most_common(10):
        print(word, frequency)
    

    Output

    ham 337
    lord 217
    king 180
    haue 175
    come 127
    let 107
    shall 107
    hamlet 107
    thou 105
    good 98