Search code examples
Tags: python, dictionary, tree, nltk, chunking

Converting NLTK chunks to a list of dictionaries


I need to get a list of dictionaries from an example nltk tree below:

   (S
      I/PRP
      'll/MD
      have/VB
      (amount 1/CD)
      (plate pizza/NN)
      and/CC
      (amount 4/CD)
      (plate sandwiches/NNS))

The desired output is as follows

  [{amount: 1, plate: pizza}, {amount: 4, plate: sandwiches}]

I have tried the code below, but I only get a list containing one dictionary: [{amount: 4, plate: sandwiches}]. It looks like the list does not append new entries; it only updates the same dictionary.

import nltk
from nltk.chunk import *
from nltk.chunk.util import *
from nltk.chunk.regexp import *
from nltk import Tree

# NOTE(review): the training corpus is empty here — presumably the real data
# was left out of the post; confirm before running.
training = []
hmm_tagger = HiddenMarkovModelTagger.train(training)
sentence = "I'll have 1 pizza and 4 sandwiches"
# Chunk grammar: `plate` wraps a single NN/NNS token, `amount` wraps a single
# CD/DT token.
gram = r"""
plate:      {<NN|NNS>}                    
amount:    {<CD|DT>}
        """
cp = nltk.RegexpParser(gram)
# NOTE(review): `sentence` is a str, so this loop visits it one character at a
# time, and `tokens` / `taggex` are overwritten on every pass — only the values
# from the final iteration reach cp.parse() below.
for sent in sentence:
    tokens = nltk.word_tokenize(sent)
    taggex = hmm_tagger.tag(tokens)
treee = cp.parse(taggex)
# Round-trip the chunk tree through CoNLL-style IOB tags and back into a tree.
iob_ts = tree2conlltags(treee)
tree = conlltags2tree(iob_ts)
def conversion(tree):
    """Attempt to turn the chunk tree into a list of dictionaries.

    NOTE(review): as written this does not build one dictionary per order;
    see the inline notes for where the result gets overwritten.
    """
    dlist = []
    for leaf in tree:
        # Only plain tuple children pass this test; chunk subtrees do not.
        if type(leaf) == tuple:
            # The inner loop walks the whole tree again and rebinds `leaf`.
            # NOTE(review): `.label()` is called on every child here, including
            # tuple children — presumably only subtrees provide it; confirm.
            for leaf in tree:
                key = leaf.label()
                value = leaf[0][0]
                # `dlist` is rebound (not appended to) on every pass, so only
                # the last assignment survives. zip() pairs `key` and `value`
                # element by element — character by character if both are
                # strings — and the comprehension repeats that same dict once
                # per child of `tree`.
                dlist =[dict(zip(key, value)) for leaf in tree]              
    return dlist

Solution

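  • The main issue is that nothing is appended after each loop iteration, either inside or outside the conversion function. Inside, dlist is reassigned on every pass instead of being appended to; outside, the tagging loop overwrites taggex each time, so only the last result is parsed. The version below builds one dictionary per order and appends it to the list.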

    from nltk.chunk.regexp import RegexpParser
    from nltk import Tree, pos_tag
    from nltk.tokenize import word_tokenize, sent_tokenize
    
    # Chunk grammar: `plate` wraps a single NN/NNS token, `amount` wraps a
    # single CD/DT token.
    gram = r"""
    plate:      {<NN|NNS>}                    
    amount:    {<CD|DT>}
    """
    # Build the regular-expression chunker from the grammar.
    cp = RegexpParser(gram)
    
    # Example order used for the demonstration.
    text = "I'll have 1 pizza and 4 sandwiches"
    
    def conversion(tree):
        """Group the chunks of a parsed sentence into a list of dictionaries.

        The direct children of *tree* are scanned left to right. Each chunk
        subtree contributes one ``label -> words`` entry to the dictionary
        currently being built. The dictionary is closed and appended to the
        result when either:

        * a non-chunk token (e.g. ``and``) is reached, or
        * a chunk arrives whose label is already present in the current
          dictionary, so an earlier value is never silently overwritten
          (e.g. ``1 pizza 4 sandwiches`` yields two dictionaries).

        Args:
            tree: An ``nltk.Tree`` whose children are either tagged tokens
                ``(word, tag)`` or chunk subtrees of such tokens.

        Returns:
            A list of dicts mapping chunk label to the chunk's words joined
            by single spaces. Empty dictionaries are never emitted.
        """
        dlist = []
        d = {}
        for item in tree:
            if isinstance(item, Tree):
                label = item.label()
                if label in d:
                    # Same label seen again: this chunk starts a new group.
                    dlist.append(d)
                    d = {}
                # Leaves are (word, tag) pairs; keep only the words.
                d[label] = ' '.join(leaf[0] for leaf in item.leaves())
            elif d:
                # A plain token ends the group collected so far.
                dlist.append(d)
                d = {}
        if d:
            # Flush the group that runs up to the end of the sentence.
            dlist.append(d)
        return dlist
    
    # Split the text into sentences, then tokenize, POS-tag and chunk each
    # one; the result holds one chunk tree per sentence.
    parsed_text = [cp.parse(pos_tag(word_tokenize(sent)))
                   for sent in sent_tokenize(text)]
    
    # Print the list of dictionaries extracted from each sentence's tree.
    for tree in parsed_text:
        print(conversion(tree))
    #[{'amount': '1', 'plate': 'pizza'}, {'amount': '4', 'plate': 'sandwiches'}]