Search code examples
pythonjsonallennlp

Parsing nested dictionary (allen nlp hierplane_tree)


I am trying to parse the JSON object that gets returned by the allennlp predictor. I was able to find a helpful function to find all of the children values, but what I really want to do with the dependencies, is given an entity "man" can I get the associated attributes from the JSON object.

Example sentence: "When I was walking to the park yesterday, I saw a man wearing a blue shirt."

The dependency tree has wearing, blue, shirt, etc. that is associated with the entity. How can I get the associated JSON block back for man in that structure? I am not sure how I can modify my helper function or develop another one to get that block out of the JSON output. Any help or suggestions would be greatly appreciated.

AllenNLP Code:

text = "When I was walking to the park yesterday, I saw a man wearing a blue shirt."
from allennlp.predictors.predictor import Predictor
import allennlp_models.structured_prediction
predictor = Predictor.from_path("https://storage.googleapis.com/allennlp-public-models/biaffine-dependency-parser-ptb-2020.04.06.tar.gz")
tree = predictor.predict(sentence=text)

tree = tree['hierplane_tree']
tree

Helper Function that will let me get the children values:

"""Extract nested values from a JSON tree."""


def json_extract(obj, key):
    """Recursively fetch values from nested JSON."""
    arr = []

    def extract(obj, arr, key):
        """Recursively search for values of key in JSON tree."""
        if isinstance(obj, dict):
            for k, v in obj.items():
                if isinstance(v, (dict, list)):
                    extract(v, arr, key)
                elif k == key:
                    arr.append(v)
        elif isinstance(obj, list):
            for item in obj:
                extract(item, arr, key)
        return arr

    values = extract(obj, arr, key)
    return values

The function can give me the values:

# Find every instance of `name` in a Python dictionary.
children = json_extract(tree, 'word')
print(children)

['walking', 'When', 'I', 'was', 'to', 'park', 'the', 'yesterday', ',', 'saw', 'I', 'man', 'a', 'wearing', 'shirt', 'a', 'blue', '.']

JSON Extract (What I want to try to get when I provide "man":

{'word': 'man',
        'nodeType': 'dep',
        'attributes': ['NOUN'],
        'link': 'dep',
        'spans': [{'start': 51, 'end': 55}],
        'children': [{'word': 'a',
          'nodeType': 'det',
          'attributes': ['DET'],
          'link': 'det',
          'spans': [{'start': 49, 'end': 51}]},
         {'word': 'wearing',
          'nodeType': 'dep',
          'attributes': ['VERB'],
          'link': 'dep',
          'spans': [{'start': 55, 'end': 63}],
          'children': [{'word': 'shirt',
            'nodeType': 'dep',
            'attributes': ['NOUN'],
            'link': 'dep',
            'spans': [{'start': 70, 'end': 76}],
            'children': [{'word': 'a',
              'nodeType': 'dep',
              'attributes': ['DET'],
              'link': 'dep',
              'spans': [{'start': 63, 'end': 65}]},
             {'word': 'blue',
              'nodeType': 'dep',
              'attributes': ['ADJ'],
              'link': 'dep',
              'spans': [{'start': 65, 'end': 70}]}]}]}]}]}]}

JSON Output:

{'text': 'When I was walking to the park yesterday , I saw a man wearing a blue shirt .',
 'root': {'word': 'walking',
  'nodeType': 'root',
  'attributes': ['VERB'],
  'link': 'root',
  'spans': [{'start': 11, 'end': 19}],
  'children': [{'word': 'When',
    'nodeType': 'dep',
    'attributes': ['ADV'],
    'link': 'dep',
    'spans': [{'start': 0, 'end': 5}]},
   {'word': 'I',
    'nodeType': 'nsubj',
    'attributes': ['PRON'],
    'link': 'nsubj',
    'spans': [{'start': 5, 'end': 7}]},
   {'word': 'was',
    'nodeType': 'aux',
    'attributes': ['AUX'],
    'link': 'aux',
    'spans': [{'start': 7, 'end': 11}]},
   {'word': 'to',
    'nodeType': 'prep',
    'attributes': ['ADP'],
    'link': 'prep',
    'spans': [{'start': 19, 'end': 22}],
    'children': [{'word': 'park',
      'nodeType': 'pobj',
      'attributes': ['NOUN'],
      'link': 'pobj',
      'spans': [{'start': 26, 'end': 31}],
      'children': [{'word': 'the',
        'nodeType': 'det',
        'attributes': ['DET'],
        'link': 'det',
        'spans': [{'start': 22, 'end': 26}]}]}]},
   {'word': 'yesterday',
    'nodeType': 'tmod',
    'attributes': ['NOUN'],
    'link': 'tmod',
    'spans': [{'start': 31, 'end': 41}]},
   {'word': ',',
    'nodeType': 'dep',
    'attributes': ['PUNCT'],
    'link': 'dep',
    'spans': [{'start': 41, 'end': 43}],
    'children': [{'word': 'saw',
      'nodeType': 'dep',
      'attributes': ['VERB'],
      'link': 'dep',
      'spans': [{'start': 45, 'end': 49}],
      'children': [{'word': 'I',
        'nodeType': 'nsubj',
        'attributes': ['PRON'],
        'link': 'nsubj',
        'spans': [{'start': 43, 'end': 45}]},
       {'word': 'man',
        'nodeType': 'dep',
        'attributes': ['NOUN'],
        'link': 'dep',
        'spans': [{'start': 51, 'end': 55}],
        'children': [{'word': 'a',
          'nodeType': 'det',
          'attributes': ['DET'],
          'link': 'det',
          'spans': [{'start': 49, 'end': 51}]},
         {'word': 'wearing',
          'nodeType': 'dep',
          'attributes': ['VERB'],
          'link': 'dep',
          'spans': [{'start': 55, 'end': 63}],
          'children': [{'word': 'shirt',
            'nodeType': 'dep',
            'attributes': ['NOUN'],
            'link': 'dep',
            'spans': [{'start': 70, 'end': 76}],
            'children': [{'word': 'a',
              'nodeType': 'dep',
              'attributes': ['DET'],
              'link': 'dep',
              'spans': [{'start': 63, 'end': 65}]},
             {'word': 'blue',
              'nodeType': 'dep',
              'attributes': ['ADJ'],
              'link': 'dep',
              'spans': [{'start': 65, 'end': 70}]}]}]}]}]}]},
   {'word': '.',
    'nodeType': 'punct',
    'attributes': ['PUNCT'],
    'link': 'punct',
    'spans': [{'start': 76, 'end': 78}]}]},
 'nodeTypeToStyle': {'root': ['color5', 'strong'],
  'dep': ['color5', 'strong'],
  'nsubj': ['color1'],
  'nsubjpass': ['color1'],
  'csubj': ['color1'],
  'csubjpass': ['color1'],
  'pobj': ['color2'],
  'dobj': ['color2'],
  'iobj': ['color2'],
  'mark': ['color2'],
  'pcomp': ['color2'],
  'xcomp': ['color2'],
  'ccomp': ['color2'],
  'acomp': ['color2'],
  'aux': ['color3'],
  'cop': ['color3'],
  'det': ['color3'],
  'conj': ['color3'],
  'cc': ['color3'],
  'prep': ['color3'],
  'number': ['color3'],
  'possesive': ['color3'],
  'poss': ['color3'],
  'discourse': ['color3'],
  'expletive': ['color3'],
  'prt': ['color3'],
  'advcl': ['color3'],
  'mod': ['color4'],
  'amod': ['color4'],
  'tmod': ['color4'],
  'quantmod': ['color4'],
  'npadvmod': ['color4'],
  'infmod': ['color4'],
  'advmod': ['color4'],
  'appos': ['color4'],
  'nn': ['color4'],
  'neg': ['color0'],
  'punct': ['color0']},
 'linkToPosition': {'nsubj': 'left',
  'nsubjpass': 'left',
  'csubj': 'left',
  'csubjpass': 'left',
  'pobj': 'right',
  'dobj': 'right',
  'iobj': 'right',
  'pcomp': 'right',
  'xcomp': 'right',
  'ccomp': 'right',
  'acomp': 'right'}}

Solution

  • This will no doubt need to be optimized and cleaned up, but it does enable you to parse the dependency tree from AllenNLP by items of interest (in this case man). Hopefully, this helps someone else out.

    From the text, by providing the key/value (word as key and man as value). You get:

    Helper Functions:

    def get_entity_attributes(obj, key, value):
        """Recursively fetch values from nested JSON."""
        arr = []
    
        def extract(obj, arr, key):
            """Recursively search for values of key in JSON tree."""
            if isinstance(obj, dict):
                for k, v in obj.items():
                    if isinstance(v, (dict, list)):
                        extract(v, arr, key)
            elif isinstance(obj, list):
                for item in obj:
                    if(isinstance(item,dict)):
                        ky,vl = key, value
                        if ky in item and vl == item[ky]:
    #                         print(type(item), item)
                            arr.append(item)
                    extract(item, arr, key)
            return arr
    
        values = extract(obj, arr, key)
        return values
    
    def parse_attributes(obj, key):
        """Recursively fetch values from nested JSON."""
        arr = []
    
        def extract(obj, arr, key):
            """Recursively search for values of key in JSON tree."""
            if isinstance(obj, dict):
                for k, v in obj.items():
                    if isinstance(v, (dict, list)):
                        extract(v, arr, key)
                    elif k == key:
                        arr.append(v)
            elif isinstance(obj, list):
                for item in obj:
                    extract(item, arr, key)
            return arr
    
        values = extract(obj, arr, key)
        return values
    
    # Create list of word tokens after removing stopwords
    def get_clean_list(entities):
        filtered_sentence = []
    
        for word in entities:
            lexeme = nlp.vocab[word]
            if not lexeme.is_stop and not lexeme.is_punct:
                filtered_sentence.append(word) 
        return filtered_sentence
    

    View Output:

    text = "When I was walking to the park yesterday, I saw a man wearing a blue shirt."
    tree = predictor.predict(sentence=text)
    
    key = "word"
    entity = "man"
    entities = get_entity_attributes(tree, key, entity)
    
    for ent in entities:
        if ent['nodeType'] == 'dep':
            attributes = parse_attributes(ent, key)
            clean_attributes = get_clean_list(attributes)
            clean_attributes.remove(entity)
            print(f'entity: {entity} Attributes: {clean_attributes}')
        else:
            attributes = parse_attributes(ent, key)
            clean_attributes = get_clean_list(attributes)
            clean_attributes.remove(entity)
            print(f'entity: {entity} Action Attributes: {clean_attributes}')
    

    Gives you:

    entity: man Attributes: ['wearing', 'shirt', 'blue']