Search code examples
Tags: python, dictionary, nested

How to iterate through nested dictionaries and extract substructures and filter out certain keys?


I have nested dictionaries, which are representations of XML that I have parsed using xmltodict.

Now I want to make it possible to extract certain sub-structures and to remove all keys that contain '@'.

{'rpc': {'@xmlns': 'urn:1.0',
  '@message-id': '4',
  'edit-config': {'target': {'running': None},
   'config': {'test': {'@xmlns': 'urn:2.0',
     'common': {'abc': 'forward',
      'bbc': {'geo-model': 'texas',
       'remote': [{'id': '288',
         'transport': 'tcp',
         'port': '8',
         'ipv4-address': '0.0.0.0'},
        {'distributed-system-id': '318',
         'transport': 'tcp',
         'port': '8',
         'ipv4-address': '0.0.0.0'},
        {'distributed-system-id': '348',
         'transport': 'tcp',
         'port': '8',
         'ipv4-address': '0.0.0.0'}]}}}}}}}

I have defined a function where the user should be able to provide an input to choose which tag they want to extract the substructure from, while also removing any key with '@' in it.

def process_xml_dict(d, clean_d, start_after_tag=None, reached_tag=False):
    """Copy the non-dict values of a nested dict into ``clean_d``, flattened.

    ``d`` is walked recursively. Every value that is not itself a dict is
    stored in ``clean_d`` under its own key, unless that key contains ``'@'``.
    The nesting of ``d`` is not reproduced: all copied entries land at the
    top level of ``clean_d``, so equal keys found at different depths
    overwrite each other. Lists are copied as-is and not descended into.

    When ``start_after_tag`` is given, copying only starts once a dict-valued
    key equal to that tag has been seen; it then applies inside that dict and
    to the keys that follow it in the same dict.

    Args:
        d: Nested dict to walk.
        clean_d: Dict receiving the copied entries; mutated in place.
        start_after_tag: Key after which copying starts, or ``None`` to copy
            from the very beginning.
        reached_tag: Recursion state telling whether the tag has already been
            seen. Ignored when ``start_after_tag`` is ``None``.

    Returns:
        None. The result is delivered through ``clean_d``.
    """
    tag_mode = start_after_tag is not None

    for key, value in d.items():
        if isinstance(value, dict):
            if tag_mode:
                # The flag is never reset, so siblings that come after the
                # tag in this dict are treated as "reached" as well.
                if key == start_after_tag:
                    reached_tag = True
                process_xml_dict(value, clean_d, start_after_tag, reached_tag)
            else:
                process_xml_dict(value, clean_d)
            continue

        if '@' in key:
            continue
        if tag_mode and not reached_tag:
            continue
        clean_d[key] = value

But it does not work

   # No start_after_tag is passed, so the whole structure is walked and
   # every non-'@' leaf is collected into clean_d.
   # (d is presumably the nested dict shown above.)
   clean_d = dict()
   process_xml_dict(d,clean_d)
   print(clean_d)

Should output

{'rpc': {
  'edit-config': {'target': {'running': None},
   'config': {'test': {
     'common': {'abc': 'forward',
      'bbc': {'geo-model': 'texas',
       'remote': [{'id': '288',
         'transport': 'tcp',
         'port': '8',
         'ipv4-address': '0.0.0.0'},
        {'distributed-system-id': '318',
         'transport': 'tcp',
         'port': '8',
         'ipv4-address': '0.0.0.0'},
        {'distributed-system-id': '348',
         'transport': 'tcp',
         'port': '8',
         'ipv4-address': '0.0.0.0'}]}}}}}}}

But now it outputs

{
 'running': None,
 'abc': 'forward',
 'geo-model': 'texas',
 'remote': [{'id': '288',
   'transport': 'tcp',
   'port': '8',
   'ipv4-address': '0.0.0.0'},
  {'distributed-system-id': '318',
   'transport': 'tcp',
   'port': '8',
   'ipv4-address': '0.0.0.0'},
  {'distributed-system-id': '348',
   'transport': 'tcp',
   'port': '8',
   'ipv4-address': '0.0.0.0'}]}

And if I input

# Same call, but collection should only start once the 'config' key is reached.
clean_d = dict()
process_xml_dict(d,clean_d,start_after_tag='config')
print(clean_d)

It should output

{'test': {'common': {'abc': 'forward',
    'bbc': {'geo-model': 'texas',
    'remote': [{'id': '288',
      'transport': 'tcp',
      'port': '8',
      'ipv4-address': '0.0.0.0'},
     {'distributed-system-id': '318',
      'transport': 'tcp',
      'port': '8',
      'ipv4-address': '0.0.0.0'},
     {'distributed-system-id': '348',
      'transport': 'tcp',
      'port': '8',
      'ipv4-address': '0.0.0.0'}]}}}}

but now it outputs

{'abc': 'forward',
 'geo-model': 'texas',
 'remote': [{'id': '288',
   'transport': 'tcp',
   'port': '8',
   'ipv4-address': '0.0.0.0'},
  {'distributed-system-id': '318',
   'transport': 'tcp',
   'port': '8',
   'ipv4-address': '0.0.0.0'},
  {'distributed-system-id': '348',
   'transport': 'tcp',
   'port': '8',
   'ipv4-address': '0.0.0.0'}]}

What am I doing wrong? And how would I modify my function to output expected output?

Thankful for any input.


Solution

  • This will work for you

    # Sample input (an xmltodict-style parse result). The '@'-prefixed keys
    # are the ones the filter functions are meant to drop.
    orig_dict = {
        'rpc': {
            '@xmlns': 'urn:1.0',
            '@message-id': '4',
            'edit-config': {
                'target': {'running': None},
                'config': {
                    'test': {
                        '@xmlns': 'urn:2.0',
                        'common': {
                            'abc': 'forward',
                            'bbc': {
                                'geo-model': 'texas',
                                'remote': [
                                    {
                                        'id': '288',
                                        'transport': 'tcp',
                                        'port': '8',
                                        'ipv4-address': '0.0.0.0',
                                    },
                                    {
                                        'distributed-system-id': '318',
                                        'transport': 'tcp',
                                        'port': '8',
                                        'ipv4-address': '0.0.0.0',
                                    },
                                    {
                                        'distributed-system-id': '348',
                                        'transport': 'tcp',
                                        'port': '8',
                                        'ipv4-address': '0.0.0.0',
                                    },
                                ],
                            },
                        },
                    },
                },
            },
        },
    }
    
    def get_filter_dict(dict_):
        """Return a new nested structure without ``@``-prefixed keys.

        Every key that starts with ``"@"`` is dropped at any depth, whatever
        its value is, including inside dicts that sit in lists (repeated
        elements). The input is not modified; dicts and lists are rebuilt,
        other values are reused as-is.

        Args:
            dict_: The nested dict to filter.

        Returns:
            The filtered dict, or an empty dict when ``dict_`` is not a dict.
        """
        def _clean(value):
            # Rebuild containers recursively; pass every other value through.
            if isinstance(value, dict):
                return {
                    key: _clean(item)
                    for key, item in value.items()
                    # Only string keys can carry the '@' marker; other key
                    # types are kept instead of raising AttributeError.
                    if not (isinstance(key, str) and key.startswith("@"))
                }
            if isinstance(value, list):
                return [_clean(item) for item in value]
            return value

        if not isinstance(dict_, dict):
            return {}
        return _clean(dict_)
    
    def get_dict_start_after_tag(dict_, res=None, start_after_tag=None):
        """Filter ``dict_``, optionally starting below a given key.

        Without ``start_after_tag`` the whole of ``dict_`` is passed through
        ``get_filter_dict``. With it, ``dict_`` is searched depth-first, in
        key order and through dict values only, for the first key equal to
        ``start_after_tag``; the value under that key is filtered.

        Args:
            dict_: Nested dict to process.
            res: Optional one-element list used as an output slot; the result
                is stored in ``res[0]``. A fresh ``[None]`` is created when
                omitted.
            start_after_tag: Key whose value should be extracted, or a falsy
                value to process the whole dict.

        Returns:
            Without a tag: the ``res`` list itself, with the filtered dict in
            ``res[0]``. With a tag: the filtered sub-structure, also stored in
            ``res[0]``, or ``None`` when the tag is not found (``res`` is then
            left untouched).
        """
        # Create the holder per call: a ``[None]`` default would be one list
        # shared by every call that omits ``res``.
        if res is None:
            res = [None]

        if not start_after_tag:
            res[0] = get_filter_dict(dict_)
            return res

        def _find(node):
            # Return (found, value) for the first matching key. The flag is
            # needed because the matched value itself may be None.
            for key, value in node.items():
                if key == start_after_tag:
                    return True, value
                if isinstance(value, dict):
                    found, hit = _find(value)
                    if found:
                        return True, hit
            return False, None

        found, subtree = _find(dict_)
        if not found:
            return None

        res[0] = get_filter_dict(subtree)
        return res[0]
    
    # One-element list used as an output slot: each call stores its result in res[0].
    res = [None]
    # No tag given: res[0] becomes the filtered version of the whole dict.
    get_dict_start_after_tag(orig_dict, res)
    # {'rpc': {'edit-config': {'target': {'running': None}, 'config': {'test': {'common': {'abc': 'forward', 'bbc': {'geo-model': 'texas', 'remote': [{'id': '288', 'transport': 'tcp', 'port': '8', 'ipv4-address': '0.0.0.0'}, {'distributed-system-id': '318', 'transport': 'tcp', 'port': '8', 'ipv4-address': '0.0.0.0'}, {'distributed-system-id': '348', 'transport': 'tcp', 'port': '8', 'ipv4-address': '0.0.0.0'}]}}}}}}}
    # Tag given: res[0] is overwritten with the filtered sub-structure found under "config".
    get_dict_start_after_tag(orig_dict, res, "config")
    # {'test': {'common': {'abc': 'forward', 'bbc': {'geo-model': 'texas', 'remote': [{'id': '288', 'transport': 'tcp', 'port': '8', 'ipv4-address': '0.0.0.0'}, {'distributed-system-id': '318', 'transport': 'tcp', 'port': '8', 'ipv4-address': '0.0.0.0'}, {'distributed-system-id': '348', 'transport': 'tcp', 'port': '8', 'ipv4-address': '0.0.0.0'}]}}}}
    
    # Only the second result is printed; the first one was overwritten in res[0].
    print(res[0])