Search code examples
pythonxmlsax

Python XML Sax to dictionary


I'm trying to parse some relatively large XML files using the standard SAX parser in Python, and I would prefer to avoid manually saving/checking each element into a dictionary, because I'm working with multiple XML schemas and some are quite large.

Obviously the code example below doesn't work, but it's what I've got so far. Other low-memory solutions are also welcome.

(Note: the complete XML files contain more than just two levels of nested structures.)

from xml import sax
from cStringIO import StringIO

# Sample input: a single <n1:product> holding nested groups (status,
# producttext) and a repeated element (two <n7:term> entries under terms).
xml_string = """<?xml version="1.0" encoding="iso-8859-1"?>
<n1:products xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xmlns:xsd="http://www.w3.org/2001/XMLSchema" xmlns:n7="http://foo.bar.tld" xmlns:n1="http://foo.bar.tld">
  <n1:product>
    <n1:status>
      <n7:created>2005-02-08T18:18:30.53</n7:created>
      <n7:updated>2008-09-18T10:29:58.26</n7:updated>
    </n1:status>
    <n1:productid>28321503</n1:productid>
    <n1:producttext>
      <n7:text>Some product info</n7:text>
      <n7:type>Info</n7:type>
    </n1:producttext>
    <n1:terms>
      <n7:term>
        <n7:number>1</n7:number>
        <n7:name>Term1</n7:name>
      </n7:term>
      <n7:term>
        <n7:number>2</n7:number>
        <n7:name>Term2</n7:name>
      </n7:term>
    </n1:terms>   
  </n1:product>
</n1:products>
"""

class XML_Handler(sax.ContentHandler):
    """Collect the text of elements inside each ``product`` into a flat dict.

    The dict is keyed by element local name and is printed when the
    ``product`` element closes.  Nesting is not preserved: a later element
    with the same local name overwrites the earlier value, and container
    elements without text of their own are skipped.
    """

    def __init__(self):
        sax.ContentHandler.__init__(self)
        self.data = {}
        self.vbuffer = ''
        # Must exist before the first <product>; endElementNS reads it for
        # every closing tag, including ones that precede any product.
        self.fetch = False

    def startElementNS(self, name, qname, attrs):
        """Start a fresh record whenever a ``product`` element opens."""
        (ns, localname) = name
        if localname == 'product':
            self.data = {}
            # Drop any text seen before the product so it cannot leak into
            # the first field of this record.
            self.vbuffer = ''
            self.fetch = True

    def endElementNS(self, name, qname):
        """Store the buffered text for a field, or emit the finished record."""
        (ns, localname) = name
        if localname == 'product':
            # Got my data, call some process function..
            # Parenthesised form prints the same under Python 2 and 3.
            print(self.data)
            self.fetch = False
        elif self.fetch:
            value = self.vbuffer.strip()
            if value:
                self.data[localname] = value
        self.vbuffer = ''

    def characters(self, ch):
        """Buffer raw character data; stripping happens once per element.

        The parser may deliver one text node in several chunks, so stripping
        each chunk separately would eat whitespace at chunk boundaries.
        """
        self.vbuffer += ch

if __name__ == '__main__':
    # Expose the in-memory sample document to the parser as a byte stream.
    source = sax.xmlreader.InputSource()
    source.setByteStream(StringIO(xml_string))

    # Namespace mode is switched on so the handler's startElementNS /
    # endElementNS callbacks (which unpack (ns, localname) names) are used.
    reader = sax.make_parser()
    reader.setFeature(sax.handler.feature_namespaces, 1)
    reader.setContentHandler(XML_Handler())
    reader.parse(source)

What I'm trying to achieve:

# Desired output for the sample document: one nested dict per <product>,
# with repeated <term> elements collected into a list.
result = {
    'status' : {
        'created' : '2005-02-08T18:18:30.53',
        'updated' : '2008-09-18T10:29:58.26',
    },
    'productid' : '28321503',
    'producttext' : {
        # Matches the <n7:text> content of the sample XML exactly.
        'text' : 'Some product info',
        'type' : 'Info',
    },
    'terms' : [{'number': '1', 'name': 'Term1'}, {'number': '2', 'name': 'Term2'}]
}

Solution

  • Finally got this working. It might not be the most robust solution, but good enough for my use case.

    #!/usr/bin/env python
    # -*- coding: utf-8 -*-
    
    import simplejson as json
    from xml import sax
    try:
        from cStringIO import StringIO
    except ImportError:
        from StringIO import StringIO
    
    # Sample input: a single <n1:product> holding nested groups (status,
    # producttext) and a repeated element (two <n7:term> entries under terms).
    xml_string = '''<?xml version="1.0" encoding="iso-8859-1"?>
    <n1:products xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xmlns:xsd="http://www.w3.org/2001/XMLSchema" xmlns:n7="http://foo.bar.tld" xmlns:n1="http://foo.bar.tld">
      <n1:product>
        <n1:status>
          <n7:created>2005-02-08T18:18:30.53</n7:created>
          <n7:updated>2008-09-18T10:29:58.26</n7:updated>
        </n1:status>
        <n1:productid>28321503</n1:productid>
        <n1:producttext>
          <n7:text>Some product info</n7:text>
          <n7:type>Info</n7:type>
        </n1:producttext>
        <n1:terms>
          <n7:term>
            <n7:number>1</n7:number>
            <n7:name>Term1</n7:name>
          </n7:term>
          <n7:term>
            <n7:number>2</n7:number>
            <n7:name>Term2</n7:name>
          </n7:term>
        </n1:terms>   
      </n1:product>
    </n1:products>
    '''
    
    def display(data):
        """Pretty-print *data*, eliding anything nested deeper than ten levels."""
        from pprint import pprint as pretty_print
        pretty_print(data, depth=10)
    
    class Element(object):
        """Attribute bag for one XML element.

        Child elements and text values are stored directly in the instance
        ``__dict__`` under the child's local name; a name that occurs more
        than once is collected into a list.
        """

        def setData(self, key, value):
            """Store *value* under *key*, replacing whatever was there."""
            self.__dict__[key] = value

        def setObject(self, key, object):
            """Attach *object* under *key*, turning repeated keys into a list.

            The first value for a key is stored as-is.  A second value wraps
            both in a list, and later values are appended to that list.
            """
            # NOTE: the parameter name shadows the builtin; it is kept so
            # callers passing it by keyword keep working.
            attrs = self.__dict__
            if key not in attrs:
                attrs[key] = object
                return
            existing = attrs[key]
            if isinstance(existing, list):
                existing.append(object)
            elif isinstance(existing, tuple):
                # Tuples cannot grow in place, so rebuild as a list.
                attrs[key] = list(existing) + [object]
            else:
                attrs[key] = [existing, object]

        def jsonable(self):
            """Return this element as nested plain dicts, lists and scalars.

            The result is a fresh structure; the element itself is left
            untouched, so the method can be called repeatedly.
            """
            return self._traverse(self.__dict__)

        # http://stackoverflow.com/questions/1036409/recursively-convert-python-object-graph-to-dictionary/1118038#1118038
        def _traverse(self, obj):
            """Recursively convert *obj* into dicts, lists and leaf values."""
            # Strings are leaves.  On Python 3 they expose __iter__, so they
            # must be ruled out before the generic iterable branch below.
            if isinstance(obj, (str, bytes)):
                return obj
            if isinstance(obj, dict):
                # Build a new dict rather than rewriting the one passed in.
                return dict((k, self._traverse(v)) for k, v in obj.items())
            if hasattr(obj, "__iter__"):
                return [self._traverse(v) for v in obj]
            if hasattr(obj, "__dict__"):
                # Skip methods and private attributes of nested objects.
                return dict((key, self._traverse(value))
                            for key, value in obj.__dict__.items()
                            if not callable(value) and not key.startswith('_'))
            return obj
    
    class ObjBuilder(sax.ContentHandler):
        """Build an ``Element`` tree for every *node* element and display it.

        Parsing is streamed: a tree is started when an element whose local
        name equals *node* opens, and it is passed to ``display`` (as plain
        dicts/lists) when that element closes.  Only one tree is held at a
        time.
        """

        def __init__(self, node):
            sax.ContentHandler.__init__(self)
            self.obj = []           # stack of open Elements, root first
            self.node = node        # local name of the record element
            self.fetch = False      # True while inside a record element
            self.rootobject = None  # Element for the current/last record
            self.__buffer = []      # raw text chunks of the current element

        def startElementNS(self, name, qname, attrs):
            """Open a new record, or push a child Element onto the stack."""
            (ns, localname) = name
            if self.node == localname:
                self.fetch = True
                o = Element()
                self.rootobject = o
                # Start from a clean stack so finished records are not kept
                # alive for the rest of the parse.
                self.obj = [o]
                self.__buffer = []
            elif self.fetch:
                self.__buffer = []
                o = Element()
                self.obj[-1].setObject(localname, o)
                self.obj.append(o)

        def characters(self, contents):
            """Buffer raw text; it is joined and stripped when the element ends.

            The parser may split one text node into several chunks, so
            stripping each chunk would drop whitespace at chunk boundaries.
            """
            if self.fetch:
                self.__buffer.append(contents)

        def endElementNS(self, name, qname):
            """Close the current element, or display the finished record."""
            (ns, localname) = name
            if self.node == localname:
                self.fetch = False
                display(self.rootobject.jsonable())
                # Release the finished tree's stack entries.
                self.obj = []
            elif self.fetch:
                text = ''.join(self.__buffer).strip()
                if text:
                    self._store_text(self.obj[-2], localname, text)
                del self.obj[-1]
                self.__buffer = []

        def _store_text(self, parent, key, text):
            """Replace the placeholder Element for a text-only child with *text*."""
            current = parent.__dict__.get(key)
            if isinstance(current, list):
                # Repeated element: the placeholder just closed is the last
                # list entry.  Overwrite only that entry, not the whole list.
                current[-1] = text
            else:
                parent.setData(key, text)
    
    if __name__ == '__main__':
        # Expose the in-memory sample document to the parser as a byte stream.
        source = sax.xmlreader.InputSource()
        source.setByteStream(StringIO(xml_string))

        # Namespace mode is switched on so the handler's startElementNS /
        # endElementNS callbacks (which unpack (ns, localname) names) are used.
        reader = sax.make_parser()
        reader.setFeature(sax.handler.feature_namespaces, 1)
        reader.setContentHandler(ObjBuilder('product'))
        reader.parse(source)