Search code examples
pythonxmllxmlsequenceiterparse

Iterparse big XML, with low memory footprint, and get all, even nested, Sequence Elements


I have written a small python script to parse XML data based on Liza Daly's blog in Python. However, my code does not parse all the nodes. So for example when a person has had multiple addresses then it takes only the first available address. The XML tree would look like this:

- lgs
    - entities
        - entity
            - id
            - name
            - addressess
                - address
                    - address1
                - address
                    - address1
        - entity
            - id
              (...)

and this would be the python script:

import os
import time
from datetime import datetime
import lxml.etree as ET
import pandas as pd

xml_file = '.\\FILE.XML'

file_name, file_extension = os.path.splitext(os.path.basename(xml_file))

def fast_iter(context, *args, **kwargs):
# iterate through the XML file and create a dictionary 
    xml_dict = {  "Id":[]
    , "name":[]
    , "address":[]
    , "type":[] }

    def try_to_append(xml_column, node):
        # find and append XML nodes to the empty dictionary
        try:
            xml_dict[xml_column].append(elem.find(node).text)
        except:
            xml_dict[xml_column].append('')

    for event, elem in context:
        try_to_append('Id', 'id')
        try_to_append('name', 'name')
        try_to_append('address', 'addresses/address/address1'
        elem.clear()

        for ancestor in elem.xpath("ancestor-or-self::*"):
            while ancestor.getprevious() is not None:
                del ancestor.getparent()[0]
        del context
        return xml_dict

context = ET.iterparse(xml_file, tag='entity')
xml_dict = fast_iter(context)
df = pd.DataFrame(xml_dict)

So what I get is:

ID | name     | address        | Type
1  | John Doe | Pythonstreet 4 | A

But John Doe has 3 addresses on his name where he has lived. So I would expect:

ID | name     | address        | Type
1  | John Doe | Pythonstreet 4 | A
1  | John Doe | SQL street 33  | A
1  | John Doe | C++ street 99  | A

And as you can see, because the only extra node is address, the rest does not change and only the address is added extra. SQL Server would do this automatically when you use OPENROWSET() with BULK. However, this file is greater than 3GB so that wouldn't work.

UPDATE: XML Example (fake addresses)

<?xml version='1.0' encoding='UTF-8'?>
<ffl>
<version>12345</version>
<entities>

    <entity id="1124353" version="12345">
        <name>DAVID, Beckham</name>
        <footId>1021</footId>
        <footCode>FIF</footCode>
        <createdDate>09/02/1991</createdDate>

        <source>FIF</source>
        <OriginalSource>FIFA</OriginalSource>
        <pobs>
            <pob>Leytonstone, London, United Kingdom</pob>
        </pobs>
        <dobs>
            <dob>May 02, 1975</dob>
        </dobs>
        <titles>
            <title>Football player</title>
        </titles>
        <addresses>
            <address>
                <address></address>
                <city>London</city>
                <country>UK</country>
                <countryName>UNITED KINGDOM</countryName>
                <postalCode>SE10 0JF</postalCode>
            </address>
            <address>
                <address1>35-37 Parkgate Road</address1>
                <city>London</city>
                <country>UK</country>
                <countryName>UNITED KINGDOM</countryName>
                <postalCode>SW11 4NP</postalCode>
            </address>
        </addresses>
    </entity>

    <entity id="1184359" version="12345">
        <name>CRISTIANO, Ronaldo</name>
        <footId>1022</footId>
        <footCode>FIF</footCode>
        <createdDate>20/03/2003</createdDate>
        <source>FIF</source>
        <OriginalSource>FIFA</OriginalSource>
        <pobs>
            <pob>Funchal, Madeira, Portugal</pob>
        </pobs>
        <dobs>
            <dob>February 05, 1985</dob>
        </dobs>
        <titles>
            <title>Football player</title>
        </titles>
        <addresses>
            <address>
                <address>Avenida da Boavista 1837</address>
                <city>Porto</city>
                <country>PT</country>
                <countryName>PORTUGAL</countryName>
                <postalCode>4100-133</postalCode>
            </address>
            <address>
                <address1>Extramuros 74</address1>
                <city>Madrid</city>
                <country>ES</country>
                <countryName>Spain</countryName>
                <postalCode>28400</postalCode>
            </address>
            <address>
                <address1>Viale Certosa 29</address1>
                <city>Turin</city>
                <country>IT</country>
                <countryName>Italy</countryName>
                <postalCode>10093</postalCode>
            </address>
        </addresses>
    </entity>

    <entity id="1984359" version="12345">
        <name>LIONEL, Messi</name>
        <footId>1023</footId>
        <footCode>FIF</footCode>
        <createdDate>09/02/2008</createdDate>
        <source>FIF</source>
        <OriginalSource>FIFA</OriginalSource>
        <pobs>
            <pob>Rosario, Argentina</pob>
        </pobs>
        <dobs>
            <dob>June 24, 1987</dob>
        </dobs>
        <titles>
            <title>Football player</title>
        </titles>
        <addresses>
            <address>
                <address>Almeyra 2588</address>
                <city>San Martin</city>
                <state>Buenos Aires</state>
                <country>AR</country>
                <countryName>ARGENTINA</countryName>
                <postalCode>N/A</postalCode>
            </address>
            <address>
                <address1>Comandante Izarduy 67</address1>
                <city>Barcelona</city>
                <country>ES</country>
                <countryName>SPAIN</countryName>
                <postalCode>08940</postalCode>
            </address>
            <address>
                <address1>Humahuaca 4425</address1>
                <city>Buenos Aires</city>
                <country>AR</country>
                <countryName>ARGENTINA</countryName>
                <postalCode>N/A</postalCode>
            </address>
        </addresses>
    </entity>

</entities>

</ffl>

Solution

  • Comment: As it now only outputs results

    Outputing results are only for demonstration, tracing and debuging.
    To write a record and addresses into a SQL database, for example using sqlite3, do:

    c.execute("INSERT INTO entity(id, name) VALUES(:id, :name)", record)
    addresses = []
    for addr in record['addresses']:
        addr[1].update({'id': record['id']})
        addresses.append(addr[1])
    c.executemany("INSERT INTO adresses(id, address, city) VALUES(:id, :address, :city)", addresses)
    

    To flatten for pandas
    Preconditon outside the loop: df = pd.DataFrame()

    from copy import copy
    
    addresses = copy(record['addresses'])
    del record['addresses']
    
    df_records = []
    for addr in addresses:
        record.update(addr[1])
        df_records.append(record)
    
    df = df.append(df_records, ignore_index=True)
    

    Question: Use etree.iterparse to include all nodes in XML file

    The following class Entity do:

    • Parse the XML File using lxml.etree.iterparse.
    • There is no File size limit, as the <entity>...</entity> Element Tree are deleted after processing.
    • Builds from every <entity>...</entity> Tree a dict {tag, value, ...}.
    • Using of generator objects to yield the dict.
    • Sequence Elements, e.g. <addresses>/<address> are List of Tuple [(address, {tag, text})....

    ToDo:

    • To flatten into many Records, loop record['addresses']
    • To equal different tag names: address and address1
    • To flatten, Sequence tags, e.g. <titels>, <probs> and <dobs>

    from lxml import etree
    
    class Entity:
        def __init__(self, fh):
            """
            Initialize 'iterparse' to only generate 'end' events on tag '<entity>'
    
            :param fh: File Handle from the XML File to parse
            """
            self.context = etree.iterparse(fh, events=("end",), tag=['entity'])
    
        def _parse(self):
            """
            Parse the XML File for all '<entity>...</entity>' Elements
            Clear/Delete the Element Tree after processing
    
            :return: Yield the current '<entity>...</entity>' Element Tree
            """
            for event, elem in self.context:
                yield elem
    
                elem.clear()
                while elem.getprevious() is not None:
                    del elem.getparent()[0]
    
        def sequence(self, elements):
            """
            Expand a Sequence Element, e.g. <titels> to a Tuple ('titel', text).
            If found a nested Sequence Element, e.g. <address>,
              to a Tuple ('address', {tag, text})
    
            :param elements: The Sequence Element
            :return: List of Tuple [(tag1, value), (tag2, value), ... ,(tagn, value))
            """
            _elements = []
            for elem in elements:
                if len(elem):
                    _elements.append((elem.tag, dict(self.sequence(elem))))
                else:
                    _elements.append((elem.tag, elem.text))
    
            return _elements
    
        def __iter__(self):
            """
            Iterate all '<entity>...</entity>' Element Trees yielded from self._parse()
    
            :return: Dict var 'entity' {tag1, value, tag2, value, ... ,tagn, value}}
            """
            for xml_entity in self._parse():
                entity = {'id': xml_entity.attrib['id']}
    
                for elem in xml_entity:
                    # if elem is Sequence
                    if len(elem):
                        # Append tuple(tag, value)
                        entity[elem.tag] = self.sequence(elem)
                    else:
                        entity[elem.tag] = elem.text
    
                yield entity
    
    if __name__ == "__main__":
        with open('.\\FILE.XML', 'rb') as in_xml_
            for record in Entity(in_xml):
                print("record:{}".format(record))
    
                for key, value in record.items():
                    if isinstance(value, (list)):
                        #print_list(key, value)
                        print("{}:{}".format(key, value))
                    else:
                        print("{}:{}".format(key, value))
    

    Output: Shows only the first Record and only 4 fields.
    Note: There is a pitfall with unique tag names: address and address1

    record:{'id': '1124353', 'titles': {'title': 'Foot... (omitted for brevity)
    id:1124353
    name:DAVID, Beckham
    titles:[('title', 'Football player')]
    addresses:
        address:{'city': 'London', 'address': None, 'post... (omitted for brevity)
        address:{'city': 'London', 'address1': '35-37 Par... (omitted for brevity)
    

    Tested with Python: 3.5 - lxml.etree: 3.7.1