Search code examples
pythonxmlxml.etreeiterparse

Python iterparse large XML while filtering with elements and children


I am attempting to parse product data from icecat. The data comes in large xml files. (3-7gb).

In order to reduce the amount of product data I am bringing in, I need to filter this list before moving to my next step. Particularly I need to filter by the "Updated" and "On_Market" values in the "file" element, and IF the "Country_Markets" child exists, I need to see if that (possible) list of children contains <"Country_Market = "US">

I am able to get the On_Market filter to work, but can't figure out how to structure the check of the Country_Markets child, or implement the date filter.

Where I am currently:

PYTHON:

from xml.etree.ElementTree import iterparse

file_path = 'index.xml'
dict_list = []
date = "20220803143328"

for _, elem in iterparse(file_path, events=("end",)):
    for child in elem:
        print(child)
        if child.tag == 'Country_Markets':
            if child.attrib['Country_Market'] == "US" OR child['Country_Markets'] is None:
                if elem.tag == "file":
                    if elem.attrib['On_Market'] == "1":
                        if elem.attrib['Updated']>= date: 
                            dict_list.append({'IceId': elem.attrib['Product_ID'],
                                            'LastUpdate': elem.attrib['Updated'],
                                            'PartNum': elem.attrib['Prod_ID'],
                                            'OnMarket': elem.attrib['On_Market']})

        elem.clear()


df = pd.DataFrame(dict_list)

EDIT NEW PYTHON APPROACH

I am trying to implement a different package which seems to do the trick, except I haven't been able to figure out how to incorporate the EAN_UPCS into my output...

from lxml import etree

context = etree.iterparse(file_path,  events=("start", "end"),)
for event, elem in context:
    if elem.tag == 'file':
        for child1 in elem:
            if child1.tag == 'Country_Markets':
                for child2 in child1:
                    if child2.attrib['Value'] == "US":
                        if elem.attrib['On_Market'] == "1":
                            if elem.attrib['Updated']>= "20220803143328":
                                print(f"'IceId': {elem.attrib['Product_ID']}")
                                print(f"'LastUpdate': {elem.attrib['Updated']}")
                                print(f"'PartNum': {elem.attrib['Prod_ID']}")
                                print(f"'OnMarket': {elem.attrib['On_Market']}")

                                if child1.tag == 'EAN_UPCS':
                                    for child2 in child1:
                                        if child2.attrib['IsApproved']  == "1":
                                            print(child2.attrib['Value'])

EDIT 2:

I can get the result I am after if I run segments of the loop, but once I run the entire loop, I either lose information, or append all information.

dict_list = []

context = etree.iterparse(file_path,  events=("end",))
for event, elem in context:
    if elem.tag == 'file':
        id = elem.attrib['Product_ID']
        print(id)
        valid = "no"
        dist = ()
        last_date = ()
        market = ()
        upc_list = []
        if elem.attrib['On_Market'] == "1":
            market = "yes"
        if elem.attrib['Updated']>= "20170803143328":
            last_date = "yes"
        for child1 in elem:
            if child1.tag =='Country_Markets': 
                print('markets found')
                for child2 in child1:
                    if child2.attrib['Value'] == "US":
                        dist = "yes"
                        
                    else:
                        if dist != "yes":
                            dist = "no"
                    #had to come up with a way to not overwrite the distibution while iterating throug elements        
            elif elem.find("Country_Markets") is None:
                print("No Markets")
                dist = "yes"
         
            if child1.tag == 'EAN_UPCS':
                    for child2 in child1:
                        if child2.attrib['IsApproved']  == "1":
                            upc_list.append(child2.attrib['Value'])
                   
            
        print(id, dist, last_date, market, upc_list)     
        if dist == "yes" and  last_date == "yes" and market == "yes":   
            dict_list.append({elem.attrib['Product_ID']:{'PartNum':elem.attrib['Prod_ID'], 'Updated' : elem.attrib['Updated'], 'UPCs': upc_list}} )
        continue
    elem.clear()
del context

dict_list

XML:

<?xml version="1.0" encoding="UTF-8"?>
<!DOCTYPE ICECAT-interface SYSTEM "https://data.icecat.biz/dtd/files.index.dtd">
<!--source: Icecat.biz 2022-->
<ICECAT-interface xmlns:xsi="https://www.w3.org/2001/XMLSchema-instance" xsi:noNamespaceSchemaLocation="https://data.icecat.biz/xsd/files.index.xsd">
    <files.index Generated="20220930000002">
        <file path="export/level4/US/1402.xml" Limited="No" HighPic="https://images.icecat.biz/img/gallery/1402_9631004284.jpg" HighPicSize="744457" HighPicWidth="2670" HighPicHeight="1407" Product_ID="1402" Updated="20220212085227" Quality="ICECAT" Prod_ID="C4893A" Supplier_id="1" Catid="377" On_Market="1" Model_Name="80 Value Pack 350-ml Yellow DesignJet Ink Cartridge and Printhead" Product_View="92380" Date_Added="20051028000000">
            <Country_Markets>
                <Country_Market Value="BE"/>
                <Country_Market Value="FR"/>
                <Country_Market Value="US"/>
                <Country_Market Value="GB"/>
                <Country_Market Value="DE"/>
                <Country_Market Value="CH"/>
                <Country_Market Value="IT"/>
                <Country_Market Value="CA"/>
            </Country_Markets>
        </file>
        <file path="export/level4/US/1414.xml" Limited="No" HighPic="https://images.icecat.biz/img/norm/high/1414-HP.jpg" HighPicSize="43288" HighPicWidth="400" HighPicHeight="400" Product_ID="1414" Updated="20220711134129" Quality="ICECAT" Prod_ID="C6614NE" Supplier_id="1" Catid="377" On_Market="1" Model_Name="C6614NE" Product_View="98879" Date_Added="20051023000000">
        </file>
        <file path="export/level4/US/1415.xml" Limited="No" HighPic="https://images.icecat.biz/img/norm/high/1415-HP.jpg" HighPicSize="43235" HighPicWidth="400" HighPicHeight="400" Product_ID="1415" Updated="20190404035203" Quality="ICECAT" Prod_ID="51650CE" Supplier_id="1" Catid="377" On_Market="1" Model_Name="50 Cyan Inkjet Print Cartridge" Product_View="60706" Date_Added="20051023000000">
            <EAN_UPCS>
                <EAN_UPC Value="0088698200223" IsApproved="0" Format="GTIN-13"/>
                <EAN_UPC Value="088698200223" IsApproved="0" Format="GTIN-12"/>
            </EAN_UPCS>
            <Country_Markets>
                <Country_Market Value="BE"/>
                <Country_Market Value="DE"/>
                <Country_Market Value="IT"/>
                <Country_Market Value="UA"/>
                <Country_Market Value="DZ"/>
            </Country_Markets>
        </file>
        <file path="export/level4/US/7966778.xml" Limited="No" HighPic="https://inishop.com/img/norm/high/7966778-4280.jpg" HighPicSize="814349" HighPicWidth="2761" HighPicHeight="1600" Product_ID="7966778" Updated="20201106094740" Quality="ICECAT" Prod_ID="AX3U1600XC2G79-3X" Supplier_id="2634" Catid="911" On_Market="1" Model_Name="XPG Xtreme Series, DDR3, 1600 MHz, CL7, 6GB (2GB x 3)" Product_View="7328" Date_Added="20110223000000">
            <EAN_UPCS>
                <EAN_UPC Value="4713435791172" IsApproved="1" Format="GTIN-13"/>
            </EAN_UPCS>
            <Country_Markets>
                <Country_Market Value="US"/>
            </Country_Markets>
        </file>
        <file path="export/level4/US/7966779.xml" Limited="No" HighPic="https://inishop.com/img/norm/high/7966778-4280.jpg" HighPicSize="793195" HighPicWidth="2761" HighPicHeight="1600" Product_ID="7966779" Updated="20201106094740" Quality="ICECAT" Prod_ID="AX3U1600XC4G79-3X" Supplier_id="2634" Catid="911" On_Market="1" Model_Name="XPG Xtreme Series, DDR3, 1600 MHz, CL7, 6GB (2GB x 3)" Product_View="6515" Date_Added="20110223000000">
            <EAN_UPCS>
                <EAN_UPC Value="4713435791714" IsApproved="1" Format="GTIN-13"/>
            </EAN_UPCS>
            <Country_Markets>
                <Country_Market Value="LU"/>
                <Country_Market Value="CH"/>
            </Country_Markets>
        </file>
        <file path="export/level4/US/7966780.xml" Limited="No" HighPic="https://inishop.com/img/norm/high/7966780-2331.jpg" HighPicSize="724700" HighPicWidth="2761" HighPicHeight="1600" Product_ID="7966780" Updated="20201106094740" Quality="ICECAT" Prod_ID="AX3U1600XC4G79-2X" Supplier_id="2634" Catid="911" On_Market="1" Model_Name="XPG Xtreme Series, DDR3, 1600 MHz, CL7, 8GB (4GB x 2)" Product_View="6902" Date_Added="20110223000000">
            <EAN_UPCS>
                <EAN_UPC Value="4713435791707" IsApproved="1" Format="GTIN-13"/>
            </EAN_UPCS>
            <Country_Markets>
                <Country_Market Value="LU"/>
                <Country_Market Value="CH"/>
            </Country_Markets>
        </file>
    </files.index>
</ICECAT-interface>

The goal is to be able to filter out any items that are older than a certain date, not on market, and explicitly not for sale in the US. (Items that don't have Country_Markets elements are to be included in the output.)


Solution

  • Consider this adjusted iterparse approach without the inner loops across child elements. Below uses flag variables to be turned on and off conditionally while walking down the tree. A dictionary of data is built and iteratively appended within <file>...</file> context.

    from lxml import etree 
    #from xml.etree import ElementTree as etree
    
    file_path = "Input.xml"
    files_data = []
    
    context = etree.iterparse(file_path,  events=("start", "end"),)
    for event, elem in context:
        if event == "start":
            if elem.tag == 'file':
                data = {}
                US_Flag = 0
                Date_Flag = 0
                if elem.attrib['Updated'] >= "20220803143328":
                    Date_Flag = 1
                    data['IceId'] = elem.attrib['Product_ID']
                    data['LastUpdate'] = elem.attrib['Updated']
                    data['PartNum'] = elem.attrib['Prod_ID']
                    data['OnMarket'] = elem.attrib['On_Market']
                    data['EAN_UPCS'] = []
    
            if elem.tag == "EAN_UPC" and Date_Flag == 1:
                if elem.attrib['IsApproved'] == "1":
                    data['EAN_UPCS'].append(elem.attrib['Value'])
    
            if elem.tag == "Country_Market" and Date_Flag == 1:
                data['Country_Market'] = "US"
                if elem.attrib["Value"] == "US":
                    US_Flag = 1
    
        if event == "end":
            if elem.tag == "file" and (US_Flag == 1 or "Country_Market" not in data):
                files_data.append(data)
    
            elem.clear()
    

    Output

    For demonstration, below results remove >= "20220803143328" condition since posted XML does not contain dates in that range

    from pprint import pprint
    
    pprint(files_data)
    [{'Country_Market': 'US',
      'EAN_UPCS': [],
      'IceId': '1402',
      'LastUpdate': '20220212085227',
      'OnMarket': '1',
      'PartNum': 'C4893A'},
     {'EAN_UPCS': [],
      'IceId': '1414',
      'LastUpdate': '20220711134129',
      'OnMarket': '1',
      'PartNum': 'C6614NE'},
     {'Country_Market': 'US',
      'EAN_UPCS': ['4713435791172'],
      'IceId': '7966778',
      'LastUpdate': '20201106094740',
      'OnMarket': '1',
      'PartNum': 'AX3U1600XC2G79-3X'}]