Search code examples
pythonpandasxmldataframe

XML to DATAFRAME and back to XML in python


I would to convert XML file into pandas dataframe.

Afterwards I would like to convert the dataframe back to exactly the same XML.

[First step]: save the XLM file from string:

    xml = '''<system>
    <station id="1" type="DeviceName Functional Test">
      <stationId>Functional_Test</stationId>
    </station>
    <software>
      <component version="1.0.0.0" date="01/01/1889">Functional_Test.exe</component>
    </software>
</system>'''
element = ET.XML(xml)
ET.indent(element)
pretty_xml = ET.tostring(element, encoding='unicode')
    
xml_file_name = r"save_xml.xml"
xml_file=open(xml_file_name,"w")
xml_file.write(str(pretty_xml))
xml_file.close()

[Second step]: read the xml and save it as csv using pandas:

import pandas as pd
df = pd.read_xml(xml_file_name)
df.to_csv("xml_to_csv.csv")

enter image description here

[Third step]: convert dataframe back to XML:

back_to_xml = df.to_xml(index = False, root_name = "system", xml_declaration = False)
remove_rows = back_to_xml.replace("<row>", "").replace("</row>", "")
print(remove_rows)

Unfortunately the output XML is not the same as the input XML.

<system>
  
    <id>1.0</id>
    <type>DeviceName Functional Test</type>
    <stationId>Functional_Test</stationId>
    <component/>
  
  
    <id/>
    <type/>
    <stationId/>
    <component>Functional_Test.exe</component>
  
</system>

I would much apricate any support and guidance.


Solution

  • [Second step]: Save XML as DataFrame Using James code in How to convert an XML string to a dictionary? XML can be converted to well structured df.

    In my case:

    import xml.etree.ElementTree as ET
    
    class XmlListConfig(list):
        def __init__(self, aList):
            for element in aList:
                if element:
                    if len(element) == 1 or element[0].tag != element[1].tag:
                        self.append(XmlDictConfig(element))
                    elif element[0].tag == element[1].tag:
                        self.append(XmlListConfig(element))
                elif element.text:
                    text = element.text.strip()
                    if text:
                        self.append(text)
    
    
    class XmlDictConfig(dict):
        def __init__(self, parent_element):
            if parent_element.items():
                self.update(dict(parent_element.items()))
            for element in parent_element:
                if element:
                    if len(element) == 1 or element[0].tag != element[1].tag:
                        aDict = XmlDictConfig(element)
                    else:
                        aDict = {element[0].tag: XmlListConfig(element)}
                    if element.items():
                        aDict.update(dict(element.items()))
                        
                    self.update({element.tag: aDict})
                elif element.items():
                    self.update({element.tag: dict(element.items())})
                else:
                    self.update({element.tag: element.text})
    
    path = r"save_xml.xml"
    tree = ET.parse(path)
    root = tree.getroot()
    xml_dict = XmlDictConfig(root)
    
    df = pd.DataFrame.from_dict(xml_dict)
    csv_path = rf"{path[:-4]}.csv"
    df.to_csv(csv_path)
    

    [Third step]: using ElementTree structure convert dataframe back to XML:

    def parse_df_to_xml(csv_path):
        df = pd.read_csv(csv_path, index_col=0)
        root = ET.Element("system")
        for column in df.columns:
            row  = df[column]
            if column == "station":
                station = ET.SubElement(root, str(column), id = row.id, type = row.type)
                ET.SubElement(station, "stationId").text = row.stationId
            elif column == "software":
                software = ET.SubElement(root, str(column))
                component_dict = eval(row.component)
                ET.SubElement(software, "component", version = component_dict["version"], date = component_dict["date"]).text = "Functional_Test.exe"
        return root
    
    root = parse_df_to_xml(csv_path)
    
    tree = ET.ElementTree(root)
    ET.indent(tree, space="\t", level=0)
    tree.write("df_to_xml.xml")