Search code examples
python xml jupyter-notebook xml-parsing data-manipulation

Duplicating an XML element and adding it to a specific position in XML file using python


I have an XML file whose content looks like this:

xml_content_to_search =

<Document ProviderID="TD" DecimalMarker="comma" Website="https://erc-viewer.sap.com/">
<available_substances>
        <substance ID="0004" DD="14" MM="10" YYYY="2010">
            <SubName>0004</SubName>
            <url>./UN/0004.xml</url>
            <group>ADR0004_0101</group>
            <group>THP0004Y0101</group>
            <group>THC0004Y0101</group>
            <group>TRP0004Y0101</group>
            <group>TRC0004Y0101</group>
            <group>TIP0004Y0101</group>
            <group>TIC0004Y0101</group>
            <group>CTR0004Y0102</group>
            <group>CRP0004Y0102</group>
            <group>CRC0004Y0102</group>
            </substance>
        <substance ID="ADR0004_0101" DD="26" MM="10" YYYY="2022">
            <SubName>asa</SubName>
            <url>ADR/ADR0004_0101.xml</url>
        </substance>
        <substance ID="THP0004Y0101" DD="26" MM="10" YYYY="2020">
            <SubName>asd)</SubName>
            <url>THP/THP0004Y0101.xml</url>
        </substance>
        <substance ID="THC0004Y0101" DD="26" MM="10" YYYY="2020">
            <SubName>asd</SubName>
            <url>THC/THC0004Y0101.xml</url>
        </substance>
        <substance ID="TRP0004Y0101" DD="26" MM="10" YYYY="2020">
            <SubName>asd</SubName>
            <url>TRP/TRP0004Y0101.xml</url>
        </substance>
        <substance ID="TRC0004Y0101" DD="26" MM="10" YYYY="2020">
            <SubName>asd</SubName>
            <url>TRC/TRC0004Y0101.xml</url>
        </substance>
        </available_substances>
        </Document>

I want to search for a specific substance ID in the XML file, duplicate the matching element, and apply some modifications to the copy; I have been able to implement that part. After duplicating, however, I want to insert the duplicated element right below the substance element from which it was copied.

This is my code:

# Script overview: collect the base names of the XML files in a folder, look
# for <substance> elements whose ID matches one of those names, and for each
# match build two modified copies (IAC -> IUC and IAC -> IEC) that should be
# inserted directly after the matching element.
# NOTE: `os`, `ET`, `IAC_files_path`, `xml_file_names` and
# `xml_content_to_search` are not defined in this snippet; they are assumed to
# be imported/defined earlier.

# Use the os.listdir() method to list all files in the specified folder and filter for XML files
for filename in os.listdir(IAC_files_path):
    if filename.endswith(".xml"):
        # Remove the ".xml" extension before adding to the list
        xml_file_names.append(os.path.splitext(filename)[0])

# Parse the XML content to search for <substance> elements with matching IDs
tree = ET.ElementTree(ET.fromstring(xml_content_to_search))
# For the sample content shown, the root is the <Document> element.
root = tree.getroot()

# Initialize a flag to check if at least one match is found
match_found = False

# Create a list to store duplicated <substance> elements
# (only referenced by the commented-out code further down)
duplicated_substance_elements = []

# Iterate through the <substance> elements and search for matching IDs
# (".//substance" matches <substance> descendants at any depth below root;
# findall() returns a list, so inserting into the tree inside the loop does
# not disturb the iteration)
for substance_element in root.findall(".//substance"):
    substance_id = substance_element.get("ID")
    print(f"Processing substance_id: {substance_id}")
    # Check if the ID without the extension is in the list
    base_substance_id = os.path.splitext(substance_id)[0]
    if base_substance_id in xml_file_names:
        # Print the XML file name found in the <substance> element's ID attribute
        print(f"Found XML file name '{substance_id}' in the other XML file.")
        match_found = True

        # Create a new <substance> element with modified attributes for IUC
        duplicate_substance_element_iuc = ET.Element("substance")
        duplicate_substance_element_iuc.set("ID", base_substance_id.replace("IAC", "IUC"))
        duplicate_substance_element_iuc.set("DD", substance_element.get("DD"))
        duplicate_substance_element_iuc.set("MM", substance_element.get("MM"))
        duplicate_substance_element_iuc.set("YYYY", substance_element.get("YYYY"))

        # Duplicate and modify the <SubName> element for IUC
        subname_element = substance_element.find("SubName")
        duplicate_subname_element_iuc = ET.Element("SubName")
        duplicate_subname_element_iuc.text = subname_element.text.replace("IAC", "IUC")
        duplicate_substance_element_iuc.append(duplicate_subname_element_iuc)

        # Duplicate and modify the <url> element for IUC
        url_element = substance_element.find("url")
        duplicate_url_element_iuc = ET.Element("url")
        duplicate_url_element_iuc.text = url_element.text.replace("IAC", "IUC")
        duplicate_substance_element_iuc.append(duplicate_url_element_iuc)

        # Insert the duplicated IUC <substance> element immediately after the original IAC element
        # NOTE(review): list(root) holds only the direct children of root. In
        # the sample XML the <substance> elements are children of
        # <available_substances>, not of <Document>, so .index() raises
        # ValueError ("... is not in list") here. The lookup and the insert
        # would have to go through the element's actual parent.
        substance_element_index = list(root).index(substance_element)
        root.insert(substance_element_index + 1, duplicate_substance_element_iuc)

        # Create a new <substance> element with modified attributes for IEC
        duplicate_substance_element_iec = ET.Element("substance")
        duplicate_substance_element_iec.set("ID", base_substance_id.replace("IAC", "IEC"))
        duplicate_substance_element_iec.set("DD", substance_element.get("DD"))
        duplicate_substance_element_iec.set("MM", substance_element.get("MM"))
        duplicate_substance_element_iec.set("YYYY", substance_element.get("YYYY"))

        # Duplicate and modify the <SubName> element for IEC
        duplicate_subname_element_iec = ET.Element("SubName")
        duplicate_subname_element_iec.text = subname_element.text.replace("IAC", "IEC")
        duplicate_substance_element_iec.append(duplicate_subname_element_iec)

        # Duplicate and modify the <url> element for IEC
        duplicate_url_element_iec = ET.Element("url")
        duplicate_url_element_iec.text = url_element.text.replace("IAC", "IEC")
        duplicate_substance_element_iec.append(duplicate_url_element_iec)
        
        # Insert the duplicated IEC <substance> element after the IUC duplicate
        # (original index + 2, because the IUC copy now sits at index + 1);
        # the same direct-child lookup as above is used here
        substance_element_index = list(root).index(substance_element)
        root.insert(substance_element_index + 2, duplicate_substance_element_iec)

        # Append the duplicated IEC <substance> element to the list
        #duplicated_substance_elements.append(duplicate_substance_element_iec)

# Check if no matches were found and print "Not found" message
if not match_found:
    print("No XML file names were found in the other XML file.")
    
# # Append the duplicated IEC <substance> elements to the end
# for duplicate_element in duplicated_substance_elements:
#     root.append(duplicate_element)

# Print the modified XML content
modified_xml_content = ET.tostring(root, encoding="unicode")
print(modified_xml_content)

I am getting this error:

<Element 'substance' at 0x000002BF2DFE8720> is not in list

at this line of code

substance_element_index = list(root).index(substance_element)

My desired output is something like this:

<Document ProviderID="TD" DecimalMarker="comma" Website="https://erc-viewer.sap.com/">
<available_substances>
        <substance ID="0004" DD="14" MM="10" YYYY="2010">
            <SubName>0004</SubName>
            <url>./UN/0004.xml</url>
            <group>ADR0004_0101</group>
            <group>THP0004Y0101</group>
            <group>THC0004Y0101</group>
            <group>TRP0004Y0101</group>
            <group>TRC0004Y0101</group>
            <group>TIP0004Y0101</group>
            <group>TIC0004Y0101</group>
            <group>CTR0004Y0102</group>
            <group>CRP0004Y0102</group>
            <group>CRC0004Y0102</group>
            </substance>
        <substance ID="ADR0004_0101" DD="26" MM="10" YYYY="2022">
            <SubName>asa</SubName>
            <url>ADR/ADR0004_0101.xml</url>
        </substance>
        <substance ID="THP0004Y0101" DD="26" MM="10" YYYY="2020">
            <SubName>asd)</SubName>
            <url>THP/THP0004Y0101.xml</url>
        </substance>
        <substance ID="THC0004Y0101" DD="26" MM="10" YYYY="2020">
            <SubName>asd</SubName>
            <url>THC/THC0004Y0101.xml</url>
        </substance>
        <substance ID="TRP0004Y0101" DD="26" MM="10" YYYY="2020">
            <SubName>asd</SubName>
            <url>TRP/TRP0004Y0101.xml</url>
        </substance>
        <substance ID="TRC0004Y0101" DD="26" MM="10" YYYY="2020">
            <SubName>asd</SubName>
            <url>TRC/TRC0004Y0101.xml</url>
        </substance>
        **<substance ID="IEC0004Y0101" DD="26" MM="10" YYYY="2020">
            <SubName>asd</SubName>
            <url>TRC/TRC0004Y0101.xml</url>
        </substance>**
        </available_substances>
        </Document>

I have an XML file whose content looks like this:

xml_content_to_search =

<Document ProviderID="TD" DecimalMarker="comma" Website="https://erc-viewer.sap.com/">
<available_substances>
        <substance ID="0004" DD="14" MM="10" YYYY="2010">
            <SubName>0004</SubName>
            <url>./UN/0004.xml</url>
            <group>ADR0004_0101</group>
            <group>THP0004Y0101</group>
            <group>THC0004Y0101</group>
            <group>TRP0004Y0101</group>
            <group>TRC0004Y0101</group>
            <group>TIP0004Y0101</group>
            <group>TIC0004Y0101</group>
            <group>CTR0004Y0102</group>
            <group>CRP0004Y0102</group>
            <group>CRC0004Y0102</group>
            </substance>
        <substance ID="ADR0004_0101" DD="26" MM="10" YYYY="2022">
            <SubName>asa</SubName>
            <url>ADR/ADR0004_0101.xml</url>
        </substance>
        <substance ID="THP0004Y0101" DD="26" MM="10" YYYY="2020">
            <SubName>asd)</SubName>
            <url>THP/THP0004Y0101.xml</url>
        </substance>
        <substance ID="THC0004Y0101" DD="26" MM="10" YYYY="2020">
            <SubName>asd</SubName>
            <url>THC/THC0004Y0101.xml</url>
        </substance>
        <substance ID="TRP0004Y0101" DD="26" MM="10" YYYY="2020">
            <SubName>asd</SubName>
            <url>TRP/TRP0004Y0101.xml</url>
        </substance>
        <substance ID="TRC0004Y0101" DD="26" MM="10" YYYY="2020">
            <SubName>asd</SubName>
            <url>TRC/TRC0004Y0101.xml</url>
        </substance>
        </available_substances>
        </Document>

I want to search for a specific substance ID in the XML file, duplicate the matching element, and apply some modifications to the copy; I have been able to implement that part. After duplicating, however, I want to insert the duplicated element right below the substance element from which it was copied.

This is my code:

# Script overview: collect the base names of the XML files in a folder, look
# for <substance> elements whose ID matches one of those names, and for each
# match build two modified copies (IAC -> IUC and IAC -> IEC) that should be
# inserted directly after the matching element.
# NOTE: `os`, `ET`, `IAC_files_path`, `xml_file_names` and
# `xml_content_to_search` are not defined in this snippet; they are assumed to
# be imported/defined earlier.

# Use the os.listdir() method to list all files in the specified folder and filter for XML files
for filename in os.listdir(IAC_files_path):
    if filename.endswith(".xml"):
        # Remove the ".xml" extension before adding to the list
        xml_file_names.append(os.path.splitext(filename)[0])

# Parse the XML content to search for <substance> elements with matching IDs
tree = ET.ElementTree(ET.fromstring(xml_content_to_search))
# For the sample content shown, the root is the <Document> element.
root = tree.getroot()

# Initialize a flag to check if at least one match is found
match_found = False

# Create a list to store duplicated <substance> elements
# (only referenced by the commented-out code further down)
duplicated_substance_elements = []

# Iterate through the <substance> elements and search for matching IDs
# (".//substance" matches <substance> descendants at any depth below root;
# findall() returns a list, so inserting into the tree inside the loop does
# not disturb the iteration)
for substance_element in root.findall(".//substance"):
    substance_id = substance_element.get("ID")
    print(f"Processing substance_id: {substance_id}")
    # Check if the ID without the extension is in the list
    base_substance_id = os.path.splitext(substance_id)[0]
    if base_substance_id in xml_file_names:
        # Print the XML file name found in the <substance> element's ID attribute
        print(f"Found XML file name '{substance_id}' in the other XML file.")
        match_found = True

        # Create a new <substance> element with modified attributes for IUC
        duplicate_substance_element_iuc = ET.Element("substance")
        duplicate_substance_element_iuc.set("ID", base_substance_id.replace("IAC", "IUC"))
        duplicate_substance_element_iuc.set("DD", substance_element.get("DD"))
        duplicate_substance_element_iuc.set("MM", substance_element.get("MM"))
        duplicate_substance_element_iuc.set("YYYY", substance_element.get("YYYY"))

        # Duplicate and modify the <SubName> element for IUC
        subname_element = substance_element.find("SubName")
        duplicate_subname_element_iuc = ET.Element("SubName")
        duplicate_subname_element_iuc.text = subname_element.text.replace("IAC", "IUC")
        duplicate_substance_element_iuc.append(duplicate_subname_element_iuc)

        # Duplicate and modify the <url> element for IUC
        url_element = substance_element.find("url")
        duplicate_url_element_iuc = ET.Element("url")
        duplicate_url_element_iuc.text = url_element.text.replace("IAC", "IUC")
        duplicate_substance_element_iuc.append(duplicate_url_element_iuc)

        # Insert the duplicated IUC <substance> element immediately after the original IAC element
        # NOTE(review): list(root) holds only the direct children of root. In
        # the sample XML the <substance> elements are children of
        # <available_substances>, not of <Document>, so .index() raises
        # ValueError ("... is not in list") here. The lookup and the insert
        # would have to go through the element's actual parent.
        substance_element_index = list(root).index(substance_element)
        root.insert(substance_element_index + 1, duplicate_substance_element_iuc)

        # Create a new <substance> element with modified attributes for IEC
        duplicate_substance_element_iec = ET.Element("substance")
        duplicate_substance_element_iec.set("ID", base_substance_id.replace("IAC", "IEC"))
        duplicate_substance_element_iec.set("DD", substance_element.get("DD"))
        duplicate_substance_element_iec.set("MM", substance_element.get("MM"))
        duplicate_substance_element_iec.set("YYYY", substance_element.get("YYYY"))

        # Duplicate and modify the <SubName> element for IEC
        duplicate_subname_element_iec = ET.Element("SubName")
        duplicate_subname_element_iec.text = subname_element.text.replace("IAC", "IEC")
        duplicate_substance_element_iec.append(duplicate_subname_element_iec)

        # Duplicate and modify the <url> element for IEC
        duplicate_url_element_iec = ET.Element("url")
        duplicate_url_element_iec.text = url_element.text.replace("IAC", "IEC")
        duplicate_substance_element_iec.append(duplicate_url_element_iec)
        
        # Insert the duplicated IEC <substance> element after the IUC duplicate
        # (original index + 2, because the IUC copy now sits at index + 1);
        # the same direct-child lookup as above is used here
        substance_element_index = list(root).index(substance_element)
        root.insert(substance_element_index + 2, duplicate_substance_element_iec)

        # Append the duplicated IEC <substance> element to the list
        #duplicated_substance_elements.append(duplicate_substance_element_iec)

# Check if no matches were found and print "Not found" message
if not match_found:
    print("No XML file names were found in the other XML file.")
    
# # Append the duplicated IEC <substance> elements to the end
# for duplicate_element in duplicated_substance_elements:
#     root.append(duplicate_element)

# Print the modified XML content
modified_xml_content = ET.tostring(root, encoding="unicode")
print(modified_xml_content)

I am getting this error:

<Element 'substance' at 0x000002BF2DFE8720> is not in list

at this line of code

substance_element_index = list(root).index(substance_element)

My desired output is something like this:

<Document ProviderID="TD" DecimalMarker="comma" Website="https://erc-viewer.sap.com/">
<available_substances>
        <substance ID="0004" DD="14" MM="10" YYYY="2010">
            <SubName>0004</SubName>
            <url>./UN/0004.xml</url>
            <group>ADR0004_0101</group>
            <group>THP0004Y0101</group>
            <group>THC0004Y0101</group>
            <group>TRP0004Y0101</group>
            <group>TRC0004Y0101</group>
            <group>TIP0004Y0101</group>
            <group>TIC0004Y0101</group>
            <group>CTR0004Y0102</group>
            <group>CRP0004Y0102</group>
            <group>CRC0004Y0102</group>
            </substance>
        <substance ID="ADR0004_0101" DD="26" MM="10" YYYY="2022">
            <SubName>asa</SubName>
            <url>ADR/ADR0004_0101.xml</url>
        </substance>
        <substance ID="THP0004Y0101" DD="26" MM="10" YYYY="2020">
            <SubName>asd)</SubName>
            <url>THP/THP0004Y0101.xml</url>
        </substance>
        <substance ID="THC0004Y0101" DD="26" MM="10" YYYY="2020">
            <SubName>asd</SubName>
            <url>THC/THC0004Y0101.xml</url>
        </substance>
        <substance ID="TRP0004Y0101" DD="26" MM="10" YYYY="2020">
            <SubName>asd</SubName>
            <url>TRP/TRP0004Y0101.xml</url>
        </substance>
        <substance ID="TRC0004Y0101" DD="26" MM="10" YYYY="2020">
            <SubName>asd</SubName>
            <url>TRC/TRC0004Y0101.xml</url>
        </substance>
        **<substance ID="IEC0004Y0101" DD="26" MM="10" YYYY="2020">
            <SubName>asd</SubName>
            <url>TRC/TRC0004Y0101.xml</url>
        </substance>**
        </available_substances>
        </Document>

Solution

  • You can copy an element, change its content, and insert it directly after the original:

    import xml.etree.ElementTree as ET
    from copy import deepcopy

    # Example: duplicate one <substance> element, modify the copy, and insert
    # it directly after the element it was copied from.
    tree = ET.parse('substance.xml')
    root = tree.getroot()

    sub = root.findall('.//substance')
    print(len(sub))

    # ElementTree elements keep no reference to their parent, so build a
    # child -> parent lookup once. An insert must be done on the actual parent
    # of the element; searching list(root) only sees root's direct children.
    parent_of = {child: parent for parent in root.iter() for child in parent}

    # Element to duplicate: the fourth <substance> found in document order.
    source = sub[3]
    parent = parent_of[source]

    # deepcopy clones the element together with its children, text and tail,
    # so changing the copy leaves the original untouched.
    co = deepcopy(source)
    co.set('ID', 'THC0004Y0101_insert')
    co.set('DD', '27')
    co.set('MM', '11')
    co.set('YYYY', '1998')
    for elem in co.iter('SubName'):
        elem.text = 'iso'
    for elem in co.iter('url'):
        elem.text = 'ISO/ADR0004_010x.xml'

    # Compute the position from the source element instead of hard-coding it,
    # so the copy always lands right below the element it was duplicated from.
    parent.insert(list(parent).index(source) + 1, co)

    ET.dump(root)
    

    Output:

    
    <Document ProviderID="TD" DecimalMarker="comma" Website="https://erc-viewer.sap.com/">
    <available_substances>
            <substance ID="0004" DD="14" MM="10" YYYY="2010">
                <SubName>0004</SubName>
                <url>./UN/0004.xml</url>
                <group>ADR0004_0101</group>
                <group>THP0004Y0101</group>
                <group>THC0004Y0101</group>
                <group>TRP0004Y0101</group>
                <group>TRC0004Y0101</group>
                <group>TIP0004Y0101</group>
                <group>TIC0004Y0101</group>
                <group>CTR0004Y0102</group>
                <group>CRP0004Y0102</group>
                <group>CRC0004Y0102</group>
                </substance>
            <substance ID="ADR0004_0101" DD="26" MM="10" YYYY="2022">
                <SubName>asa</SubName>
                <url>ADR/ADR0004_0101.xml</url>
            </substance>
            <substance ID="THP0004Y0101" DD="26" MM="10" YYYY="2020">
                <SubName>asd)</SubName>
                <url>THP/THP0004Y0101.xml</url>
            </substance>
            <substance ID="THC0004Y0101" DD="26" MM="10" YYYY="2020">
                <SubName>asd</SubName>
                <url>THC/THC0004Y0101.xml</url>
            </substance>
            <substance ID="THC0004Y0101_insert" DD="27" MM="11" YYYY="1998">
                <SubName>iso</SubName>
                <url>ISO/ADR0004_010x.xml</url>
            </substance>
            <substance ID="TRP0004Y0101" DD="26" MM="10" YYYY="2020">
                <SubName>asd</SubName>
                <url>TRP/TRP0004Y0101.xml</url>
            </substance>
            <substance ID="TRC0004Y0101" DD="26" MM="10" YYYY="2020">
                <SubName>asd</SubName>
                <url>TRC/TRC0004Y0101.xml</url>
            </substance>
            </available_substances>
            </Document>