Search code examples
pythonxmltagsindentation

How to indent a specific line of an xml with Python?


I have 2 xml files.

The first one looks like this :

<?xml version="1.0" encoding="UTF-8"?>
<gmd:MD_Metadata xmlns:gmd="http://www.isotc211.org/2005/gmd" xmlns:gts="http://www.isotc211.org/2005/gts" xmlns:gmx="http://www.isotc211.org/2005/gmx" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xmlns:gml="http://www.opengis.net/gml" xmlns:gfc="http://www.isotc211.org/2005/gfc" xmlns:gco="http://www.isotc211.org/2005/gco" xmlns:xlink="http://www.w3.org/1999/xlink" xmlns:geonet="http://www.fao.org/geonetwork" xsi:schemaLocation="http://www.isotc211.org/2005/gmd http://www.isotc211.org/2005/gmd/gmd.xsd">
  <gmd:identificationInfo>
    <gmd:MD_DataIdentification>
      <gmd:citation>
        <gmd:CI_Citation>
          <gmd:identifier>
            <gmd:MD_Identifier>
              <gmd:code>
                <gco:CharacterString>blabla</gco:CharacterString>
              </gmd:code>
            </gmd:MD_Identifier>
          </gmd:identifier>
          <gmd:date>
            <gmd:CI_Date>
              <gmd:date>
                <gco:Date>date</gco:Date>
              </gmd:date>
              <gmd:dateType>
                <gmd:CI_DateTypeCode codeList="http://standards.iso.org/ittf/PubliclyAvailableStandards/ISO_19139_Schemas/resources/Codelist/ML_gmxCodelists.xml#CI_DateTypeCode" codeListValue="creation">
                                                blabla
                                            </gmd:CI_DateTypeCode>
              </gmd:dateType>
            </gmd:CI_Date>
          </gmd:date>
          <gmd:edition>
            <gco:CharacterString>6</gco:CharacterString>
          </gmd:edition>
          <gmd:editionDate>
            <gco:DateTime>date</gco:DateTime>
          </gmd:editionDate>
          <gmd:title>
            <gco:CharacterString>blabla</gco:CharacterString>
          </gmd:title>
        </gmd:CI_Citation>
      </gmd:citation>
      <gmd:abstract>
        <gco:CharacterString>blabla</gco:CharacterString>
      </gmd:abstract>
      <gmd:pointOfContact>
        <gmd:CI_ResponsibleParty>
          <gmd:organisationName>
            <gco:CharacterString>blabla</gco:CharacterString>
          </gmd:organisationName>
          <gmd:contactInfo>
            <gmd:CI_Contact>
              <gmd:address>
                <gmd:CI_Address>
                  <gmd:electronicMailAddress>
                    <gco:CharacterString>blabla</gco:CharacterString>
                  </gmd:electronicMailAddress>
                  <gmd:country>
                    <gco:CharacterString/>
                  </gmd:country>
                </gmd:CI_Address>
              </gmd:address>
            </gmd:CI_Contact>
          </gmd:contactInfo>
        </gmd:CI_ResponsibleParty>
      </gmd:pointOfContact>
      <gmd:descriptiveKeywords>
        <gmd:MD_Keywords>
          <gmd:keyword>
            <gco:CharacterString>blabla</gco:CharacterString>
          </gmd:keyword>
          <gmd:keyword>
            <gco:CharacterString>blabla</gco:CharacterString>
          </gmd:keyword>
          <gmd:keyword>
            <gco:CharacterString>blabla</gco:CharacterString>
          </gmd:keyword>
          <gmd:keyword>
            <gco:CharacterString>blabla</gco:CharacterString>
          </gmd:keyword>
          <gmd:keyword>
            <gco:CharacterString>blabla</gco:CharacterString>
          </gmd:keyword>
        </gmd:MD_Keywords>
      </gmd:descriptiveKeywords>
      <gmd:topicCategory>
        <gmd:MD_TopicCategoryCode>blabla</gmd:MD_TopicCategoryCode>
      </gmd:topicCategory>
      <gmd:extent>
        <gmd:EX_Extent>
          <gmd:description>
            <gco:CharacterString>blabla</gco:CharacterString>
          </gmd:description>
          <gmd:geographicElement>
            <gmd:EX_GeographicBoundingBox>
              <gmd:westBoundLongitude>
                <gco:Decimal>0</gco:Decimal>
              </gmd:westBoundLongitude>
              <gmd:eastBoundLongitude>
                <gco:Decimal>0</gco:Decimal>
              </gmd:eastBoundLongitude>
              <gmd:southBoundLatitude>
                <gco:Decimal>0</gco:Decimal>
              </gmd:southBoundLatitude>
              <gmd:northBoundLatitude>
                <gco:Decimal>0</gco:Decimal>
              </gmd:northBoundLatitude>
            </gmd:EX_GeographicBoundingBox>
          </gmd:geographicElement>
        </gmd:EX_Extent>
      </gmd:extent>
      <gmd:spatialResolution>
        <gmd:MD_Resolution>
          <gmd:distance>
            <gco:Distance uom="-"/>
          </gmd:distance>
          <gmd:equivalentScale>
            <gmd:MD_RepresentativeFraction>
              <gmd:denominator>
                <gco:Integer>blabla</gco:Integer>
              </gmd:denominator>
            </gmd:MD_RepresentativeFraction>
          </gmd:equivalentScale>
        </gmd:MD_Resolution>
      </gmd:spatialResolution>
      <gmd:resourceSpecificUsage>
        <gmd:MD_Usage>
          <gmd:specificUsage>
            <gco:CharacterString/>
          </gmd:specificUsage>
        </gmd:MD_Usage>
      </gmd:resourceSpecificUsage>
      <gmd:resourceConstraints>
        <gmd:MD_SecurityConstraints>
          <gmd:useLimitation>
            <gco:CharacterString>blabla</gco:CharacterString>
          </gmd:useLimitation>
          <gmd:classification>
            <gmd:MD_ClassificationCode codeListValue="restricted_FR">blabla</gmd:MD_ClassificationCode>
          </gmd:classification>
        </gmd:MD_SecurityConstraints>
        <gmd:MD_Constraints>
          <gmd:useLimitation>
            <gco:CharacterString/>
          </gmd:useLimitation>
        </gmd:MD_Constraints>
      </gmd:resourceConstraints>
      <gmd:resourceConstraints>
        <gmd:MD_LegalConstraints>
          <gmd:useLimitation>
            <gco:CharacterString/>
          </gmd:useLimitation>
          <gmd:accessConstraints>
            <gmd:MD_RestrictionCode codeListValue="restricted" codeList="http://standards.iso.org/ittf/PubliclyAvailableStandards/ISO_19139_Schemas/resources/codelist/ML_gmxCodelists.xml#MD_RestrictionCode"/>
          </gmd:accessConstraints>
          <gmd:useConstraints>
            <gmd:MD_RestrictionCode codeList="http://standards.iso.org/ittf/PubliclyAvailableStandards/ISO_19139_Schemas/resources/codelist/ML_gmxCodelists.xml#MD_RestrictionCode" codeListValue="restricted"/>
          </gmd:useConstraints>
          <gmd:otherConstraints>
            <gco:CharacterString>
                                        blabla
                                    </gco:CharacterString>
          </gmd:otherConstraints>
        </gmd:MD_LegalConstraints>
      </gmd:resourceConstraints>
      <gmd:resourceSpecificUsage>
        <gmd:MD_Usage>
          <gmd:specificUsage>
            <gco:CharacterString/>
          </gmd:specificUsage>
        </gmd:MD_Usage>
      </gmd:resourceSpecificUsage>
    </gmd:MD_DataIdentification>
  </gmd:identificationInfo>
  <gmd:referenceSystemInfo>
    <gmd:MD_ReferenceSystem>
      <gmd:referenceSystemIdentifier>
        <gmd:RS_Identifier>
          <gmd:code>
            <gco:CharacterString>blabla</gco:CharacterString>
          </gmd:code>
        </gmd:RS_Identifier>
      </gmd:referenceSystemIdentifier>
    </gmd:MD_ReferenceSystem>
  </gmd:referenceSystemInfo>
  <gmd:distributionInfo>
    <gmd:MD_Distribution>
      <gmd:distributionFormat>
        <gmd:MD_Format>
          <gmd:name>
            <gco:CharacterString>blabla</gco:CharacterString>
          </gmd:name>
          <gmd:version>
            <gco:CharacterString>blabla</gco:CharacterString>
          </gmd:version>
        </gmd:MD_Format>
      </gmd:distributionFormat>
    </gmd:MD_Distribution>
  </gmd:distributionInfo>
  <gmd:contact>
    <gmd:CI_ResponsibleParty>
      <gmd:organisationName>
        <gco:CharacterString>blabla</gco:CharacterString>
      </gmd:organisationName>
    </gmd:CI_ResponsibleParty>
  </gmd:contact>
  <isAvailableAsPaper>blabla</isAvailableAsPaper>
  <productKind>blabla</productKind>
  <visibility>blabla</visibility>
</gmd:MD_Metadata>

The second one looks like this :

<?xml version="1.0" encoding="UTF-8"?>
<metadataDefault>
  <visibility>blabla</visibility>
  <domain>L</domain>
  <catalogueEntitySubType>blabla</catalogueEntitySubType>
  <dataType1>01_blabla</dataType1>
  <dataType2>01-01_blabla</dataType2>
  <dataType3>01-01-1_blabla</dataType3>
  <isPermanent>true</isPermanent>
  <pointOfContact>blabla</pointOfContact>
  <author>blabla</author>
  <authorCountry>blabla</authorCountry>
  <rawMetaDataProfile>blabla</rawMetaDataProfile>
</metadataDefault>

I would like to add the tags from the second xml to the end of the first one without having to download additional libraries.
I've managed to do it with xml.etree.ElementTree but the first tag added doesn't have the right indentation.

My Python code is as follows :

import xml.etree.ElementTree as et

# Saves the namespaces (xmlns) of the first XML for re-application to the output
et.register_namespace('gmd', "http://www.isotc211.org/2005/gmd")
et.register_namespace('gts', "http://www.isotc211.org/2005/gts")
et.register_namespace('gmx', "http://www.isotc211.org/2005/gmx")
et.register_namespace('xsi', "http://www.w3.org/2001/XMLSchema-instance")
et.register_namespace('gml', "http://www.opengis.net/gml")
et.register_namespace('gfc', "http://www.isotc211.org/2005/gfc")
et.register_namespace('gco', "http://www.isotc211.org/2005/gco")
et.register_namespace('xlink', "http://www.w3.org/1999/xlink")
et.register_namespace('geonet', "http://www.fao.org/geonetwork")


# Analyse the XML file
xml_produit = et.parse('first_xml.xml')
xml_general = et.parse('second_xml.xml')
 
# Obtain the root element of the first XML
root_produit = xml_produit.getroot()
print("La balise racine est :", root_produit)

# Force display of unused namespaces not displayed with et.register_namespace in the root tag
root_produit.set("xmlns:gts", "http://www.isotc211.org/2005/gts")
root_produit.set("xmlns:gmx", "http://www.isotc211.org/2005/gmx")
root_produit.set("xmlns:gml", "http://www.opengis.net/gml")
root_produit.set("xmlns:gfc", "http://www.isotc211.org/2005/gfc")
root_produit.set("xmlns:xlink", "http://www.w3.org/1999/xlink")
root_produit.set("xmlns:geonet", "http://www.fao.org/geonetwork")

# Obtain the root element of the second XML
root_general = xml_general.getroot()
print("La balise racine est :", root_general)

# Get the child tags of the second XML from the 2nd to the last one
sub_elements=root_general[1:]
print("Les balises enfant de la racine sont :", sub_elements)

for elements in sub_elements: 
    root_produit.append(elements) # Copies the child tags into the first XML
    #et.indent(elements[0], '\t', level=1) # An indentation test with et.indent (not working)
    xml_produit.write("infodump.xml", encoding='utf-8', xml_declaration=True, method='xml') # Write a new XML that combines the two

#EDIT
tree = ET.parse('infodump.xml')
ET.indent(tree, space="  ")
tree.write("infodump2.xml", encoding='utf-8', xml_declaration=True, method='xml')

What I get :

  <gmd:contact>
    <gmd:CI_ResponsibleParty>
      <gmd:organisationName>
        <gco:CharacterString>blabla</gco:CharacterString>
      </gmd:organisationName>
    </gmd:CI_ResponsibleParty>
  </gmd:contact>
  <isAvailableAsPaper>blabla</isAvailableAsPaper>
  <productKind>blabla</productKind>
  <visibility>blabla</visibility>
<domain>L</domain>
  <catalogueEntitySubType>Blabla</catalogueEntitySubType>
  <dataType1><dataType1>01_blabla</dataType1>
  <dataType2>01-01_blabla</dataType2>
  <dataType3>01-01-1_blabla</dataType3>
  <isPermanent>true</isPermanent>
  <pointOfContact>blabla</pointOfContact>
  <author>blabla</author>
  <authorCountry>blabla</authorCountry>
  <rawMetaDataProfile>blabla</rawMetaDataProfile>
</gmd:MD_Metadata>

What I would like to achieve :

  <gmd:contact>
    <gmd:CI_ResponsibleParty>
      <gmd:organisationName>
        <gco:CharacterString>blabla</gco:CharacterString>
      </gmd:organisationName>
    </gmd:CI_ResponsibleParty>
  </gmd:contact>
  <isAvailableAsPaper>blabla</isAvailableAsPaper>
  <productKind>blabla</productKind>
  <visibility>blabla</visibility>
  <domain>L</domain>
  <catalogueEntitySubType>Blabla</catalogueEntitySubType>
  <dataType1><dataType1>01_blabla</dataType1>
  <dataType2>01-01_blabla</dataType2>
  <dataType3>01-01-1_blabla</dataType3>
  <isPermanent>true</isPermanent>
  <pointOfContact>blabla</pointOfContact>
  <author>blabla</author>
  <authorCountry>blabla</authorCountry>
  <rawMetaDataProfile>blabla</rawMetaDataProfile>
</gmd:MD_Metadata>

I searched on the internet and I've tried things with et.indent but I can't get what I want.
Any help would be appreciated.


Solution

  • I use always capital ET for xml.etree.ElementTree. ET.indent() function should work (et I use for lxml only, but it should work equal). In your case, try:

    tree = et.ElementTree(xml_produit)
    et.indent(tree, space="  ")
    tree.write("infodump.xml", encoding='utf-8', xml_declaration=True, method='xml')