Search code examples
pythonexcelxml

Need help understanding how to extract multiple instances of the same type from an xml file to export to Excel


I have been given a python file that extracts relevant data from a xml file to be exported to an Excel file. I have it working for the most part. The xml file is one set of data to be extracted into one row in Excel.

The issue I do not know how to resolve is how to extract (potentially) multiple values of the same type. what I wrote only extract one instance.

Here is an example of what I have in the python code:

 elif child.tag.endswith("titleInfo"):
                if child.attrib.get("type") == "alternative":
                    alternative_title = child.find("mods:title", namespaces).text
                    record_data["Alternative Title"] = alternative_title

Here is the xml relevant file info (other xml files that I need to process may have 0, 1, or more alternative titles):

 <mods:titleInfo type="alternative">
  <mods:title>Ziemer diary</mods:title>
 </mods:titleInfo>
 <mods:titleInfo type="alternative">
  <mods:title>F.G. Ziemer Tagebuch</mods:title>
 </mods:titleInfo>

**When I run it only the second instance is recorded **

There are several places in these xml files that will need the same solution.

Thanks for the help!

Full python code:

# MODS/METS XML to XLSX Crosswalk

import xml.etree.ElementTree as ET
import pandas as pd


def parse_xml(xml_file):
    # Create an ElementTree object and parse the XML file
    tree = ET.parse(xml_file)
    root = tree.getroot()

    # Define namespaces
    namespaces = {
        "mods": "http://www.loc.gov/mods/v3",
        "mets": "http://www.loc.gov/METS/"
    }

    # Extract the necessary data from the XML structure
    data = []
    for dmdSec in root.iterfind(".//mets:dmdSec", namespaces):
        record_data = {}
        mods = dmdSec.find(".//mods:mods", namespaces)

        title = mods.find(".//mods:title", namespaces)
        if title is not None:
            record_data["Title"] = title.text


        for child in mods:
            if child.tag.endswith("title"):
                record_data["Title"] = child.text
            elif child.tag.endswith("abstract"):
                record_data["Abstract"] = child.text
            elif child.tag.endswith("identifier"):
                record_data["Identifier"] = child.text
            elif child.tag.endswith("location"):
                file_link = child.find("mods:url", namespaces).text
                record_data["Path"] = file_link
            elif child.tag.endswith("typeOfResource"):
                record_data["Type of Resource"] = child.text
            elif child.tag.endswith("genre"):
                record_data["Genre"] = child.text
            elif child.tag.endswith("subject"):
                if child.find("mods:hierarchicalGeographic", namespaces) is not None:
                    hier_geo = child.find("mods:hierarchicalGeographic", namespaces)
                    country = hier_geo.find(".//mods:country", namespaces)
                    state = hier_geo.find(".//mods:state", namespaces)
                    county = hier_geo.find(".//mods:county", namespaces)
                    city = hier_geo.find(".//mods:city", namespaces)
                    if country is not None:
                        record_data["Country"] = country.text
                    if state is not None:
                        record_data["State"] = state.text
                    if county is not None:
                        record_data["County"] = county.text
                    if city is not None:
                        record_data["City"] = city.text
                # if child.find("mods:geographic", namespaces) is not None:
                #     geographic = child.find("mods:geographic", namespaces).text
                #     record_data.setdefault("Geographic", []).append(geographic)
                # elif child.find("mods:temporal", namespaces) is not None:
                #     temporal = child.find("mods:temporal", namespaces).text
                #     record_data.setdefault("Temporal", []).append(temporal)
                # elif child.find("mods:topic", namespaces) is not None:
                #     topic = child.find("mods:topic", namespaces).text
                #     record_data.setdefault("Subject", []).append(topic)
            elif child.tag.endswith("originInfo"):
                for info in child:
                    if info.tag.endswith("place"):
                        record_data["Place"] = info.find("mods:placeTerm", namespaces).text
                    elif info.tag.endswith("publisher"):
                        record_data["Publisher"] = info.text
                    elif info.tag.endswith("dateIssued"):
                        record_data["Date Issued"] = info.text
                    elif info.tag.endswith("dateCaptured"):
                        record_data["Date Captured"] = info.text
            # elif child.tag.endswith("part"):
            #     part_number = child.find(".//mods:number", namespaces).text
            #     record_data["Date Published"] = part_number
            elif child.tag.endswith("titleInfo"):
                if child.attrib.get("type") == "alternative":
                    alternative_title = child.find("mods:title", namespaces).text
                    record_data["Alternative Title"].append(alternative_title)

            elif child.tag.endswith("name"):
                if child.attrib.get("type") == "corporate":
                    corporate_name = child.find("mods:namePart", namespaces).text
                    record_data["Corporate Name"] = corporate_name
                elif child.attrib.get("type") == "personal":
                    author_name = child.find("mods:namePart", namespaces).text
                    record_data["Author"] = author_name

            # Additional fields from the provided XML structure
            elif child.tag.endswith("language"):
                language_term = child.find("mods:languageTerm", namespaces).text
                record_data["Language"] = language_term
            elif child.tag.endswith("physicalDescription"):
                form = child.find(".//mods:form", namespaces).text
                extent = child.find(".//mods:extent", namespaces).text
                record_data["Form"] = form
                record_data["Dimensions"] = extent
            elif child.tag.endswith("accessCondition"):
                record_data["Access Condition"] = child.text

        data.append(record_data)

    return data


def xml_to_excel(xml_file, excel_file):
    # Parse XML file
    data = parse_xml(xml_file)

    # Convert data to a DataFrame
    df = pd.DataFrame(data)

    # Reorder columns to have "Title" as the first column
    if "Title" in df.columns:
        columns = df.columns.tolist()
        columns.remove("Title")
        columns = ["Title"] + columns
        df = df[columns]

    # Export to Excel
    df.to_excel(excel_file, index=False)

    print(f"Data exported to {excel_file} successfully!")


# File Management
xml_file = "/Users/spaz/Documents/David/Data Scientist/Rescarta Stuff/metadata copy.xml"  # Replace with your XML file path
excel_file = "/Users/spaz/Desktop/Outputs/output.xlsx"  # Replace with desired output Excel file path

xml_to_excel(xml_file, excel_file)

Here is the xml file:

<?xml version="1.0" encoding="UTF-8" standalone="no"?>
<mets:mets xmlns:mets="http://www.loc.gov/METS/" xmlns:xlink="http://www.w3.org/1999/xlink" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" LABEL="Tagebuch (Ziemer diary)" OBJID="witrpl00/th190530/00000001" TYPE="ResCarta Monograph Metadata v7.0" xsi:schemaLocation="http://www.loc.gov/METS/ http://www.loc.gov/standards/mets/mets.xsd http://www.loc.gov/mods/v3 http://www.loc.gov/standards/mods/v3/mods-3-4.xsd http://www.loc.gov/mix/ http://www.loc.gov/standards/mix/mix02/mix02.xsd" xmlns:mix="http://www.loc.gov/mix/" xmlns:mods="http://www.loc.gov/mods/v3">
 <mets:metsHdr CREATEDATE="2019-08-14T08:11:43.647-05:00" RECORDSTATUS="COMPLETE">
  <mets:agent ROLE="CREATOR" TYPE="ORGANIZATION">
   <mets:name>ResCarta Tools</mets:name>
   <mets:note>Generated by the ResCarta Data Conversion Tool v7.0.2</mets:note>
  </mets:agent>
 </mets:metsHdr>
 <mets:dmdSec ID="DMD1">
  <mets:mdWrap MDTYPE="MODS" MIMETYPE="text/xml">
   <mets:xmlData>
    <mods:mods>
     <mods:titleInfo>
      <mods:title>Tagebuch (Ziemer diary)</mods:title>
     </mods:titleInfo>
     <mods:titleInfo type="alternative">
      <mods:title>Ziemer diary</mods:title>
     </mods:titleInfo>
     <mods:titleInfo type="alternative">
      <mods:title>F.G. Ziemer Tagebuch</mods:title>
     </mods:titleInfo>
     <mods:name type="corporate">
      <mods:namePart>Theresa Public Library</mods:namePart>
      <mods:role>
       <mods:roleTerm authority="marcrelator" type="code">own</mods:roleTerm>
       <mods:roleTerm authority="marcrelator" type="text">Owner</mods:roleTerm>
      </mods:role>
     </mods:name>
     <mods:name type="corporate">
      <mods:namePart>Theresa Historical Society</mods:namePart>
      <mods:role>
       <mods:roleTerm authority="marcrelator" type="code">own</mods:roleTerm>
       <mods:roleTerm authority="marcrelator" type="text">Owner</mods:roleTerm>
      </mods:role>
     </mods:name>
     <mods:name type="personal">
      <mods:namePart>Ziemer, Friedrich Gorg Emanuel</mods:namePart>
      <mods:role>
       <mods:roleTerm authority="marcrelator" type="code">aut</mods:roleTerm>
       <mods:roleTerm authority="marcrelator" type="text">Author</mods:roleTerm>
      </mods:role>
     </mods:name>
     <mods:typeOfResource>text</mods:typeOfResource>
     <mods:genre authority="marcgt">book</mods:genre>
     <mods:originInfo>
      <mods:dateIssued encoding="iso8601" qualifier="questionable">1930-06-21</mods:dateIssued>
      <mods:dateCaptured encoding="iso8601">2018-09-20</mods:dateCaptured>
      <mods:issuance>monographic</mods:issuance>
     </mods:originInfo>
     <mods:language>
      <mods:languageTerm authority="iso639-2b" type="code">eng</mods:languageTerm>
     </mods:language>
     <mods:language>
      <mods:languageTerm authority="iso639-2b" type="code">ger</mods:languageTerm>
     </mods:language>
     <mods:physicalDescription>
      <mods:form authority="marcform">electronic</mods:form>
      <mods:extent>8.18 x 11.61 in.</mods:extent>
      <mods:digitalOrigin>reformatted digital</mods:digitalOrigin>
     </mods:physicalDescription>
     <mods:subject>
      <mods:hierarchicalGeographic>
       <mods:country>United States</mods:country>
       <mods:state>Wisconsin</mods:state>
       <mods:county>Dodge</mods:county>
       <mods:city>Theresa</mods:city>
      </mods:hierarchicalGeographic>
     </mods:subject>
     <mods:subject authority="lctgm">
      <mods:topic>Farm life</mods:topic>
     </mods:subject>
     <mods:subject authority="lctgm">
      <mods:topic>Farmers</mods:topic>
     </mods:subject>
     <mods:subject authority="lctgm">
      <mods:topic>Almanacs</mods:topic>
     </mods:subject>
     <mods:identifier type="local">witrpl00/th190530/00000001</mods:identifier>
     <mods:location>
      <mods:url>witrpl00/th190530/00000001</mods:url>
     </mods:location>
     <mods:accessCondition type="use and reproduction">http://rightsstatements.org/vocab/InC/1.0/</mods:accessCondition>
     <mods:part order="1">
      <mods:extent unit="pages">
       <mods:start>1</mods:start>
       <mods:end>248</mods:end>
      </mods:extent>
     </mods:part>
    </mods:mods>
   </mets:xmlData>
  </mets:mdWrap>
 </mets:dmdSec>
 <mets:amdSec ID="AMD1">

What follows this xml code is info about each .tiff image that each represents a page in the document (about 245 pages). Each page has xml code like the following. But it was not included here due to the length. Plus I do not think it is needed for this data transfer. According to my basic instructions I just need to convert all the .tiff files to .jpeg (I did that with some tweaking of a python file I was given) then I need to export the xml file relevant data into a one row Excel file. So to answer your other question, we would need a separate column for each data type and if multiple instances of a type exits, they can either all have their own column or otherwise combine the data into one cell for that column. No second rows.

   <mets:file ADMID="AMD1" CHECKSUM="f5aec7b842ec762e6cbf07a6b6bb23070e7c96dd" CHECKSUMTYPE="SHA-1" ID="FID11" MIMETYPE="image/tiff" SIZE="11393570">
    <mets:FLocat LOCTYPE="URL" xlink:href="file:///./00000011.tif" xlink:type="simple"/>
   </mets:file>

  <mets:div LABEL="11" ORDER="11" TYPE="page" xlink:label="PPG11">
    <mets:fptr FILEID="FID11"/>
   </mets:div>

Solution

  • Looks like you overwrite your dictionary with the second values. Dictionaries doesn’t allow duplicate keys, what’s the case here, compare this post.

    You can search with xpath, if you use xml.etree.ElementTree , e.g.:

    findall() gives you a list.

    for alternative_title in root.findall(".//*[@type='alternative']/mods:title", namespaces):
        print(alternative_title.text)
    

    Output:

    Ziemer diary
    F.G. Ziemer Tagebuch
    

    Option2:

    from collections import defaultdict
    
    record_data = defaultdict(list)
    for child in root.iter():
        if child.tag.endswith("titleInfo"):
            if child.attrib.get("type") == "alternative":
                alternative_title = child.find("mods:title", namespaces).text
                record_data["Alternative Title"].append(alternative_title)
    
    for key, val in record_data.items():
        print(key, val)
    # Alternative Title ['Ziemer diary', 'F.G. Ziemer Tagebuch']
    

    Than you can use pandas to write to Excel:

    import pandas as pd
    
    df = pd.DataFrame.from_dict(record_data)
    print(df)
    # https://pandas.pydata.org/docs/reference/api/pandas.DataFrame.to_excel.html
    # if you have pip install openpyxl, e.g.
    # df.to_excel('output.xlsx')
    

    Output:

          Alternative Title
    0          Ziemer diary
    1  F.G. Ziemer Tagebuch
    

    Suggestion Cut you xml in pieces and collect the data:

    import xml.etree.ElementTree as ET
    import pandas as pd
    
    class MDWrap:
        def __init__(self, ns, tree):
            self.data = []
            self.columns = ["title", "Alternative titles"]
            self.root = tree
            self.ns = ns
    
            # Check, if mdWrap available
            try:
                self.mdwrap = self.root.find(".//mets:mdWrap", namespaces=ns)
            except:
                print("No nmWrap node available")  
    
            # parse title and alternative titles
            self.titleInfo = self.root.findall(".//mods:titleInfo", namespaces=ns)
            self.titles(self.ns, self.titleInfo)
            self.data.append(self.title)
            self.data.append(self.alt_titles)
    
            # Parse your other stuff here ...
            # ....
    
    
        def titles(self, ns, title_nodes):
            """ self.title -> string, self.alt_title -> list """
            self.alt_titles = []
            for alt_t in title_nodes:
                if alt_t.get('type'):
                    self.alt_titles.append(alt_t.find('./*').text)
                    #print(alt_t.get('type'), alt_t.find('./*').text)
                if len(alt_t.attrib) == 0:
                    self.title = alt_t.find('./*').text
                    #print("main", self.title)
    
        def write_excel(self, data, col, file_out):
            df = pd.DataFrame([data], columns=col)
            print(df)
            # pip install openpyxl is necessary, if you dont have this
            df.to_excel(file_out)
                       
                    
    if __name__ == "__main__":
        xml_file = "lib.xml"
        xlsx_out = "dmdS_data.xlsx"
        
        nsmap = {"mets": "http://www.loc.gov/METS/", "mods":"http://www.loc.gov/mods/v3"}
    
        for event, node in ET.iterparse(xml_file, events=['end']):
            #print(node.tag)
            if node.tag == f"{{{nsmap['mets']}}}dmdSec":
                dmdS = node
                mdW = MDWrap(nsmap, dmdS)
                node.clear()
            
        # print(mdW.title)
        # print(mdW.alt_titles)
        # print(mdW.data)
    
        # Write Excel if all data collected
        mdW.write_excel(mdW.data, mdW.columns, xlsx_out)
    

    Output plus Excel:

                         title                    Alternative titles
    0  Tagebuch (Ziemer diary)  [Ziemer diary, F.G. Ziemer Tagebuch]