I have been given a python file that extracts relevant data from a xml file to be exported to an Excel file. I have it working for the most part. The xml file is one set of data to be extracted into one row in Excel.
The issue I do not know how to resolve is how to extract (potentially) multiple values of the same type. What I wrote only extracts one instance.
Here is an example of what I have in the python code:
elif child.tag.endswith("titleInfo"):
if child.attrib.get("type") == "alternative":
alternative_title = child.find("mods:title", namespaces).text
record_data["Alternative Title"] = alternative_title
Here is the xml relevant file info (other xml files that I need to process may have 0, 1, or more alternative titles):
<mods:titleInfo type="alternative">
<mods:title>Ziemer diary</mods:title>
</mods:titleInfo>
<mods:titleInfo type="alternative">
<mods:title>F.G. Ziemer Tagebuch</mods:title>
</mods:titleInfo>
**When I run it, only the second instance is recorded**
There are several places in these xml files that will need the same solution.
Thanks for the help!
Full python code:
# MODS/METS XML to XLSX Crosswalk
import xml.etree.ElementTree as ET
import pandas as pd
def parse_xml(xml_file):
    """Parse a METS/MODS XML file and return one record dict per mets:dmdSec.

    Repeatable MODS elements (alternative titles, corporate/personal names,
    languages, subject topics) are collected into a single semicolon-separated
    string so each dmdSec still maps to exactly one spreadsheet row.

    Args:
        xml_file: path or file-like object accepted by ElementTree.parse().

    Returns:
        list[dict]: one flat {column name: cell value} dict per dmdSec.
    """
    tree = ET.parse(xml_file)
    root = tree.getroot()

    # Namespace prefixes used throughout the MODS/METS document.
    namespaces = {
        "mods": "http://www.loc.gov/mods/v3",
        "mets": "http://www.loc.gov/METS/",
    }

    def text_of(parent, path):
        # Safe "find().text": returns None instead of raising AttributeError
        # when the element is absent.
        node = parent.find(path, namespaces)
        return node.text if node is not None else None

    def add_multi(record, key, value):
        # Accumulate a repeatable field in a list; joined into one cell below.
        # This fixes the original bug where each new occurrence overwrote the
        # previous one (dict keys are unique).
        if value is not None:
            record.setdefault(key, []).append(value)

    data = []
    for dmd_sec in root.iterfind(".//mets:dmdSec", namespaces):
        record_data = {}
        mods = dmd_sec.find(".//mods:mods", namespaces)
        if mods is None:  # dmdSec without a MODS payload: nothing to extract
            continue

        for child in mods:
            tag = child.tag  # fully-qualified: "{namespace}localname"
            if tag.endswith("titleInfo"):
                title = text_of(child, "mods:title")
                if child.attrib.get("type") == "alternative":
                    add_multi(record_data, "Alternative Title", title)
                elif title is not None:
                    record_data["Title"] = title
            elif tag.endswith("abstract"):
                record_data["Abstract"] = child.text
            elif tag.endswith("identifier"):
                record_data["Identifier"] = child.text
            elif tag.endswith("location"):
                record_data["Path"] = text_of(child, "mods:url")
            elif tag.endswith("typeOfResource"):
                record_data["Type of Resource"] = child.text
            elif tag.endswith("genre"):
                record_data["Genre"] = child.text
            elif tag.endswith("subject"):
                hier_geo = child.find("mods:hierarchicalGeographic", namespaces)
                if hier_geo is not None:
                    for place, column in (
                        ("country", "Country"),
                        ("state", "State"),
                        ("county", "County"),
                        ("city", "City"),
                    ):
                        value = text_of(hier_geo, f".//mods:{place}")
                        if value is not None:
                            record_data[column] = value
                # A subject may also carry a repeatable topic term.
                add_multi(record_data, "Subject", text_of(child, "mods:topic"))
            elif tag.endswith("originInfo"):
                for info in child:
                    if info.tag.endswith("place"):
                        record_data["Place"] = text_of(info, "mods:placeTerm")
                    elif info.tag.endswith("publisher"):
                        record_data["Publisher"] = info.text
                    elif info.tag.endswith("dateIssued"):
                        record_data["Date Issued"] = info.text
                    elif info.tag.endswith("dateCaptured"):
                        record_data["Date Captured"] = info.text
            elif tag.endswith("name"):
                name = text_of(child, "mods:namePart")
                if child.attrib.get("type") == "corporate":
                    add_multi(record_data, "Corporate Name", name)
                elif child.attrib.get("type") == "personal":
                    add_multi(record_data, "Author", name)
            elif tag.endswith("language"):
                add_multi(record_data, "Language", text_of(child, "mods:languageTerm"))
            elif tag.endswith("physicalDescription"):
                form = text_of(child, ".//mods:form")
                extent = text_of(child, ".//mods:extent")
                if form is not None:
                    record_data["Form"] = form
                if extent is not None:
                    record_data["Dimensions"] = extent
            elif tag.endswith("accessCondition"):
                record_data["Access Condition"] = child.text

        # Collapse each multi-valued field into one "a; b; c" cell so the
        # record stays a single Excel row. A single value joins to itself.
        for key, value in record_data.items():
            if isinstance(value, list):
                record_data[key] = "; ".join(value)

        data.append(record_data)
    return data
def xml_to_excel(xml_file, excel_file):
    """Crosswalk a METS/MODS XML file into an Excel workbook.

    Args:
        xml_file: path to the source XML file.
        excel_file: path of the .xlsx file to create (requires openpyxl).
    """
    records = parse_xml(xml_file)
    df = pd.DataFrame(records)
    # Put "Title" first; every other column keeps its original order.
    if "Title" in df.columns:
        ordered = ["Title"] + [col for col in df.columns if col != "Title"]
        df = df[ordered]
    df.to_excel(excel_file, index=False)
    print(f"Data exported to {excel_file} successfully!")
# File Management
# NOTE(review): these statements run at import time; consider wrapping them in
# an `if __name__ == "__main__":` guard so the module can be imported safely.
xml_file = "/Users/spaz/Documents/David/Data Scientist/Rescarta Stuff/metadata copy.xml" # Replace with your XML file path
excel_file = "/Users/spaz/Desktop/Outputs/output.xlsx" # Replace with desired output Excel file path
xml_to_excel(xml_file, excel_file)
Here is the xml file:
<?xml version="1.0" encoding="UTF-8" standalone="no"?>
<mets:mets xmlns:mets="http://www.loc.gov/METS/" xmlns:xlink="http://www.w3.org/1999/xlink" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" LABEL="Tagebuch (Ziemer diary)" OBJID="witrpl00/th190530/00000001" TYPE="ResCarta Monograph Metadata v7.0" xsi:schemaLocation="http://www.loc.gov/METS/ http://www.loc.gov/standards/mets/mets.xsd http://www.loc.gov/mods/v3 http://www.loc.gov/standards/mods/v3/mods-3-4.xsd http://www.loc.gov/mix/ http://www.loc.gov/standards/mix/mix02/mix02.xsd" xmlns:mix="http://www.loc.gov/mix/" xmlns:mods="http://www.loc.gov/mods/v3">
<mets:metsHdr CREATEDATE="2019-08-14T08:11:43.647-05:00" RECORDSTATUS="COMPLETE">
<mets:agent ROLE="CREATOR" TYPE="ORGANIZATION">
<mets:name>ResCarta Tools</mets:name>
<mets:note>Generated by the ResCarta Data Conversion Tool v7.0.2</mets:note>
</mets:agent>
</mets:metsHdr>
<mets:dmdSec ID="DMD1">
<mets:mdWrap MDTYPE="MODS" MIMETYPE="text/xml">
<mets:xmlData>
<mods:mods>
<mods:titleInfo>
<mods:title>Tagebuch (Ziemer diary)</mods:title>
</mods:titleInfo>
<mods:titleInfo type="alternative">
<mods:title>Ziemer diary</mods:title>
</mods:titleInfo>
<mods:titleInfo type="alternative">
<mods:title>F.G. Ziemer Tagebuch</mods:title>
</mods:titleInfo>
<mods:name type="corporate">
<mods:namePart>Theresa Public Library</mods:namePart>
<mods:role>
<mods:roleTerm authority="marcrelator" type="code">own</mods:roleTerm>
<mods:roleTerm authority="marcrelator" type="text">Owner</mods:roleTerm>
</mods:role>
</mods:name>
<mods:name type="corporate">
<mods:namePart>Theresa Historical Society</mods:namePart>
<mods:role>
<mods:roleTerm authority="marcrelator" type="code">own</mods:roleTerm>
<mods:roleTerm authority="marcrelator" type="text">Owner</mods:roleTerm>
</mods:role>
</mods:name>
<mods:name type="personal">
<mods:namePart>Ziemer, Friedrich Gorg Emanuel</mods:namePart>
<mods:role>
<mods:roleTerm authority="marcrelator" type="code">aut</mods:roleTerm>
<mods:roleTerm authority="marcrelator" type="text">Author</mods:roleTerm>
</mods:role>
</mods:name>
<mods:typeOfResource>text</mods:typeOfResource>
<mods:genre authority="marcgt">book</mods:genre>
<mods:originInfo>
<mods:dateIssued encoding="iso8601" qualifier="questionable">1930-06-21</mods:dateIssued>
<mods:dateCaptured encoding="iso8601">2018-09-20</mods:dateCaptured>
<mods:issuance>monographic</mods:issuance>
</mods:originInfo>
<mods:language>
<mods:languageTerm authority="iso639-2b" type="code">eng</mods:languageTerm>
</mods:language>
<mods:language>
<mods:languageTerm authority="iso639-2b" type="code">ger</mods:languageTerm>
</mods:language>
<mods:physicalDescription>
<mods:form authority="marcform">electronic</mods:form>
<mods:extent>8.18 x 11.61 in.</mods:extent>
<mods:digitalOrigin>reformatted digital</mods:digitalOrigin>
</mods:physicalDescription>
<mods:subject>
<mods:hierarchicalGeographic>
<mods:country>United States</mods:country>
<mods:state>Wisconsin</mods:state>
<mods:county>Dodge</mods:county>
<mods:city>Theresa</mods:city>
</mods:hierarchicalGeographic>
</mods:subject>
<mods:subject authority="lctgm">
<mods:topic>Farm life</mods:topic>
</mods:subject>
<mods:subject authority="lctgm">
<mods:topic>Farmers</mods:topic>
</mods:subject>
<mods:subject authority="lctgm">
<mods:topic>Almanacs</mods:topic>
</mods:subject>
<mods:identifier type="local">witrpl00/th190530/00000001</mods:identifier>
<mods:location>
<mods:url>witrpl00/th190530/00000001</mods:url>
</mods:location>
<mods:accessCondition type="use and reproduction">http://rightsstatements.org/vocab/InC/1.0/</mods:accessCondition>
<mods:part order="1">
<mods:extent unit="pages">
<mods:start>1</mods:start>
<mods:end>248</mods:end>
</mods:extent>
</mods:part>
</mods:mods>
</mets:xmlData>
</mets:mdWrap>
</mets:dmdSec>
<mets:amdSec ID="AMD1">
What follows this xml code is info about each .tiff image that each represents a page in the document (about 245 pages). Each page has xml code like the following. But it was not included here due to the length. Plus I do not think it is needed for this data transfer. According to my basic instructions I just need to convert all the .tiff files to .jpeg (I did that with some tweaking of a python file I was given) then I need to export the xml file relevant data into a one row Excel file. So to answer your other question, we would need a separate column for each data type and if multiple instances of a type exits, they can either all have their own column or otherwise combine the data into one cell for that column. No second rows.
<mets:file ADMID="AMD1" CHECKSUM="f5aec7b842ec762e6cbf07a6b6bb23070e7c96dd" CHECKSUMTYPE="SHA-1" ID="FID11" MIMETYPE="image/tiff" SIZE="11393570">
<mets:FLocat LOCTYPE="URL" xlink:href="file:///./00000011.tif" xlink:type="simple"/>
</mets:file>
<mets:div LABEL="11" ORDER="11" TYPE="page" xlink:label="PPG11">
<mets:fptr FILEID="FID11"/>
</mets:div>
Looks like you overwrite your dictionary value with the second occurrence. Dictionaries don’t allow duplicate keys, which is what happens here; compare this post.
You can search with xpath, if you use xml.etree.ElementTree
, e.g.:
findall()
gives you a list.
for alternative_title in root.findall(".//*[@type='alternative']/mods:title", namespaces):
print(alternative_title.text)
Output:
Ziemer diary
F.G. Ziemer Tagebuch
Option2:
from collections import defaultdict
record_data = defaultdict(list)
for child in root.iter():
if child.tag.endswith("titleInfo"):
if child.attrib.get("type") == "alternative":
alternative_title = child.find("mods:title", namespaces).text
record_data["Alternative Title"].append(alternative_title)
for key, val in record_data.items():
print(key, val)
# Alternative Title ['Ziemer diary', 'F.G. Ziemer Tagebuch']
Then you can use pandas to write to Excel:
import pandas as pd
df = pd.DataFrame.from_dict(record_data)
print(df)
# https://pandas.pydata.org/docs/reference/api/pandas.DataFrame.to_excel.html
# if you have pip install openpyxl, e.g.
# df.to_excel('output.xlsx')
Output:
Alternative Title
0 Ziemer diary
1 F.G. Ziemer Tagebuch
Suggestion: Cut your XML into pieces and collect the data:
import xml.etree.ElementTree as ET
import pandas as pd
class MDWrap:
    """Extracts title data from one mets:dmdSec subtree.

    Attributes after construction:
        title: text of the first titleInfo without attributes, or None.
        alt_titles: list of title texts from every typed titleInfo.
        data: [title, alt_titles] — one row for write_excel().
    """

    def __init__(self, ns, tree):
        """Parse *tree* (an Element rooted at/above mets:dmdSec) using the
        namespace map *ns*."""
        self.data = []
        self.columns = ["title", "Alternative titles"]
        self.root = tree
        self.ns = ns
        # find() returns None when the node is absent — it does not raise, so
        # the original bare `try/except` could never fire and would only have
        # masked real errors.
        self.mdwrap = self.root.find(".//mets:mdWrap", namespaces=ns)
        if self.mdwrap is None:
            print("No mdWrap node available")
        # parse title and alternative titles
        self.titleInfo = self.root.findall(".//mods:titleInfo", namespaces=ns)
        self.titles(self.ns, self.titleInfo)
        self.data.append(self.title)
        self.data.append(self.alt_titles)
        # Parse your other stuff here ...
        # ....

    def titles(self, ns, title_nodes):
        """Populate self.title (str | None) and self.alt_titles (list[str])."""
        # Initialize both so they exist even when the document has no
        # titleInfo nodes (the original left self.title undefined in that
        # case, crashing later in __init__).
        self.title = None
        self.alt_titles = []
        for node in title_nodes:
            child = node.find("./*")
            text = child.text if child is not None else None
            if node.get("type"):
                if text is not None:
                    self.alt_titles.append(text)
            elif len(node.attrib) == 0:
                self.title = text

    def write_excel(self, data, col, file_out):
        """Write one row of collected data to *file_out* (.xlsx).

        Requires openpyxl (`pip install openpyxl`)."""
        df = pd.DataFrame([data], columns=col)
        print(df)
        df.to_excel(file_out)
if __name__ == "__main__":
    xml_file = "lib.xml"
    xlsx_out = "dmdS_data.xlsx"
    nsmap = {"mets": "http://www.loc.gov/METS/", "mods": "http://www.loc.gov/mods/v3"}
    dmd_tag = f"{{{nsmap['mets']}}}dmdSec"
    # iterparse streams the document, so very large METS files are never
    # held in memory all at once.
    for event, node in ET.iterparse(xml_file, events=["end"]):
        if node.tag == dmd_tag:
            mdW = MDWrap(nsmap, node)
            node.clear()  # release the parsed subtree
            # Write the Excel file once all data is collected.
            mdW.write_excel(mdW.data, mdW.columns, xlsx_out)
Output plus Excel:
title Alternative titles
0 Tagebuch (Ziemer diary) [Ziemer diary, F.G. Ziemer Tagebuch]