I have an XML that I need to parse in python using SAX.
Here is a small part of my XML :
<MovieRating>
<Movie Id="1">
<Title>Father Figures</Title>
<Duration>01:53:00</Duration>
<Description>Upon learning that their mother has been lying to them for years about their allegedly deceased father, two fraternal twin brothers hit the road in order to find him.</Description>
<Release_Date>2017-12-22</Release_Date>
<Image_URL>https://image.com/1.jpg</Image_URL>
<Country>USA</Country>
<Genres>
<Genre Id="5">
<Title>Comedy</Title>
<Description>None</Description>
</Genre>
</Genres>
</Movies>
</MovieRating>
I am trying to parse it and save it into a DB, in the following matter:
import sqlite3
import xml.sax
class MoviesHandler(xml.sax.ContentHandler):
def __init__(self):
self.sql_attr_name = None
self.sql_attrs = dict()
self.conn = None
def startDocument(self):
self.conn = sqlite3.connect('moviez_sax.db')
c = self.conn.cursor()
c.execute('DROP TABLE IF EXISTS MOVIE')
c.execute('''
CREATE TABLE IF NOT EXISTS Movie (
Id INTEGER NOT NULL,
Title VARCHAR (1000) NOT NULL,
Duration TIME NOT NULL,
Description VARCHAR (5000),
Release_Date DATE NOT NULL,
Image_URL VARCHAR (1000),
Country VARCHAR (150),
PRIMARY KEY (Id)
);''');
def endDocument(self):
self.conn.commit()
self.conn.close()
def startElement(self, xml_name, xml_attrs):
#print("start element", xml_name)
if xml_name.lower() == 'movierating':
pass
if xml_name.lower() == 'movie':
self.sql_attr_name = None
self.sql_attrs = {
'Id' : '',
'Title' : '',
'Duration' : '',
'Description' : '',
'Release_Date' : '',
'Image_URL' : '',
'Country':''
}
self.sql_attrs['Id'] += xml_attrs['Id']
elif xml_name.lower() in ['id', 'title', 'duration', 'description' , 'release_date', 'image_url','country']:
self.sql_attr_name = xml_name
else:
pass
def characters(self, text):
if self.sql_attr_name is not None:
self.sql_attrs[self.sql_attr_name] += text
def endElement(self, xml_name):
if xml_name.lower() == 'movie':
c = self.conn.cursor()
c.execute('INSERT INTO MOVIE(Id,Title,Duration,\
Description,Release_Date,Image_URL, Country) VALUES \
(?,?,?,?,?,?,?)',
(self.sql_attrs['Id'].strip(),
self.sql_attrs['Title'].strip(),
self.sql_attrs['Duration'].strip(),
self.sql_attrs['Description'].strip(),
self.sql_attrs['Release_Date'].strip(),
self.sql_attrs['Image_URL'].strip(),
self.sql_attrs['Country'].strip()))
if __name__ == '__main__':
parser = xml.sax.make_parser()
parser.setContentHandler(MovieRatingHandler())
parser.parse(open('movies.xml','r'))
My problem is, whenever i read self.sql_attrs['Title'].strip()
, it is reading the TITLE node from both nodes : Movie and Genre
And it is saving the value of both concatenated. Like in that example, the value i'm getting for title is :
Father Figures \n Comedy
Is there a way to specify to SAX which node title but also which node path to read ? Because i have many nodes with same names but i wanna read "Movie" alone and "Genre" alone and save them in two different tables.
Thank you.
You need to record an event in case of both genre
and movie
. You will then use a stack/list to push the current node.
You should maintain a node based dict in that case too. Below is just an update to your code, not the most efficient one, but shows how you can expand your example
import sqlite3
import xml.sax
class MoviesHandler(xml.sax.ContentHandler):
def __init__(self):
self.sql_attr_name = None
self.nodes = []
self.sql_attrs = dict()
self.conn = None
def startDocument(self):
self.conn = sqlite3.connect('moviez_sax.db')
c = self.conn.cursor()
c.execute('DROP TABLE IF EXISTS MOVIE')
c.execute('''
CREATE TABLE IF NOT EXISTS Movie (
Id INTEGER NOT NULL,
Title VARCHAR (1000) NOT NULL,
Duration TIME NOT NULL,
Description VARCHAR (5000),
Release_Date DATE NOT NULL,
Image_URL VARCHAR (1000),
Country VARCHAR (150),
PRIMARY KEY (Id)
);''');
def endDocument(self):
self.conn.commit()
self.conn.close()
def startElement(self, xml_name, xml_attrs):
#print("start element", xml_name)
if xml_name.lower() == 'movierating':
pass
if xml_name.lower() == 'genre':
self.nodes.append(xml_name)
self.sql_attrs[xml_name] = {
'Title': '',
'Description': '',
}
if xml_name.lower() == 'movie':
self.nodes.append(xml_name)
self.sql_attr_name = None
self.sql_attrs[xml_name] = {
'Id' : '',
'Title' : '',
'Duration' : '',
'Description' : '',
'Release_Date' : '',
'Image_URL' : '',
'Country':''
}
self.sql_attrs[xml_name]['Id'] += xml_attrs['Id']
elif xml_name.lower() in ['id', 'title', 'duration', 'description' , 'release_date', 'image_url','country']:
self.sql_attr_name = xml_name
else:
self.sql_attr_name = None
pass
def characters(self, text):
if self.sql_attr_name is None or len(self.nodes) == 0:
return
if self.sql_attrs[self.nodes[-1]] is not None:
self.sql_attrs[self.nodes[-1]][self.sql_attr_name] += text
def endElement(self, xml_name):
if xml_name.lower() in ['movie', 'genre']:
self.nodes.pop()
if xml_name.lower() == 'movie':
c = self.conn.cursor()
c.execute('INSERT INTO MOVIE(Id,Title,Duration,\
Description,Release_Date,Image_URL, Country) VALUES \
(?,?,?,?,?,?,?)',
(self.sql_attrs[xml_name]['Id'].strip(),
self.sql_attrs[xml_name]['Title'].strip(),
self.sql_attrs[xml_name]['Duration'].strip(),
self.sql_attrs[xml_name]['Description'].strip(),
self.sql_attrs[xml_name]['Release_Date'].strip(),
self.sql_attrs[xml_name]['Image_URL'].strip(),
self.sql_attrs[xml_name]['Country'].strip()))
if __name__ == '__main__':
parser = xml.sax.make_parser()
parser.setContentHandler(MoviesHandler())
parser.parse(open('movies.xml','r'))
And as you can see the debug session the titles are correct