Search code examples
pythonxmlsqliteparsingsax

SAX Confusing Two Nodes With Same Name in XML


I have an XML that I need to parse in python using SAX.

Here is a small part of my XML :

<MovieRating>
  <Movie Id="1">
    <Title>Father Figures</Title>
    <Duration>01:53:00</Duration>
    <Description>Upon learning that their mother has been lying to them for years about their allegedly deceased father, two fraternal twin brothers hit the road in order to find him.</Description>
    <Release_Date>2017-12-22</Release_Date>
    <Image_URL>https://image.com/1.jpg</Image_URL>
    <Country>USA</Country>
    <Genres>
      <Genre Id="5">
        <Title>Comedy</Title>
        <Description>None</Description>
      </Genre>
    </Genres>
    </Movies>
</MovieRating>

I am trying to parse it and save it into a DB, in the following matter:

import sqlite3
import xml.sax 

class MoviesHandler(xml.sax.ContentHandler):
    def __init__(self):
        self.sql_attr_name = None
        self.sql_attrs = dict()
        self.conn = None

    def startDocument(self):
        self.conn = sqlite3.connect('moviez_sax.db')
        c = self.conn.cursor()
        c.execute('DROP TABLE IF EXISTS MOVIE')
        c.execute('''
                    CREATE TABLE IF NOT EXISTS Movie (
                Id           INTEGER        NOT NULL,
                Title        VARCHAR (1000) NOT NULL,
                Duration     TIME           NOT NULL,
                Description  VARCHAR (5000),
                Release_Date DATE           NOT NULL,
                Image_URL    VARCHAR (1000),
                Country      VARCHAR (150), 
                PRIMARY KEY (Id)
            );''');

    def endDocument(self):
        self.conn.commit()
        self.conn.close()


    def startElement(self, xml_name, xml_attrs):
        #print("start element", xml_name)
        if xml_name.lower() == 'movierating':
            pass
        if xml_name.lower() == 'movie':
            self.sql_attr_name = None
            self.sql_attrs = {
                'Id' : '',
                'Title' : '',
                'Duration' : '',
                'Description' : '',
                'Release_Date' : '',
                'Image_URL' : '',
                'Country':''
            }
            self.sql_attrs['Id'] += xml_attrs['Id']
        elif xml_name.lower() in ['id', 'title', 'duration', 'description' , 'release_date', 'image_url','country']:
            self.sql_attr_name = xml_name
        else:
            pass

    def characters(self, text):
        if self.sql_attr_name is not None:
            self.sql_attrs[self.sql_attr_name] += text

    def endElement(self, xml_name):
        if xml_name.lower() == 'movie':
            c = self.conn.cursor()
            c.execute('INSERT INTO MOVIE(Id,Title,Duration,\
                       Description,Release_Date,Image_URL, Country) VALUES \
                       (?,?,?,?,?,?,?)',
                      (self.sql_attrs['Id'].strip(),
                       self.sql_attrs['Title'].strip(),
                       self.sql_attrs['Duration'].strip(),
                       self.sql_attrs['Description'].strip(),
                       self.sql_attrs['Release_Date'].strip(),
                       self.sql_attrs['Image_URL'].strip(),
                       self.sql_attrs['Country'].strip())) 

if __name__ == '__main__':
    parser = xml.sax.make_parser()
    parser.setContentHandler(MovieRatingHandler())
    parser.parse(open('movies.xml','r'))

My problem is, whenever i read self.sql_attrs['Title'].strip(), it is reading the TITLE node from both nodes : Movie and Genre

And it is saving the value of both concatenated. Like in that example, the value i'm getting for title is :

Father Figures \n Comedy

Is there a way to specify to SAX which node title but also which node path to read ? Because i have many nodes with same names but i wanna read "Movie" alone and "Genre" alone and save them in two different tables.

Thank you.


Solution

  • You need to record an event in case of both genre and movie. You will then use a stack/list to push the current node.

    You should maintain a node based dict in that case too. Below is just an update to your code, not the most efficient one, but shows how you can expand your example

    import sqlite3
    import xml.sax
    
    class MoviesHandler(xml.sax.ContentHandler):
        def __init__(self):
            self.sql_attr_name = None
            self.nodes = []
            self.sql_attrs = dict()
            self.conn = None
    
        def startDocument(self):
            self.conn = sqlite3.connect('moviez_sax.db')
            c = self.conn.cursor()
            c.execute('DROP TABLE IF EXISTS MOVIE')
            c.execute('''
                        CREATE TABLE IF NOT EXISTS Movie (
                    Id           INTEGER        NOT NULL,
                    Title        VARCHAR (1000) NOT NULL,
                    Duration     TIME           NOT NULL,
                    Description  VARCHAR (5000),
                    Release_Date DATE           NOT NULL,
                    Image_URL    VARCHAR (1000),
                    Country      VARCHAR (150), 
                    PRIMARY KEY (Id)
                );''');
    
        def endDocument(self):
            self.conn.commit()
            self.conn.close()
    
    
        def startElement(self, xml_name, xml_attrs):
            #print("start element", xml_name)
            if xml_name.lower() == 'movierating':
                pass
            if xml_name.lower() == 'genre':
                self.nodes.append(xml_name)
                self.sql_attrs[xml_name] = {
                    'Title': '',
                    'Description': '',
                }
            if xml_name.lower() == 'movie':
                self.nodes.append(xml_name)
                self.sql_attr_name = None
                self.sql_attrs[xml_name] = {
                    'Id' : '',
                    'Title' : '',
                    'Duration' : '',
                    'Description' : '',
                    'Release_Date' : '',
                    'Image_URL' : '',
                    'Country':''
                }
                self.sql_attrs[xml_name]['Id'] += xml_attrs['Id']
            elif xml_name.lower() in ['id', 'title', 'duration', 'description' , 'release_date', 'image_url','country']:
                self.sql_attr_name = xml_name
            else:
                self.sql_attr_name = None
                pass
    
        def characters(self, text):
            if self.sql_attr_name is None or len(self.nodes) == 0:
                return
    
            if self.sql_attrs[self.nodes[-1]] is not None:
                self.sql_attrs[self.nodes[-1]][self.sql_attr_name] += text
    
        def endElement(self, xml_name):
            if xml_name.lower() in ['movie', 'genre']:
                self.nodes.pop()
    
            if xml_name.lower() == 'movie':
                c = self.conn.cursor()
                c.execute('INSERT INTO MOVIE(Id,Title,Duration,\
                           Description,Release_Date,Image_URL, Country) VALUES \
                           (?,?,?,?,?,?,?)',
                          (self.sql_attrs[xml_name]['Id'].strip(),
                           self.sql_attrs[xml_name]['Title'].strip(),
                           self.sql_attrs[xml_name]['Duration'].strip(),
                           self.sql_attrs[xml_name]['Description'].strip(),
                           self.sql_attrs[xml_name]['Release_Date'].strip(),
                           self.sql_attrs[xml_name]['Image_URL'].strip(),
                           self.sql_attrs[xml_name]['Country'].strip()))
    
    
    if __name__ == '__main__':
        parser = xml.sax.make_parser()
        parser.setContentHandler(MoviesHandler())
        parser.parse(open('movies.xml','r'))
    

    And as you can see the debug session the titles are correct

    Working