Search code examples
pythonxmlqt5pyqt5qxmlstreamreader

The reading loop of QXmlStreamReader for PyQt5 does not return the expected data


I'd like to make a QAbstractItemModel that gets its data from a series of XML files, all situated in the same directory. Since PyQt5 no longer supports QDomDocument (or at least I couldn't find a way to make it work), I've had to resort to a QXmlStreamReader. I'm putting the data itself in a giant Python dictionary (well... not exactly giant by computer science standards) that contains other dictionaries under various keys to create a tree-like structure.

this is my code so far:

class DataModel(QtCore.QAbstractItemModel):
    """Item model whose constructor reads every ``*.xml`` file of a directory into ``self.data``."""

    def __init__(self, settingsDirectory, parent = None):
        """Parse all XML files found in ``settingsDirectory`` with a QXmlStreamReader.

        ``settingsDirectory`` is used through ``setNameFilters``, ``entryList``
        and ``absolutePath`` (presumably a ``QDir`` -- confirm against the caller).
        ``parent`` is forwarded unchanged to the base-class constructor.
        """
        super(DataModel, self).__init__(parent)
        # Restrict the directory listing to XML files before asking for the entries.
        settingsDirectory.setNameFilters(["*.xml"])
        files = settingsDirectory.entryList()
        print(files)

        # Single flat dictionary shared by all files; every element is stored at
        # this top level, which is why no nested (tree-like) structure can appear.
        self.data = {}

        for i in range(len(files)):
            filePath = str(files[i])
            file = QtCore.QFile(settingsDirectory.absolutePath() + "/" + str(filePath))
            fileOpens = file.open(file.ReadOnly | file.Text)
            if fileOpens:
                parser = QtCore.QXmlStreamReader(file)
                print("--------Beginning parsing----------")
                print("Reading file: "+str(filePath))
                # Pull tokens one at a time until the reader reports the end.
                while not parser.atEnd():
                    parser.readNext()

                    token = parser.tokenType()

                    print("Reading tag: " + str(parser.name()))
                    print("Tag type is: " + str(token))
                    if token == parser.StartDocument:
                        self.data["XML Version"] = str(parser.documentVersion())
                        self.data["XML Encoding"] = str(parser.documentEncoding())
                    if token == parser.StartElement:
                        # Only the most recently opened element name is remembered;
                        # the names of enclosing (parent) elements are overwritten.
                        tokenName = parser.name()
                    if parser.tokenType() == parser.Characters:
                        # Every Characters token overwrites tokenText, including the
                        # text that sits between two tags.
                        tokenText = parser.text()
                        print("This tag has a text value: " + str(tokenText))
                        print("current data: " + str(self.data))
                    if token == parser.EndElement:
                        # NOTE(review): tokenName and tokenText are only assigned in
                        # the branches above, so they are unbound if an EndElement is
                        # reached before a StartElement/Characters token was seen.
                        if tokenText != None:
                            self.data[tokenName] = tokenText
                        else:
                            self.data[tokenName] = {}
                        # Both values are reset after each end tag, so the end tag of
                        # an enclosing element is stored under the key None.
                        tokenName = None
                        tokenText = None
            else:
                print(self.tr("xml file did not open properly"))
        print(self.data)

While this code doesn't crash or anything, it does have a few issues, and I have no idea why they're happening or how to fix them:

1. The tokenName never changes from None for some reason - solved

2. The structure of the self.data dictionary does not turn into a tree-like one, no idea why :|

example data:

<?xml version="1.0" encoding="UTF-8"?>
<tag>
    <description>This is a text</description>
    <types>
        <typesAllowed></typesAllowed>
        <typesEnabled></typesEnabled>
    </types>
</tag>

yields the final result:

{'XML Encoding': 'UTF-8', 'XML Version': '1.0', 'typesAllowed': '\n\t\t', None: '\n', 'typesEnabled': '\n\t\t', 'description': 'This is a text'}

instead of the wanted:

{'XML Encoding': 'UTF-8', 'XML Version': '1.0', 'tag': {'description': 'This is a text', 'typesAllowed': '\n\t\t', 'typesEnabled': '\n\t\t'}}

I know these issues are most likely a result of my poor understanding of how a StreamReader works, so any and all tips would be welcome :)

edit 1:

The tokenName change was a silly positioning error, silly me. The code reflects the fix.

edit 2:

added an example and example output


Solution

  • This question is now solved; I took a different approach to the problem.

    I basically used a list as a stack. When a StartElement token has the attribute parseAs == "element", I append a tuple (name, {}) to the list; when it has parseAs == "text", I put the evaluated string (see the parseText function) into the last tuple's dictionary. When the reader meets an EndElement token, it finds the tuple with name == tokenName (the name of the current token) and puts that tuple's dictionary into the previous tuple's dictionary as an entry with the key name.

    There are a few more details as to how it works, but I'd probably just overcomplicate my explanation if I included them (how it knows when to submit currData to self.data, etc.).

    class DataModel(QtCore.QAbstractItemModel):
        def __init__(self, settingsDirectory, parent = None):
            """Load every ``*.xml`` file found in ``settingsDirectory``.

            Each file's outermost ``parseAs="element"`` element is stored in
            ``self.data`` under the element's name, as nested dictionaries.
            The XML version and encoding of each file are recorded in
            ``self.parsingLog`` under the key ``"File: <name>"``.

            Parameters:
                settingsDirectory: directory object providing
                    ``setNameFilters``, ``entryList`` and ``absoluteFilePath``
                    (presumably a ``QDir`` -- confirm against the caller).
                parent: forwarded unchanged to the base-class constructor.
            """
            super(DataModel, self).__init__(parent)
            settingsDirectory.setNameFilters(["*.xml"])
            files = settingsDirectory.entryList()
            print(files)

            # NOTE(review): an instance attribute called ``data`` hides the
            # ``data()`` method inherited from QAbstractItemModel on this
            # instance. The name is kept because callers read ``self.data``;
            # confirm before wiring this model to a view.
            self.data = {}
            self.parsingLog = {}

            for fileName in files:
                fileName = str(fileName)
                xmlFile = QtCore.QFile(settingsDirectory.absoluteFilePath(fileName))
                if not xmlFile.open(QtCore.QIODevice.ReadOnly | QtCore.QIODevice.Text):
                    print(self.tr("xml file did not open properly"))
                    continue
                try:
                    self._parseFile(fileName, xmlFile)
                finally:
                    # Release the file handle even if parsing terminates early.
                    xmlFile.close()

        def _parseFile(self, fileName, xmlFile):
            """Parse one opened XML file into ``self.data`` and ``self.parsingLog``."""
            reader = QtCore.QXmlStreamReader
            parser = QtCore.QXmlStreamReader(xmlFile)

            # Stack of (name, dict) tuples, one per currently open element that
            # was declared with parseAs="element"; the last entry is the innermost.
            currData = []
            # One flag per currently open non-text element: True when the element
            # was pushed onto currData. This lets an end tag be matched with its
            # own start tag instead of searching the stack by name.
            openElements = []

            print(self.tr("--------Beginning parsing--------"))
            print(self.tr("Reading file: "+str(fileName)))
            print(self.tr("---------------------------------"))

            # atEnd() also becomes true once the reader hits an error, so errors
            # are reported after the loop rather than inside it.
            while not parser.atEnd():
                parser.readNext()
                token = parser.tokenType()

                print(self.tr("--------------------"))
                print(self.tr("Token type: " + str(self.printTokenType(token))))

                if token == reader.StartElement:
                    tokenName = parser.name()
                    parseAs = parser.attributes().value("parseAs")

                    print(self.tr("Reading StartElement: " + str(tokenName)))
                    print(self.tr("parseAs: " + str(parseAs)))

                    if parseAs == "text":
                        # readElementText() consumes the matching end tag too,
                        # so text elements never produce an EndElement token here.
                        textValue = self.parseText(parser.readElementText())
                        print(self.tr("Text Value: " + str(textValue)))

                        if not currData:
                            # A text element needs an enclosing parseAs="element"
                            # dictionary to live in; without one the run is aborted.
                            print(self.tr("*******Terminating application*******"))
                            print(self.tr("Reason: currData is empty"))
                            print(self.tr("*******Terminating application*******"))
                            sys.exit()
                        currData[-1][1][tokenName] = textValue
                    elif parseAs == "element":
                        currData.append((tokenName, {}))
                        openElements.append(True)
                    else:
                        print(self.tr("******WARNING******"))
                        print(self.tr("parseAs attribute is not given correctly"))
                        print(self.tr("******WARNING******"))
                        # Remember the element so its end tag does not close
                        # an unrelated entry of currData.
                        openElements.append(False)

                    print(self.tr("--------------------"))

                elif token == reader.EndElement:
                    tokenName = parser.name()

                    print(self.tr("Reading EndElement: " + str(tokenName)))
                    print(self.tr("currData before: " + str(currData)))

                    if openElements and openElements.pop():
                        print(self.tr("Closing token: " + str(tokenName)))
                        closedName, closedData = currData.pop()
                        if currData:
                            # Nested element: attach it to its parent's dictionary.
                            currData[-1][1][closedName] = closedData
                            print(self.tr("currData after: " + str(currData)))
                            print(self.tr("--------------------"))
                        else:
                            # Outermost element: publish the finished tree.
                            print(self.tr("This is the final token, writing to self.data"), end = "")
                            self.data[closedName] = closedData
                            print(self.tr(".") * 5, end = "")
                            print(self.tr("done."))
                            print(self.tr("--------------------"))

                elif token == reader.Characters:
                    print(self.tr("Characters value: " + str(parser.text())))
                    print(self.tr("--------------------"))

                elif token == reader.StartDocument:
                    logKey = "File: "+str(fileName)
                    self.parsingLog[logKey] = {
                        "XML Version": str(parser.documentVersion()),
                        "XML Encoding": str(parser.documentEncoding()),
                    }
                    print(self.tr("File Version: " + str(self.parsingLog[logKey]["XML Version"])))
                    print(self.tr("File Encoding: " + str(self.parsingLog[logKey]["XML Encoding"])))

                elif token == reader.EndDocument:
                    print(self.tr("Cleaning up"), end = "")
                    print(self.tr(".") * 5, end = "")
                    print(self.tr("done."))
                    print(self.tr("self.data: " + str(self.data)))
                    print(self.tr("--------------------"))

            if parser.hasError():
                print(self.tr("XML file is not well-formatted"))
                print(self.tr("Reason: " + str(parser.errorString())))
    
        def parseText(self, text):
            """Convert element text to ``int``, ``float`` or ``str``.

            Text made only of the ASCII digits 0-9 becomes an ``int``. Text made
            only of digits and dots becomes a ``float`` when it is a valid
            number. Everything else, including the empty string and malformed
            numbers such as ``"1.2.3"`` or ``"."``, is returned as a string.
            Signed numbers and exponents are not recognised and stay strings.

            Raises:
                ValueError: if ``text`` is not a string.
            """
            if not isinstance(text, str):
                raise ValueError("parseText expects a str, got " + type(text).__name__)

            if text and all(char in "0123456789" for char in text):
                return int(text)

            if text and all(char in "0123456789." for char in text):
                try:
                    return float(text)
                except ValueError:
                    # Only digits and dots, but not a number (e.g. "1.2.3").
                    return str(text)

            return str(text)
    
        def printTokenType(self, token):
            """Return the name of a ``QXmlStreamReader`` token type as a string.

            Returns ``None`` when ``token`` is not one of the known token types.
            """
            reader = QtCore.QXmlStreamReader
            # Every entry uses the named enum value, including Invalid, so the
            # table does not depend on the numeric value of any token type.
            tokenNames = {
                reader.NoToken: "NoToken",
                reader.Invalid: "Invalid",
                reader.StartDocument: "StartDocument",
                reader.EndDocument: "EndDocument",
                reader.StartElement: "StartElement",
                reader.EndElement: "EndElement",
                reader.Characters: "Characters",
                reader.Comment: "Comment",
                reader.DTD: "DTD",
                reader.EntityReference: "EntityReference",
                reader.ProcessingInstruction: "ProcessingInstruction",
            }
            return tokenNames.get(token)