How to parse through XML that contains HTML

I'm trying to parse through Adium's XML format in Python. I'm looking to build a clean database of chats by clearing out all the formatting and hyperlinks.

I'm using xmltodict right now to create lists/dictionaries that I can iterate through. But I'm running into major problems any time I hit a hyperlink or text formatting, I think because I'm trying to brute-force my way through the XML: xmltodict places the additional tags deeper in the list/dictionary.

Basically, I feel like I'm approaching this wrong.

Here are two snippets of the XMLs I'm working with.

XML 1

<?xml version="1.0" encoding="UTF-8" ?>
<chat xmlns="http://purl.org/net/ulf/ns/0.4-02" account="someusername" service="AIM">
<message sender="someusername" time="2008-07-27T18:02:34-0700"><div><span style="font-family: Arial; font-size: 10pt;">time is not of the essence</span></div></message>
<message sender="someusername" time="2008-07-27T18:02:43-0700"><div><span style="font-family: Arial; font-size: 10pt;">it <span style="font-style: italic;">is</span></span><span style="font-family: Helvetica; font-size: 12pt;"> </span><span style="font-family: Arial; font-size: 10pt;">the essence</span></div></message>
<message sender="anotherusername" time="2008-07-27T18:03:49-0700"><div><span style="color: #000000; font-family: Helvetica; font-size: 12pt;">yo</span></div></message>
<message sender="anotherusername" time="2008-07-27T18:03:51-0700"><div><span style="color: #000000; font-family: Helvetica; font-size: 12pt;">whats up?</span></div></message></chat>

XML 2

<?xml version="1.0" encoding="UTF-8" ?>
    <chat xmlns="http://purl.org/net/ulf/ns/0.4-02" account="someusername" service="AIM">
    <message sender="someusername" time="2009-09-26T05:54:23-0700"><div><a href="http://www.youtube.com/watch?v=LqbJx4TFFEE&amp;feature=related" style="color: #000000; font-family: Helvetica; font-size: 12pt;">http://www.youtube.com/watch?v=LqbJx4TFFEE&amp;feature=related</a></div></message>
    <message sender="someusername" time="2009-09-27T16:12:29-0700"><div><span style="color: #000000; font-family: Helvetica; font-size: 12pt;">2nd take, with the bonus stuff I think</span></div></message>
    <message sender="someusername" time="2009-09-27T17:18:52-0700"><div></div></message></chat>

And this is the code I've been working with (apologies, there's some nonsense in there):

import xmltodict
import os

def get_list_of_all_files_in_sub(dirName):
    """Return the paths of every file under ``dirName``, including subdirectories.

    Entries are visited in the order ``os.listdir`` reports them; when an entry
    is a directory, the files found inside it are inserted at that position.

    Args:
        dirName: Path of the directory to search.

    Returns:
        A list of paths, each built by joining ``dirName`` with the names of
        the entries leading to the file. Directories themselves are not
        included in the result.

    Raises:
        OSError: Propagated from ``os.listdir`` (for example when ``dirName``
            does not exist or cannot be read).
    """
    allFiles = []

    for entry in os.listdir(dirName):
        fullPath = os.path.join(dirName, entry)
        if os.path.isdir(fullPath):
            # Extend in place rather than rebuilding the accumulated list with
            # `+` for every subdirectory encountered.
            allFiles.extend(get_list_of_all_files_in_sub(fullPath))
        else:
            allFiles.append(fullPath)
    return allFiles

def get_files_with_extension(path, file_extension=""):
    """Collect the files under ``path`` (searched recursively) with a given extension.

    A file matches when the text after the last "." in its full path equals
    ``file_extension`` exactly; the extension is given without the leading dot.

    Args:
        path: Directory to search, including all of its subfolders.
        file_extension: Extension to keep, e.g. ``"chatlog"``.

    Returns:
        A list of the matching file paths, in the order they were found.
    """
    return [
        candidate
        for candidate in get_list_of_all_files_in_sub(path)
        if candidate.split(".")[-1] == file_extension
    ]

# Accumulates one dict ('time', 'sender', 'message') per stored chat message,
# across every file processed below.
allmessages = []

# All files with the "chatlog" extension under the chats folder (searched recursively).
files = get_files_with_extension("/Users/Desktop/chats", "chatlog")

for file in files:
    print (file)
    with open(file) as fd:
        doc = xmltodict.parse(fd.read())

    messages = doc['chat']['message']

    # When "messages" has only one entry it does not come back as a list, so wrap
    # the single entry in a list to keep the loop below uniform.
    if type(messages) is not list:
        print ("NOT A LIST")
        messages_container = messages
        messages = [messages_container]

    for message in messages:
        # Only handle messages whose DIV contains a SPAN, which is used as the test
        # for "there is a real message in it". Messages whose DIV holds only other
        # tags (e.g. a hyperlink) are skipped by this check.
        # NOTE(review): an empty <div></div> presumably comes back from xmltodict as
        # None, in which case this `in` test would raise TypeError -- confirm.
        if 'span' in message["div"]:
            # Skip messages with an empty sender; they are left out of the output.
            if message["@sender"] != "":
                time =      (message["@time"])
                print (time)
                username =  (message["@sender"])
                print (username)

                # SET THE MESSAGE
                # If there are multiple spans within one message, 'span' comes in as a list.
                # So far that has just been things like warnings and offline notifications.
                # This seems to happen with AIM messages.
                if type(message["div"]['span']) is list:
                    print (message["div"]['span'])
                    for submessage in message["div"]['span']:
                        # NOTE(review): if each submessage is a dict (as in the xmltodict
                        # output for styled spans), iterating it yields its keys, so
                        # subsubmessage would be a string and the `is list` test below
                        # would never pass. text_message would then keep the value from
                        # an earlier message, or be undefined on the first one -- confirm.
                        for subsubmessage in submessage:
                            print ("---------------1----------------")
                            print (subsubmessage)
                            print ("---------------2----------------")
                            if type(subsubmessage) is list:
                                print (subsubmessage["#text"])
                                if "Offline IM sent" not in subsubmessage["#text"]:
                                    text_message =  (subsubmessage["#text"])
                                    print (text_message)
                else:
                    # Single span: take its text directly.
                    # NOTE(review): a span that itself contains nested tags may not
                    # expose all of its text under "#text" -- confirm.
                    text_message =  (message["div"]['span']["#text"])
                    print (text_message)

                # If the most recently stored message is from the same sender, merge it
                # into this one: prepend the earlier text (adding a period when it does
                # not already end with one) and remove the earlier entry. The merged
                # entry keeps the time of the current message.
                if len(allmessages) > 0:
                    if (username == allmessages[-1]["sender"]):
                        if  (allmessages[-1]["message"].endswith('.')):
                            text_message = allmessages[-1]["message"] + "  " + text_message
                        else:
                            text_message = allmessages[-1]["message"] + ".  " + text_message

                        del allmessages[-1]

                newmessage = {  'time'      : time, 
                                'sender'    : username, 
                                'message'   : text_message
                            }

                allmessages.append (newmessage)
                #print ("{} {}: {}".format(time, username, message))

# Print every collected message as "<time> <sender>: <message>".
# Iterates `allmessages`, the list the parsing loop above appends to.
for message in allmessages:
    print ("{} {}: {}".format(message['time'], message['sender'], message['message']))

I noticed that the way xmltodict processes the html tags, it turns into this on output:

OrderedDict([('span', OrderedDict([('@style', 'font-family: Arial; font-size: 10pt;'), ('#text', 'time is not of the essence')]))])
OrderedDict([('span', [OrderedDict([('@style', 'font-family: Arial; font-size: 10pt;'), ('span', OrderedDict([('@style', 'font-style: italic;'), ('#text', 'is')])), ('#text', 'it')]), OrderedDict([('@style', 'font-family: Helvetica; font-size: 12pt;')]), OrderedDict([('@style', 'font-family: Arial; font-size: 10pt;'), ('#text', 'the essence')])])])

As you can see, the #text with the formatting gets yanked out and separated. Any other ways or thoughts on how to do this that might work better?

Solution

You can use BeautifulSoup with `xml` as the parser type (this parser requires the lxml package to be installed):

from bs4 import BeautifulSoup
import datetime
from pprint import pprint

# Sample chat log 1: messages whose text is wrapped in (sometimes nested) styled <span> tags.
xml_1 = '''<?xml version="1.0" encoding="UTF-8" ?>
<chat xmlns="http://purl.org/net/ulf/ns/0.4-02" account="someusername" service="AIM">
<message sender="someusername" time="2008-07-27T18:02:34-0700"><div><span style="font-family: Arial; font-size: 10pt;">time is not of the essence</span></div></message>
<message sender="someusername" time="2008-07-27T18:02:43-0700"><div><span style="font-family: Arial; font-size: 10pt;">it <span style="font-style: italic;">is</span></span><span style="font-family: Helvetica; font-size: 12pt;"> </span><span style="font-family: Arial; font-size: 10pt;">the essence</span></div></message>
<message sender="anotherusername" time="2008-07-27T18:03:49-0700"><div><span style="color: #000000; font-family: Helvetica; font-size: 12pt;">yo</span></div></message>
<message sender="anotherusername" time="2008-07-27T18:03:51-0700"><div><span style="color: #000000; font-family: Helvetica; font-size: 12pt;">whats up?</span></div></message></chat>'''

# Sample chat log 2: a hyperlink message, a plain styled message, and a message with an empty <div>.
xml_2 = '''<?xml version="1.0" encoding="UTF-8" ?>
    <chat xmlns="http://purl.org/net/ulf/ns/0.4-02" account="someusername" service="AIM">
    <message sender="someusername" time="2009-09-26T05:54:23-0700"><div><a href="http://www.youtube.com/watch?v=LqbJx4TFFEE&amp;feature=related" style="color: #000000; font-family: Helvetica; font-size: 12pt;">http://www.youtube.com/watch?v=LqbJx4TFFEE&amp;feature=related</a></div></message>
    <message sender="someusername" time="2009-09-27T16:12:29-0700"><div><span style="color: #000000; font-family: Helvetica; font-size: 12pt;">2nd take, with the bonus stuff I think</span></div></message>
    <message sender="someusername" time="2009-09-27T17:18:52-0700"><div></div></message></chat>'''

def parse_xml(xml_string):
    """Extract the messages of a chat log as plain-text rows.

    The document is parsed with BeautifulSoup's ``xml`` parser, and every
    ``message`` element that has both a ``sender`` and a ``time`` attribute
    produces one row.

    Args:
        xml_string: The chat log XML as a string.

    Returns:
        A list of ``(account, sender, timestamp, text)`` tuples, where
        ``account`` is read from the enclosing ``chat`` element, ``timestamp``
        is a ``datetime`` parsed from the ``time`` attribute, and ``text`` is
        the element's text with surrounding whitespace stripped.
    """
    timestamp_format = "%Y-%m-%dT%H:%M:%S%z"  # ISO 8601 time format
    document = BeautifulSoup(xml_string, 'xml')
    return [
        (
            node.find_parent('chat')['account'],
            node['sender'],
            datetime.datetime.strptime(node['time'], timestamp_format),
            node.text.strip(),
        )
        for node in document.select('message[sender][time]')
    ]

# Parse both sample documents and pretty-print the extracted rows.
pprint(parse_xml(xml_1))
pprint(parse_xml(xml_2))

Prints:

[('someusername',
  'someusername',
  datetime.datetime(2008, 7, 27, 18, 2, 34, tzinfo=datetime.timezone(datetime.timedelta(-1, 61200))),
  'time is not of the essence'),
 ('someusername',
  'someusername',
  datetime.datetime(2008, 7, 27, 18, 2, 43, tzinfo=datetime.timezone(datetime.timedelta(-1, 61200))),
  'it is the essence'),
 ('someusername',
  'anotherusername',
  datetime.datetime(2008, 7, 27, 18, 3, 49, tzinfo=datetime.timezone(datetime.timedelta(-1, 61200))),
  'yo'),
 ('someusername',
  'anotherusername',
  datetime.datetime(2008, 7, 27, 18, 3, 51, tzinfo=datetime.timezone(datetime.timedelta(-1, 61200))),
  'whats up?')]
[('someusername',
  'someusername',
  datetime.datetime(2009, 9, 26, 5, 54, 23, tzinfo=datetime.timezone(datetime.timedelta(-1, 61200))),
  'http://www.youtube.com/watch?v=LqbJx4TFFEE&feature=related'),
 ('someusername',
  'someusername',
  datetime.datetime(2009, 9, 27, 16, 12, 29, tzinfo=datetime.timezone(datetime.timedelta(-1, 61200))),
  '2nd take, with the bonus stuff I think'),
 ('someusername',
  'someusername',
  datetime.datetime(2009, 9, 27, 17, 18, 52, tzinfo=datetime.timezone(datetime.timedelta(-1, 61200))),
  '')]