Search code examples
pythontkinterhighlight

Highlight differences between two xml files in a Tkinter textbox


I tried all kinds of logic and methods and even googled a lot, but yet not able to think of any satisfactory answer for the question I have. I have wrote a program as shown below to highlight specific xml code where I am facing some problem. Sorry for making this post bit long. I only wanted to clearly explain my problem.

EDIT: For running below given program you will need two xml files which are here: sample1 and sample2. Save this files and in below code edit the location where you want to save your files in C:/Users/editThisLocation/Desktop/sample1.xml

from lxml import etree
from collections import defaultdict
from collections import OrderedDict
from distutils.filelist import findall
from lxml._elementpath import findtext

from Tkinter import * 
import Tkinter as tk
import ttk

root = Tk()

class CustomText(tk.Text):

    def __init__(self, *args, **kwargs):
        tk.Text.__init__(self, *args, **kwargs)


    def highlight_pattern(self, pattern, tag, start, end,
                          regexp=True):

        start = self.index(start)
        end = self.index(end)
        self.mark_set("matchStart", start)
        self.mark_set("matchEnd", start)
        self.mark_set("searchLimit", end)

        count = tk.IntVar()
        while True:
            index = self.search(pattern, "matchEnd","searchLimit",
                                count=count, regexp=regexp)
            if index == "": break
            self.mark_set("matchStart", index)
            self.mark_set("matchEnd", "%s+%sc" % (index, count.get()))
            self.tag_add(tag, "matchStart", "matchEnd")

    def Remove_pattern(self, pattern, tag, start="1.0", end="end",
                          regexp=True):

        start = self.index(start)
        end = self.index(end)
        self.mark_set("matchStart", start)
        self.mark_set("matchEnd", start)
        self.mark_set("searchLimit", end)

        count = tk.IntVar()
        while True:
            index = self.search(pattern, "matchEnd","searchLimit",
                                count=count, regexp=regexp)
            if index == "": break
            self.mark_set("matchStart", index)
            self.mark_set("matchEnd", "%s+%sc" % (index, count.get()))
            self.tag_remove(tag, start, end)



recovering_parser = etree.XMLParser(recover=True)


sample1File = open('C:/Users/editThisLocation/Desktop/sample1.xml', 'r')
contents_sample1 = sample1File.read()

sample2File = open('C:/Users/editThisLocation/Desktop/sample2.xml', 'r')
contents_sample2 = sample2File.read()


frame1 = Frame(width=768, height=25, bg="#000000", colormap="new")
frame1.pack()
Label(frame1, text="sample 1 below - scroll to see more").pack()

textbox = CustomText(root)
textbox.insert(END,contents_sample1)
textbox.pack(expand=1, fill=BOTH)

frame2 = Frame(width=768, height=25, bg="#000000", colormap="new")
frame2.pack()
Label(frame2, text="sample 2 below - scroll to see more").pack()


textbox1 = CustomText(root)
textbox1.insert(END,contents_sample2)
textbox1.pack(expand=1, fill=BOTH)

sample1 = etree.parse("C:/Users/editThisLocation/Desktop/sample1.xml", parser=recovering_parser).getroot()
sample2 = etree.parse("C:/Users/editThisLocation/Desktop/sample2.xml", parser=recovering_parser).getroot()

ToStringsample1 = etree.tostring(sample1)
sample1String = etree.fromstring(ToStringsample1, parser=recovering_parser)

ToStringsample2 = etree.tostring(sample2)
sample2String = etree.fromstring(ToStringsample2, parser=recovering_parser)

timesample1 = sample1String.findall('{http://www.example.org/eHorizon}time')
timesample2 =  sample2String.findall('{http://www.example.org/eHorizon}time')

for i,j in zip(timesample1,timesample2):       

    for k,l in zip(i.findall("{http://www.example.org/eHorizon}feature"), j.findall("{http://www.example.org/eHorizon}feature")):

        if [k.attrib.get('color'), k.attrib.get('type')] != [l.attrib.get('color'), l.attrib.get('type')]:

            faultyLine = [k.attrib.get('color'), k.attrib.get('type'), k.text]


def high(event):

    textbox.tag_configure("yellow", background="yellow")
    limit_1 = '<p1:time nTimestamp="{0}">'.format(5)     #limit my search between timestamp 5 and timestamp 6 
    limit_2 = '<p1:time nTimestamp="{0}">'.format((5+1)) # timestamp 6

    highlightString = '<p1:feature color="{0}" type="{1}">{2}</p1:feature>'.format(faultyLine[0],faultyLine[1],faultyLine[2]) #string to be highlighted

    textbox.highlight_pattern(limit_1, "yellow", start=textbox.search(limit_1, '1.0', stopindex=END), end=textbox.search(limit_2, '1.0', stopindex=END))
    textbox.highlight_pattern(highlightString, "yellow", start=textbox.search(limit_1, '1.0', stopindex=END), end=textbox.search(limit_2, '1.0', stopindex=END))


button = 'press here to highlight error line' 
c = ttk.Label(root, text=button)
c.bind("<Button-1>",high)
c.pack()  

root.mainloop()

What I want

If you run above code, it would present an output given below:

my output

As you can see in the image, I only intend to highlight code marked with green tick. Some of you might think of limiting the starting and ending index to highlight that pattern. However, if you see in my program I am already making use of starting and ending indexes for limiting my output to only nTimestamp="5" and for that I am using limit_1 and limit_2 variables.

So in this type of a data how to correctly highlight one pattern out of many inside individual nTimestamp?

EDIT: Here I specifically want to highlight 3rd item in nTimestamp="5" because this item is not present in sample2.xml as you can see in two xml files and when program runs it also differentiates this. The only problem is to highlight correct item which is 3rd in my case.

I am using highlighting class from Bryan Oakley's code here

EDIT Recent

In context to what kobejohn asked below in comments, the target file won't ever be empty. There are always chances that target file may have extra or missing elements. Finally, my current intention is to highlight only deep elements which are different or missing and timestamps in which they are located. However, highlighting of timestamps is done correctly but the issue to highlight deep elements like explained above is still an issue. Thank you kobejohn for clarifying this.

NOTE:

One method which I know and you might suggest that works correctly is to extract the index of green color ticked pattern and simply run highlight tag over it, but this approach is very hard-coded and in large data where you have to deal with lots of variations it is completely ineffective. I am searching for another better option.


Solution

  • This solution works by performing a simplified diff between base.xml and test.xml based on the description you provided. The diff result is a 3rd XML tree that combines the original trees. The output is the diff with color-coded highlighting for the lines that don't match between the files.

    I hope you can use this or adapt it to what you need.

    enter image description here

    Copy-Paste Script

    import copy
    from lxml import etree
    import Tkinter as tk
    
    
    # assumption: the root element of both trees is the same
    # note: missing subtrees will only have the parent element highlighted
    
    
    def element_content_equal(e1, e2):
        # starting point here: http://stackoverflow.com/a/24349916/377366
        try:
            if e1.tag != e1.tag:
                return False
            elif e1.text != e2.text:
                return False
            elif e1.tail != e2.tail:
                return False
            elif e1.attrib != e2.attrib:
                return False
        except AttributeError:
            # e.g. None is passed in for an element
            return False
        return True
    
    
    def element_is_in_sequence(element, sequence):
        for e in sequence:
            if element_content_equal(e, element):
                return True
        return False
    
    
    def copy_element_without_children(element):
        e_copy = etree.Element(element.tag, attrib=element.attrib, nsmap=element.nsmap)
        e_copy.text = element.text
        e_copy.tail = element.tail
        return e_copy
    
    
    # start at the root of both xml trees
    parser = etree.XMLParser(recover=True, remove_blank_text=True)
    base_root = etree.parse('base.xml', parser=parser).getroot()
    test_root = etree.parse('test.xml', parser=parser).getroot()
    # each element from the original xml trees will be placed into a merge tree
    merge_root = copy_element_without_children(base_root)
    
    
    # additionally each merge tree element will be tagged with its source
    DIFF_ATTRIB = 'diff'
    FROM_BASE_ONLY = 'base'
    FROM_TEST_ONLY = 'test'
    
    # process the pair of trees, one set of parents at a time
    parent_stack = [(base_root, test_root, merge_root)]
    while parent_stack:
        base_parent, test_parent, merge_parent = parent_stack.pop()
        base_children = base_parent.getchildren()
        test_children = test_parent.getchildren()
    
        # compare children and transfer to merge tree
        base_children_iter = iter(base_children)
        test_children_iter = iter(test_children)
        base_child = next(base_children_iter, None)
        test_child = next(test_children_iter, None)
        while (base_child is not None) or (test_child is not None):
            # first handle the case of a unique base child
            if (base_child is not None) and (not element_is_in_sequence(base_child, test_children)):
                # base_child is unique: deep copy with base only tag
                merge_child = copy.deepcopy(base_child)
                merge_child.attrib[DIFF_ATTRIB] = FROM_BASE_ONLY
                merge_parent.append(merge_child)
                # this unique child has already been fully copied to the merge tree so it doesn't go on the stack
                # only move the base child since test child hasn't been handled yet
                base_child = next(base_children_iter, None)
            elif (test_child is not None) and (not element_is_in_sequence(test_child, base_children)):
                # test_child is unique: deep copy with base only tag
                merge_child = copy.deepcopy(test_child)
                merge_child.attrib[DIFF_ATTRIB] = FROM_TEST_ONLY
                merge_parent.append(merge_child)
                # this unique child has already been fully copied to the merge tree so it doesn't go on the stack
                # only move test child since base child hasn't been handled yet
                test_child = next(test_children_iter, None)
            elif element_content_equal(base_child, test_child):
                # both trees share the same element: shallow copy either child with shared tag
                merge_child = copy_element_without_children(base_child)
                merge_parent.append(merge_child)
                # put pair of children on stack as parents to be tested since their children may differ
                parent_stack.append((base_child, test_child, merge_child))
                # move on to next children in both trees since this was a shared element
                base_child = next(base_children_iter, None)
                test_child = next(test_children_iter, None)
            else:
                raise RuntimeError  # there is something wrong - element should be unique or shared.
    
    # display merge_tree with highlighting to indicate source of each line
    #   no highlight: common element in both trees
    #   green: line that exists only in test tree (i.e. additional)
    #   red: line that exists only in the base tree (i.e. missing)
    root = tk.Tk()
    textbox = tk.Text(root)
    textbox.pack(expand=1, fill=tk.BOTH)
    textbox.tag_config(FROM_BASE_ONLY, background='#ff5555')
    textbox.tag_config(FROM_TEST_ONLY, background='#55ff55')
    
    # find diff lines to highlight within merge_tree string that includes kludge attributes
    merge_tree_string = etree.tostring(merge_root, pretty_print=True)
    diffs_by_line = []
    for line, line_text in enumerate(merge_tree_string.split('\n')):
        for diff_type in (FROM_BASE_ONLY, FROM_TEST_ONLY):
            if diff_type in line_text:
                diffs_by_line.append((line+1, diff_type))
    
    # remove kludge attributes
    for element in merge_root.iter():
        try:
            del(element.attrib[DIFF_ATTRIB])
        except KeyError:
            pass
    merge_tree_string = etree.tostring(merge_root, pretty_print=True)
    
    # highlight final lines
    textbox.insert(tk.END, merge_tree_string)
    for line, diff_type in diffs_by_line:
        textbox.tag_add(diff_type, '{}.0'.format(line), '{}.0'.format(int(line)+1))
    root.mainloop()
    

    Inputs:

    Please note that I cleaned up the xml because I was getting inconsistent behavior with the original XML. The original was basically using back slashes instead of forward slashes and also had false closing slashes on opening tags.


    base.xml (in the same location as this script)

    <?xml version="1.0" encoding="UTF-8" standalone="no" ?>
    <p1:sample1 xmlns:p1="http://www.example.org/eHorizon">
       <p1:time nTimestamp="5">
          <p1:location hours = "1" path = '1'>
             <p1:feature color="6" type="a">560</p1:feature>
             <p1:feature color="2" type="a">564</p1:feature>
             <p1:feature color="3" type="b">570</p1:feature>
             <p1:feature color="4" type="c">570</p1:feature>
          </p1:location>
          <p1:location hours = "5" path = '1'>
             <p1:feature color="6" type="a">560</p1:feature>
             <p1:feature color="7" type="b">570</p1:feature>
             <p1:feature color="8" type="c">580</p1:feature>
          </p1:location>
          <p1:location hours = "5" path = '1'>
             <p1:feature color="6" type="a">560</p1:feature>
          </p1:location>
       </p1:time>
       <p1:time nTimestamp="6">
          <p1:location hours = "1" path = '1'>
             <p1:feature color="2" type="a">564</p1:feature>
             <p1:feature color="3" type="b">570</p1:feature>
             <p1:feature color="4" type="c">570</p1:feature>
          </p1:location>
          <p1:location hours = "5" path = '1'>
             <p1:feature color="6" type="a">560</p1:feature>
             <p1:feature color="9" type="b">590</p1:feature>
             <p1:feature color="10" type="c">600</p1:feature>
          </p1:location>
          <p1:location hours = "5" path = '1'>
             <p1:feature color="6" type="a">560</p1:feature>
             <p1:feature color="7" type="b">570</p1:feature>
             <p1:feature color="8" type="c">580</p1:feature>
          </p1:location>
       </p1:time>
    </p1:sample1>
    

    test.xml (in the same location as this script)

    <?xml version="1.0" encoding="UTF-8" standalone="no" ?>
    <p1:sample1 xmlns:p1="http://www.example.org/eHorizon">
       <p1:time nTimestamp="5">
          <p1:location hours = "1" path = '1'>
             <p1:feature color="6" type="a">560</p1:feature>
             <p1:feature color="2" type="a">564</p1:feature>
             <p1:feature color="3" type="b">570</p1:feature>
             <p1:feature color="4" type="c">570</p1:feature>
          </p1:location>
          <p1:location hours = "5" path = '1'>
             <p1:feature color="6" type="a">560</p1:feature>
             <p1:feature color="7" type="b">570</p1:feature>
             <p1:feature color="8" type="c">580</p1:feature>
          </p1:location>
          <p1:location hours = "5" path = '1'>
             <p1:feature color="9" type="b">1111</p1:feature>
             <p1:feature color="10" type="c">2222</p1:feature>
          </p1:location>
       </p1:time>
       <p1:time nTimestamp="6">
          <p1:location hours = "1" path = '1'>
             <p1:feature color="2" type="a">564</p1:feature>
             <p1:feature color="3" type="b">570</p1:feature>
             <p1:feature color="4" type="c">570</p1:feature>
          </p1:location>
          <p1:location hours = "5" path = '1'>
             <p1:feature color="6" type="a">560</p1:feature>
             <p1:feature color="9" type="b">590</p1:feature>
             <p1:feature color="10" type="c">600</p1:feature>
          </p1:location>
          <p1:location hours = "5" path = '1'>
             <p1:feature color="6" type="a">560</p1:feature>
             <p1:feature color="7" type="b">570</p1:feature>
             <p1:feature color="8" type="c">580</p1:feature>
          </p1:location>
       </p1:time>
    </p1:sample1>