python unit-testing diff python-unittest difference

How to wrap correctly the unit testing diff?

I am running python 3.6.4, but some times the unit testing diff does not work as expected. For example, on the following there is a forced unit test error versus expected where the line wrapping behavior is not desired.

import unittest

class TestSemanticRules(unittest.TestCase):
    maxDiff = None

    def test_badWrapping(self):
        self.assertEqual(
            "1. Duplicated target language name defined in your grammar on: [@-1,63:87='Abstract Machine Language'<__ANON_3>,3:19]\n"
            "2. Duplicated master scope name defined in your grammar on: [@-1,138:147='source.sma'<__ANON_3>,5:20]"
            ,
            "1. Duplicated target language name defined in your grammar on: free_input_string\n"
            "  text_chunk_end  Abstract Machine Language"
            "\n"
            "2. Duplicated master scope name defined in your grammar on: free_input_string\n"
            "  text_chunk_end  source.sma"
        )

unittest.main(failfast=True)

Running it with python3 test.py you see the first error diff line does not got wrapped:

An expected result would be:

I tried searching for an alternative diff library, then I tried replacing the unittest diff by a custom diff library as the built-in difflib, but the diff results where the same. So, I assume the unittest package is using the difflib.

import unittest
import difflib

class TestSemanticRules(unittest.TestCase):
    maxDiff = None

    def myAssertEquals(self, expected, actual):
        expected = expected.splitlines( 1 )
        actual = actual.splitlines( 1 )

        if expected != actual:
            diff = difflib.context_diff( expected, actual, fromfile='expected input', tofile='actual output', lineterm='\n' )
            self.fail( '\n' + ''.join( diff ) )

    def test_badWrapping(self):
        self.myAssertEquals(
            "1. Duplicated target language name defined in your grammar on: [@-1,63:87='Abstract Machine Language'<__ANON_3>,3:19]\n"
            "2. Duplicated master scope name defined in your grammar on: [@-1,138:147='source.sma'<__ANON_3>,5:20]"
            ,
            "1. Duplicated target language name defined in your grammar on: free_input_string\n"
            "  text_chunk_end  Abstract Machine Language"
            "\n"
            "2. Duplicated master scope name defined in your grammar on: free_input_string\n"
            "  text_chunk_end  source.sma"
        )

Can the difflib built-in library used by the unittest package to be configured, so, this behavior does not happen? Or there is an reliable alternative for the difflib package?

Solution

Searching for alternatives to difflib I got 3 results:

https://github.com/Carreau/difflib2.py (4 years with no updates)
https://github.com/google/diff-match-patch
https://github.com/seperman/deepdiff

Then, using diff-match-patch I manage to build the following code:

import re
import unittest

import textwrap
import diff_match_patch

class DiffMatchPatch(diff_match_patch.diff_match_patch):

    def diff_prettyText(self, diffs):
        """Convert a diff array into a pretty Text report.
        Args:
          diffs: Array of diff tuples.
        Returns:
          Text representation.
        """
        results_diff = []
        cut_next_new_line = [False]
        # print('\ndiffs:\n%s\n' % diffs)

        operations = (self.DIFF_INSERT, self.DIFF_DELETE)

        def parse(sign):
            # print('new1:', text.encode( 'ascii' ))

            if text:
                new = text

            else:
                return ''

            new = textwrap.indent( "%s" % new, sign, lambda line: True )

            # force the diff change to show up on a new line for highlighting
            if len(results_diff) > 0:
                new = '\n' + new

            if new[-1] == '\n':

                if op == self.DIFF_INSERT and next_text and new[-1] == '\n' and next_text[0] == '\n':
                    cut_next_new_line[0] = True;

                    # Avoids a double plus sign showing up when the diff has the element (1, '\n')
                    if len(text) > 1: new = new + '%s\n' % sign

            elif next_op not in operations and next_text and next_text[0] != '\n':
                new = new + '\n'

            # print('new2:', new.encode( 'ascii' ))
            return new

        for index in range(len(diffs)):
            op, text = diffs[index]
            if index < len(diffs) - 1: 
                next_op, next_text = diffs[index+1]
            else:
                next_op, next_text = (0, "")

            if op == self.DIFF_INSERT:
                results_diff.append( parse( "+ " ) )

            elif op == self.DIFF_DELETE:
                results_diff.append( parse( "- " ) )

            elif op == self.DIFF_EQUAL:
                # print('new3:', text.encode( 'ascii' ))
                text = textwrap.indent(text, "  ")

                if cut_next_new_line[0]:
                    cut_next_new_line[0] = False
                    text = text[1:]

                results_diff.append(text)
                # print('new4:', text.encode( 'ascii' ))

        return "".join(results_diff)

    def diff_linesToWords(self, text1, text2, delimiter=re.compile('\n')):
        """
            Split two texts into an array of strings.  Reduce the texts to a string
            of hashes where each Unicode character represents one line.

            95% of this function code is copied from `diff_linesToChars` on:
                https://github.com/google/diff-match-patch/blob/895a9512bbcee0ac5a8ffcee36062c8a79f5dcda/python3/diff_match_patch.py#L381

            Copyright 2018 The diff-match-patch Authors.
            https://github.com/google/diff-match-patch
            Licensed under the Apache License, Version 2.0 (the "License");
            you may not use this file except in compliance with the License.
            You may obtain a copy of the License at
              http://www.apache.org/licenses/LICENSE-2.0

            Args:
                text1: First string.
                text2: Second string.
                delimiter: a re.compile() expression for the word delimiter type

            Returns:
                Three element tuple, containing the encoded text1, the encoded text2 and
                the array of unique strings.  The zeroth element of the array of unique
                strings is intentionally blank.
        """
        lineArray = []  # e.g. lineArray[4] == "Hello\n"
        lineHash = {}   # e.g. lineHash["Hello\n"] == 4

        # "\x00" is a valid character, but various debuggers don't like it.
        # So we'll insert a junk entry to avoid generating a null character.
        lineArray.append('')

        def diff_linesToCharsMunge(text):
            """Split a text into an array of strings.  Reduce the texts to a string
            of hashes where each Unicode character represents one line.
            Modifies linearray and linehash through being a closure.
            Args:
                text: String to encode.
            Returns:
                Encoded string.
            """
            chars = []
            # Walk the text, pulling out a substring for each line.
            # text.split('\n') would would temporarily double our memory footprint.
            # Modifying text would create many large strings to garbage collect.
            lineStart = 0
            lineEnd = -1
            while lineEnd < len(text) - 1:
                lineEnd = delimiter.search(text, lineStart)

                if lineEnd:
                    lineEnd = lineEnd.start()

                else:
                    lineEnd = len(text) - 1

                line = text[lineStart:lineEnd + 1]

                if line in lineHash:
                    chars.append(chr(lineHash[line]))
                else:
                    if len(lineArray) == maxLines:
                        # Bail out at 1114111 because chr(1114112) throws.
                        line = text[lineStart:]
                        lineEnd = len(text)
                    lineArray.append(line)
                    lineHash[line] = len(lineArray) - 1
                    chars.append(chr(len(lineArray) - 1))
                lineStart = lineEnd + 1
            return "".join(chars)

        # Allocate 2/3rds of the space for text1, the rest for text2.
        maxLines = 666666
        chars1 = diff_linesToCharsMunge(text1)
        maxLines = 1114111
        chars2 = diff_linesToCharsMunge(text2)
        return (chars1, chars2, lineArray)

class TestRules(unittest.TestCase):
    ## Set the maximum size of the assertion error message when Unit Test fail
    maxDiff = None

    ## Whether `characters diff=0`, `words diff=1` or `lines diff=2` will be used
    diffMode = 1

    def __init__(self, *args, **kwargs):
        diffMode = kwargs.pop('diffMode', -1)
        if diffMode > -1: self.diffMode = diffMode

        super(TestRules, self).__init__(*args, **kwargs)

    def setUp(self):
        if diff_match_patch: self.addTypeEqualityFunc(str, self.myAssertEqual)

    def myAssertEqual(self, expected, actual, msg=""):
        """
            How to wrap correctly the unit testing diff?
            https://stackoverflow.com/questions/52682351/how-to-wrap-correctly-the-unit-testing-diff
        """
        # print( '\n\nexpected\n%s' % expected )
        # print( '\n\nactual\n%s' % actual )

        if expected != actual:
            diff_match = DiffMatchPatch()

            if self.diffMode == 0:
                diffs = diff_match.diff_main(expected, actual)

            else:
                diff_struct = diff_match.diff_linesToWords(expected, actual,
                        re.compile(r'\b') if self.diffMode == 1 else re.compile(r'\n') )

                lineText1 = diff_struct[0] # .chars1;
                lineText2 = diff_struct[1] # .chars2;
                lineArray = diff_struct[2] # .lineArray;

                diffs = diff_match.diff_main(lineText1, lineText2, False);
                diff_match.diff_charsToLines(diffs, lineArray);
                diff_match.diff_cleanupSemantic(diffs)

            if msg:
                msg += '\n'

            else:
                msg = "The strings does not match...\n"

            self.fail( msg + diff_match.diff_prettyText(diffs) )

    def test_characthersDiffModeExample1(self):
        self.diffMode = 0
        expected = "1. Duplicated target language name defined in your grammar on: [@-1,63:87='Abstract Machine Language'<__ANON_3>,3:19]\n" \
                   "2. Duplicated master scope name defined in your grammar on: [@-1,138:147='source.sma'<__ANON_3>,5:20]"

        actual = "1. Duplicated target language name defined in your grammar on: free_input_string\n" \
                 "  text_chunk_end  Abstract Machine Language\n" \
                 "\n" \
                 "2. Duplicated master scope name defined in your grammar on: free_input_string\n" \
                 "  text_chunk_end  source.sma" \

        with self.assertRaises( AssertionError ) as error:
            self.myAssertEqual( expected, actual )

        print( '\nerror.exception\n%s\n' % str(error.exception) )
        self.assertEqual(
            "The strings does not match...\n"
            "  1. Duplicated target language name defined in your grammar on: \n"
            "- [@-1,63:87='\n"
            "+ free_input_string\n"
            "+   text_chunk_end  \n"
            "  Abstract Machine Language\n"
            "- '<__ANON_3>,3:19]\n"
            "+ \n"
            "  2. Duplicated master scope name defined in your grammar on: \n"
            "- [@-1,138:147='\n"
            "+ free_input_string\n"
            "+   text_chunk_end  \n"
            "  source.sma\n"
            "- '<__ANON_3>,5:20]"
            , str(error.exception) )

    def test_wordsDiffModeExample1(self):
        self.diffMode = 1
        expected = "1. Duplicated target language name defined in your grammar on: [@-1,63:87='Abstract Machine Language'<__ANON_3>,3:19]\n" \
                   "2. Duplicated master scope name defined in your grammar on: [@-1,138:147='source.sma'<__ANON_3>,5:20]"

        actual = "1. Duplicated target language name defined in your grammar on: free_input_string\n" \
                 "  text_chunk_end  Abstract Machine Language\n" \
                 "\n" \
                 "2. Duplicated master scope name defined in your grammar on: free_input_string\n" \
                 "  text_chunk_end  source.sma" \

        with self.assertRaises( AssertionError ) as error:
            self.myAssertEqual( expected, actual )

        print( '\nerror.exception\n%s\n' % str(error.exception) )
        self.assertEqual(
            "The strings does not match...\n"
            "  1. Duplicated target language name defined in your grammar on: \n"
            "- [@-1,63:87='Abstract Machine Language'<__ANON_3>,3:19]\n"
            "+ free_input_string\n"
            "+   text_chunk_end  Abstract Machine Language\n"
            "+ \n"
            "  2. Duplicated master scope name defined in your grammar on: \n"
            "- [@-1,138:147='source.sma'<__ANON_3>,5:20]\n"
            "+ free_input_string\n"
            "+   text_chunk_end  source.sma"
            , str(error.exception) )

    def test_linesDiffModeExample1(self):
        self.diffMode = 2
        expected = "1. Duplicated target language name defined in your grammar on: [@-1,63:87='Abstract Machine Language'<__ANON_3>,3:19]\n" \
                   "2. Duplicated master scope name defined in your grammar on: [@-1,138:147='source.sma'<__ANON_3>,5:20]"

        actual = "1. Duplicated target language name defined in your grammar on: free_input_string\n" \
                 "  text_chunk_end  Abstract Machine Language\n" \
                 "\n" \
                 "2. Duplicated master scope name defined in your grammar on: free_input_string\n" \
                 "  text_chunk_end  source.sma" \

        with self.assertRaises( AssertionError ) as error:
            self.myAssertEqual( expected, actual )

        print( '\nerror.exception\n%s\n' % str(error.exception) )
        self.assertEqual(
            "The strings does not match...\n"
            "- 1. Duplicated target language name defined in your grammar on: [@-1,63:87='Abstract Machine Language'<__ANON_3>,3:19]\n"
            "- 2. Duplicated master scope name defined in your grammar on: [@-1,138:147='source.sma'<__ANON_3>,5:20]\n"
            "+ 1. Duplicated target language name defined in your grammar on: free_input_string\n"
            "+   text_chunk_end  Abstract Machine Language\n"
            "+ \n"
            "+ 2. Duplicated master scope name defined in your grammar on: free_input_string\n"
            "+   text_chunk_end  source.sma"
            , str(error.exception) )

unittest.main(failfast=True, verbosity=2)

Using `diffMode=0` as characters

Using `diffMode=1` as words

Using `diffMode=2` as lines

Which seems already to be better than the built-in behavior from the unittest module. How could this new diff_prettyText() still be improved?

References

How to wrap correctly the unit testing diff?

Using diffMode=0 as characters

Using diffMode=1 as words

Using diffMode=2 as lines

Using `diffMode=0` as characters

Using `diffMode=1` as words

Using `diffMode=2` as lines