Search code examples
string, python-3.x, difflib, sequencematcher

python3, difflib SequenceMatcher


The following code takes in two strings, compares them, and should return both their identical parts and their differences, with the gaps filled by spaces (maintaining the length of the longest string).

The commented-out lines in the code are the four strings that should be returned.

from difflib import SequenceMatcher


def align_differences(first, second):
    """Split two strings into aligned common and differing parts.

    The strings are compared with ``SequenceMatcher`` and walked opcode by
    opcode.  Every stretch is emitted at the same width in all outputs, so
    the returned strings line up column by column:

    * a matching stretch is copied into the "common" output and blanked out
      with spaces in the "difference" outputs;
    * a differing stretch is blanked out in the "common" output and copied
      into the "difference" outputs, padded on the right with spaces to the
      width of the longer of the two sides.

    Args:
        first: The first string to compare.
        second: The second string to compare.

    Returns:
        A 4-tuple ``(common_first, common_second, only_first, only_second)``.
        All four strings have the same length.  ``common_first`` and
        ``common_second`` are identical by construction; both are returned
        because the caller asks for one aligned string per input.
    """
    common = []
    only_first = []
    only_second = []
    opcodes = SequenceMatcher(None, first, second).get_opcodes()
    for tag, a_lo, a_hi, b_lo, b_hi in opcodes:
        piece_a = first[a_lo:a_hi]
        piece_b = second[b_lo:b_hi]
        # For 'equal' both pieces have the same length; otherwise reserve
        # room for the longer side so later columns stay aligned.
        width = max(len(piece_a), len(piece_b))
        if tag == 'equal':
            common.append(piece_a)
            only_first.append(' ' * width)
            only_second.append(' ' * width)
        else:
            common.append(' ' * width)
            only_first.append(piece_a.ljust(width))
            only_second.append(piece_b.ljust(width))
    shared = ''.join(common)
    return shared, shared, ''.join(only_first), ''.join(only_second)


t1 = 'betty:  backstreetvboysareback"give.jpg"LAlarrygarryhannyhref="ang"_self'

t2 = 'bettyv:  backstreetvboysareback"lifeislike"LAlarrygarryhannyhref="in.php"_self'

# Results produced for the two sample strings above:
#t11 = 'betty :  backstreetvboysareback" i e      "LAlarrygarryhannyhref=" n    "_self'
#t12 = 'betty :  backstreetvboysareback" i e      "LAlarrygarryhannyhref=" n    "_self'
#o1  = '                                g v .jpg                          a g         '
#o2  = '     v                          l f islike                        i .php      '

matcher = SequenceMatcher(None, t1, t2)

# Each difference stored as [text, start, end] for each string separately,
# one entry per non-equal opcode; entry i of bla1 faces entry i of bla2.
# Kept for inspection only: the aligned output below does not depend on them.
bla1 = [[t1[a_lo:a_hi], a_lo, a_hi]
        for tag, a_lo, a_hi, _, _ in matcher.get_opcodes() if tag != 'equal']
bla2 = [[t2[b_lo:b_hi], b_lo, b_hi]
        for tag, _, _, b_lo, b_hi in matcher.get_opcodes() if tag != 'equal']

# t11/t12: the common text of t1/t2 with differences blanked out.
# o1/o2: the text found only in t1/t2, placed at the matching columns.
t11, t12, o1, o2 = align_differences(t1, t2)

After arranging the blocks into an organised format (bla1, bla2), each difference is stored as a string with its start and end position, e.g. ['v', 33, 34], for each string separately. After this, I attempt to insert spaces to match the required length and spacing, and this is where the code starts to break.

Please if someone could take a look!


Solution

  • I have been working on resolving this, and since no one has posted a response I will post my progress and solution. The following code is a work in progress: it worked well for variations with small offsets, but began to break with larger differences, specifically in maintaining the spacing (offset) needed to line the two strings up.

    # Work-in-progress script: compares t1 and t2 with SequenceMatcher, records
    # the stretches that lie between matching blocks, and prints copies of both
    # strings in which every such stretch is replaced by a run of underscores.
    from difflib import SequenceMatcher
    import pdb  # only referenced by the commented-out set_trace() call below
    
    
    t1 = 'betty:  backstreetvboysareback"give.jpg"LAlarrygarryhannyhref="ang"_self'
    
    t2 = 'betty:  backstreetvboysareback"lol.jpg"LAlarrygarryhannyhref="ang"_self'
    
    # Alternative inputs used while experimenting:
    #t2 = 'bettyv:  backstreetvboysareback"lifeislike"LAlarrygarryhannyhref="in.php"_selff'
    
    #t2 = 'LA'
    #t2 = 'c give.'
    #t2 = 'give.'
    
    
    
    
    # Target output written out by hand for the 'bettyv...' input pair:
    #t1 = 'betty :  backstreetvboysareback" i e      "LAlarrygarryhannyhref=" n    "_self'
    #t2 = 'betty :  backstreetvboysareback" i e      "LAlarrygarryhannyhref=" n    "_self'
    
    #o1 = '                                g v .jpg                          g           '
    #o2 = '     v                          l f islike                        i .php      '
    
    
    
    matcher = SequenceMatcher(None, t1, t2)
    blocks = matcher.get_matching_blocks()
    
    #print(len(blocks))
    
    # bla1 / bla2 collect the non-matching stretches ("gaps") of t1 / t2.
    # Both lists are always appended to together, so entry i of bla1 faces
    # entry i of bla2.
    bla1 = []
    bla2 = []
    
    # Each bla entry: [text, start, end, length, running total of lengths].
    # The last two slots start at 0 and are filled in by the loops further down.
    # dnt makes sure at most one of the two leading-gap branches appends.
    dnt = False
    for i in range(len(blocks)):
    
        if i == 0:
          # First matching block does not start at index 0 of t1.
          # NOTE(review): the slice [blocks[i].a:blocks[i].b] mixes an index
          # into t1 with an index into t2; the unmatched prefix would be
          # t1[:blocks[i].a] / t2[:blocks[i].b] -- confirm the intent.
          if blocks[i].a != 0 and dnt == False:
            bla1.append([t1[blocks[i].a:blocks[i].b], 0, blocks[i].a, 0, 0])
            bla2.append([t2[blocks[i].a:blocks[i].b], 0, blocks[i].b, 0, 0])
            dnt = True
    
          # First matching block does not start at index 0 of t2
          # (same slicing caveat as above).
          if blocks[i].b != 0 and dnt == False:
            bla2.append([t2[blocks[i].a:blocks[i].b], 0, blocks[i].b, 0, 0])
            bla1.append([t1[blocks[i].a:blocks[i].b], 0, blocks[i].a, 0, 0])
            dnt = True
    
        # Every block except the last one is followed by a gap that runs from
        # the end of this block to the start of the next block.
        if i != len(blocks)-1:
            print(blocks[i])
    
            bla1.append([t1[blocks[i].a + blocks[i].size:blocks[i+1].a], blocks[i].a + blocks[i].size, blocks[i+1].a, 0, 0])
            bla2.append([t2[blocks[i].b + blocks[i].size:blocks[i+1].b], blocks[i].b + blocks[i].size, blocks[i+1].b, 0, 0])
    
    #pdb.set_trace()
    
    # Fill slot 3 (gap length = end - start) and slot 4 (sum of the gap
    # lengths seen so far) for every gap of t1.
    ttl = 0
    for i in range(len(bla1)):
      cnt = bla1[i][2] - bla1[i][1]
      if cnt != 0:
        bla1[i][3] = cnt
      ttl = ttl + cnt
      bla1[i][4] = ttl
    
    # Same bookkeeping for the gaps of t2.
    ttl = 0
    for i in range(len(bla2)):
      cnt = bla2[i][2] - bla2[i][1]
      if cnt != 0:
        bla2[i][3] = cnt
      ttl = ttl + cnt
      bla2[i][4] = ttl
    
    print(bla1)
    print(bla2)
    
    # Build tt1: t1 with each gap replaced by '_' * dif, where dif is the
    # larger of the two facing gap lengths (bla1[i][3] vs bla2[i][3]).
    # NOTE(review): bla1[0] is read unconditionally on the first pass, so this
    # raises IndexError when no gaps were recorded.
    tt1 = ''
    dif = 0
    i = 0
    while True:
    
      if i == 0:
        # Text before the first gap, followed by the placeholder for gap 0.
        if bla1[i][3] >= bla2[i][3]: dif = bla1[i][3]
        if bla1[i][3] < bla2[i][3]: dif = bla2[i][3]  
        tt1 += t1[:bla1[i][1]] + '_'*dif
    
      if i <= len(bla1) -1:
    
        if bla1[i][3] >= bla2[i][3]: dif = bla1[i][3]
        if bla1[i][3] < bla2[i][3]: dif = bla2[i][3]
    
        # With a single gap nothing more is appended after the first pass.
        if len(bla1) != 1:
          # First gap: its placeholder was already written above, so only the
          # text up to the start of the next gap is added.
          if i == 0: tt1 += t1[bla1[i][1] + bla1[i][3]:bla1[i+1][1]]
          # Middle gap: placeholder, then the text up to the next gap.
          if i != 0 and i != len(bla1)-1: tt1 += '_'*dif + t1[bla1[i][1] + bla1[i][3]:bla1[i+1][1]]
          # Last gap: placeholder, then the remainder of t1.
          if i == len(bla1)-1: tt1 += '_'*dif + t1[bla1[i][1] + bla1[i][3]:len(t1)]
    
        i = i+1
        print('t1 = ' + tt1)
    
      else:
        break
    
    # Build tt2 the same way from t2 and bla2.
    tt2 = ''
    i = 0
    dif = 0
    while True:
    
      if i == 0:
    
        if bla1[i][3] >= bla2[i][3]: dif = bla1[i][3]
        if bla1[i][3] < bla2[i][3]: dif = bla2[i][3]   
        tt2 += t2[:bla2[i][1]] + '_'*dif
    
      if i <= len(bla2) -1:
    
        if bla1[i][3] >= bla2[i][3]: dif = bla1[i][3]
        if bla1[i][3] < bla2[i][3]: dif = bla2[i][3]    
    
        if len(bla2) != 1:
          if i == 0: tt2 += t2[bla2[i][1] + bla2[i][3]:bla2[i+1][1]]
          # This test uses len(bla1) while its neighbours use len(bla2); the two
          # lists are appended to in pairs above, so the lengths are equal.
          if i != 0 and i != len(bla1)-1: tt2 += '_'*dif + t2[bla2[i][1] + bla2[i][3]:bla2[i+1][1]]
          if i == len(bla2)-1: tt2 += '_'*dif + t2[bla2[i][1] + bla2[i][3]:len(t2)]
    
        i = i+1
        print('t2 = ' + tt2)
    
      else:
        break
    
      # Still inside the while loop: prints an empty line after each pass that
      # did not break.
      print()
    

    Solution:

    Unfortunately, I have been too busy to continue coding this and have resorted to calling diffutils in a subprocess ... this is a wonderful alternative to a lot of painstaking coding!