Search code examples
pythonedit-distance

Discriminate edit distance


The levenshtein edit distance cares only about how many edits are done and not on what exactly they are, so the following two pairs will have the same edit distance.

("A P Moller - Maersk A", "A.P. Moller - Maersk A/S Class A")
("A P Moller - Maersk A", "A.P. Moller - Maersk A/S Class B")

Are there any algorithms or libraries that can distinguish between these two pairs?


Solution

  • You can use cosine similarity to find the similarity between to text and it produce different similarity between these two text

    import math
    import re
    from collections import Counter
    
    WORD = re.compile(r"\w+")
    
    
    def get_cosine(vec1, vec2):
        intersection = set(vec1.keys()) & set(vec2.keys())
        numerator = sum([vec1[x] * vec2[x] for x in intersection])
        sum1 = sum([vec1[x] ** 2 for x in list(vec1.keys())])
        sum2 = sum([vec2[x] ** 2 for x in list(vec2.keys())])
        denominator = math.sqrt(sum1) * math.sqrt(sum2)
        if not denominator:
            return 0.0
        else:
            return float(numerator) / denominator
    
    
    def text_to_vector(text):
        words = WORD.findall(text)
        return Counter(words)
    
    
    x =("A P Moller - Maersk A", "A.P. Moller - Maersk A/S Class A")
    y =("A P Moller - Maersk A", "A.P. Moller - Maersk A/S Class B")
    
    cosine = get_cosine(text_to_vector(x[0]), text_to_vector(x[1]))
    
    print("Cosine1:", cosine)
    
    cosine1 = get_cosine(text_to_vector(y[0]), text_to_vector(y[1]))
    
    print("Cosine2:", cosine1)
    

    Output:

    Cosine1: 0.9091372900969896
    Cosine2: 0.8366600265340756