Search code examples
python set python-itertools intersection

How to find intersection of two sentences and include substrings


I am comparing sentences using Jaccard similarity in Python.

However, I have a question about the intersection function:

import itertools
import pandas as pd
# Tokenize both sentences on single spaces.
item1 = "She went to a restaurant on Oxford Street".split(" ")
item2 = "She went to an Italian restaurant on Oxf. Street".split(" ")

# Exact-match intersection: only tokens spelled identically in both sentences survive.
set(item1).intersection(item2)

Result :

{'She', 'Street', 'on', 'restaurant', 'to', 'went'}

It only calculates the intersection for strings that are completely identical. Is there a way to also include the word Oxf, since it is short for Oxford? I.e., if a word in one set is a substring of a word in the other set, it should be included in the intersection.


Solution

  • #!/usr/bin/env python
    
    from pprint import pprint
    
    def jaccard_similarity(str1, str2,
            exclude_words=frozenset(["a", "an", "at", "in", "on", "is", "of", "with", "from", "to", "went"]),
            *, min_alias_length=3, verbose=True):
        """Return the Jaccard similarity of two sentences, treating prefixes as aliases.

        Each sentence is stripped of periods, split on whitespace and filtered
        against ``exclude_words``.  Any word that is a proper prefix (of at
        least ``min_alias_length`` characters) of another word appearing in
        either sentence is replaced by that longer word, so an abbreviation
        such as ``"Oxf."`` matches ``"Oxford"``.  Matching is case-sensitive.

        Args:
            str1: First sentence.
            str2: Second sentence.
            exclude_words: Iterable of words to ignore in both sentences.
            min_alias_length: Shortest prefix length accepted as an alias.
            verbose: When true, print the alias table, the rewritten word sets
                and their intersection.

        Returns:
            ``len(x & y) / len(x | y)`` as a float, where ``x`` and ``y`` are
            the rewritten word sets.  When both sets are empty the result is
            ``1.0`` instead of a division by zero.

        Raises:
            ValueError: If ``min_alias_length`` is smaller than 1.
        """
        if min_alias_length < 1:
            raise ValueError("min_alias_length must be at least 1")

        # Accept any iterable of excluded words, not only a set.
        excluded = set(exclude_words)

        # Remove all periods "." to reduce later computation (num alias comparisons).
        # split() without an argument drops the empty tokens that repeated or
        # leading/trailing whitespace would otherwise add to the word sets.
        x, y = [set(v.replace(".", "").split()) - excluded for v in (str1, str2)]

        word_aliases = {}
        for word in x | y:
            aliases = {word[:i] for i in range(min_alias_length, len(word))}
            if aliases:
                word_aliases[word] = aliases

        if verbose:
            print("Word aliases:")
            pprint(word_aliases)

        # Resolve aliases in a fixed order (longest word first, then
        # alphabetical) so the result does not depend on set/dict ordering
        # when one abbreviation is a prefix of several words.
        for word in sorted(word_aliases, key=lambda w: (-len(w), w)):
            aliases = word_aliases[word]
            for words in (x, y):
                aliases_in_words = aliases & words
                if aliases_in_words:
                    # Remove all aliases and replace with single "original" word.
                    words.difference_update(aliases_in_words)
                    words.add(word)

        intersection = x & y

        if verbose:
            print("Unaliased word sets:")
            print(x)
            print(y)
            print("Intersection:")
            print(intersection)

        union = x | y
        if not union:
            # Nothing left to compare in either sentence: two empty sets are identical.
            return 1.0
        return len(intersection) / float(len(union))
    
    
    import itertools
    # Demo: the two sentences from the question; "Oxf." should match "Oxford".
    item1, item2 = (
        "She went to a restaurant on Oxford Street",
        "She went to an Italian restaurant on Oxf. Street",
    )

    result = jaccard_similarity(item1, item2)
    print(result)
    

    Output:

    Word aliases:
    {'Italian': {'Italia', 'Ital', 'Itali', 'Ita'},
     'Oxford': {'Oxfo', 'Oxf', 'Oxfor'},
     'Street': {'Stree', 'Stre', 'Str'},
     'restaurant': {'res',
                    'rest',
                    'resta',
                    'restau',
                    'restaur',
                    'restaura',
                    'restauran'}}
    Unaliased word sets:
    {'Oxford', 'Street', 'restaurant', 'She'}
    {'She', 'restaurant', 'Oxford', 'Italian', 'Street'}
    Intersection:
    {'Oxford', 'She', 'restaurant', 'Street'}
    0.8