I am comparing sentences using Jaccard similarity in Python.
However, I have a question about the set intersection function:
import itertools
import pandas as pd
# Tokenise both sentences on single spaces.
item1 = "She went to a restaurant on Oxford Street".split(" ")
item2 = "She went to an Italian restaurant on Oxf. Street".split(" ")
# Words that appear verbatim in both sentences.
set(item1).intersection(item2)
Result:
{'She', 'Street', 'on', 'restaurant', 'to', 'went'}
It only counts strings that are completely identical. Is there a way to also include the word Oxf, since it is short for Oxford? I.e., if a word in one set is a substring (abbreviation) of a word in the other set, it should be included in the intersection.
#!/usr/bin/env python
from pprint import pprint
def jaccard_similarity(str1, str2,
                       exclude_words=frozenset(["a", "an", "at", "in", "on", "is", "of", "with", "from", "to", "went"])):
    """Return the Jaccard similarity of two sentences, matching abbreviations.

    Each sentence is stripped of periods, split on whitespace and reduced to
    a set of words with ``exclude_words`` removed. A word is treated as an
    abbreviation ("alias") of a longer word when it is a proper prefix of
    that word and is at least three characters long (e.g. "Oxf" for
    "Oxford"). Aliases found in either sentence are replaced by the full
    word before the similarity is computed.

    Args:
        str1: First sentence.
        str2: Second sentence.
        exclude_words: Iterable of words ignored in both sentences.

    Returns:
        ``len(x & y) / len(x | y)`` as a float, where ``x`` and ``y`` are the
        unaliased word sets. Returns 1.0 when both sets are empty.

    Side effects:
        Prints the alias table, both unaliased word sets and their
        intersection to stdout.
    """
    min_alias_length = 3

    # Remove all periods "." to reduce later computation (num alias
    # comparisons). split() without an argument avoids empty-string tokens
    # when the input has repeated or leading/trailing whitespace.
    x, y = [set(v.replace(".", "").split()).difference(exclude_words)
            for v in (str1, str2)]

    # Map every word to its proper prefixes of at least min_alias_length
    # characters; words that are too short to have any are left out.
    word_aliases = {}
    for word in x | y:
        aliases = {word[:i] for i in range(min_alias_length, len(word))}
        if aliases:
            word_aliases[word] = aliases
    print("Word aliases:")
    pprint(word_aliases)

    # Resolve the longest words first (ties broken alphabetically) so the
    # outcome does not depend on set iteration order when several words
    # share a prefix.
    for word in sorted(word_aliases, key=lambda w: (-len(w), w)):
        aliases = word_aliases[word]
        for words in (x, y):
            aliases_in_words = aliases & words
            if aliases_in_words:
                # Remove all aliases and replace with single "original" word.
                words -= aliases_in_words
                words.add(word)
    print("Unaliased word sets:")
    pprint(x)
    pprint(y)

    intersection = x & y
    pprint(intersection)
    union_cardinality = len(x | y)
    if not union_cardinality:
        # Both sentences contained only excluded words; two empty sets are
        # identical, and this avoids a ZeroDivisionError.
        return 1.0
    return len(intersection) / float(union_cardinality)
import itertools
# The two example sentences from the question; the second abbreviates
# "Oxford" as "Oxf." and adds the word "Italian".
item1 = "She went to a restaurant on Oxford Street"
item2 = "She went to an Italian restaurant on Oxf. Street"
# Similarity score for the pair.
result = jaccard_similarity(str1=item1, str2=item2)
Word aliases:
{'Italian': {'Italia', 'Ital', 'Itali', 'Ita'},
'Oxford': {'Oxfo', 'Oxf', 'Oxfor'},
'Street': {'Stree', 'Stre', 'Str'},
'restaurant': {'res', 'rest', 'resta', 'restau', 'restaur', 'restaura', 'restauran'}}
Unaliased word sets:
{'Oxford', 'Street', 'restaurant', 'She'}
{'She', 'restaurant', 'Oxford', 'Italian', 'Street'}
{'Oxford', 'She', 'restaurant', 'Street'}