How can I calculate cosine similarity when two sentences share a word only in the form of synonyms? For example:
sent1 = "You are a good coder."
sent2 = "I am new programmer"
Assume that "coder" is a synonym of "programmer" here. If these two words are not treated as synonyms, I get a cosine score of zero (0); if they are treated as synonyms, the score should be non-zero. Please suggest an approach, or show how to modify my sample code below. Please use a custom synonym dictionary or list rather than an API-based dictionary.
import math
import re
from collections import Counter
# Compiled pattern for one token: a maximal run of word characters.
# text_to_vector uses it to split text into words.
WORD = re.compile(r"\w+")
def get_cosine(vec1, vec2):
    """Return the cosine similarity of two sparse term-weight vectors.

    Args:
        vec1: Mapping from term to numeric weight (for example a
            ``collections.Counter`` of words).
        vec2: Mapping from term to numeric weight.

    Returns:
        The dot product over the terms both vectors share, divided by the
        product of the two Euclidean norms. Returns ``0.0`` when that
        product is zero (for example, when either vector is empty).
    """
    # Only terms present in both vectors contribute to the dot product.
    shared_terms = set(vec1) & set(vec2)
    numerator = sum(vec1[term] * vec2[term] for term in shared_terms)

    norm1 = math.sqrt(sum(weight ** 2 for weight in vec1.values()))
    norm2 = math.sqrt(sum(weight ** 2 for weight in vec2.values()))
    denominator = norm1 * norm2

    # Guard against division by zero for empty / all-zero vectors.
    if not denominator:
        return 0.0
    return numerator / denominator
def text_to_vector(text):
    """Convert *text* into a bag-of-words count vector.

    Args:
        text: The string to tokenize.

    Returns:
        A ``Counter`` mapping each token matched by ``WORD`` to the number
        of times it occurs in *text*. Tokens are counted exactly as they
        appear, so matching is case-sensitive.
    """
    return Counter(WORD.findall(text))
# Custom synonym table. It is defined here but not applied anywhere in this
# script, which is why synonym pairs currently score 0.0.
synonyms = {
    "India": "Hindustan",
    "USA": "America",
}

# Reference sentence that every candidate sentence is compared against.
text2 = "I live in India"
sentences = [
    "India",
    "He belongs to USA",
    "Hindustan is synonym of my country name",
    "USA and America is same",
    "You live in a great country.",
    "All countries are great to live",
]

# The reference vector does not depend on the loop variable: build it once.
vector2 = text_to_vector(text2)

# Collect (score, sentence) pairs in the order the sentences are listed.
cosinetolist = []
for i in sentences:
    vector1 = text_to_vector(i)
    cosine = get_cosine(vector1, vector2)
    cosinetolist.append((cosine, i))
l = cosinetolist
print(l)
Try replacing each word in the sentence with its synonym before computing the similarity, like so:
text = ' '.join([w if not w in synonyms else synonyms[w] for w in text.split(' ')])
This splits the sentence on single spaces and, for each word, takes the mapped value if the word is a key of `synonyms`; any other word is kept unchanged.
Thus giving:
import math
import re
from collections import Counter
# Compiled pattern for one token: a maximal run of word characters.
# text_to_vector uses it to split text into words.
WORD = re.compile(r"\w+")
def get_cosine(vec1, vec2):
    """Return the cosine similarity of two sparse term-weight vectors.

    Args:
        vec1: Mapping from term to numeric weight (for example a
            ``collections.Counter`` of words).
        vec2: Mapping from term to numeric weight.

    Returns:
        The dot product over the terms both vectors share, divided by the
        product of the two Euclidean norms. Returns ``0.0`` when that
        product is zero (for example, when either vector is empty).
    """
    # Only terms present in both vectors contribute to the dot product.
    shared_terms = set(vec1) & set(vec2)
    numerator = sum(vec1[term] * vec2[term] for term in shared_terms)

    norm1 = math.sqrt(sum(weight ** 2 for weight in vec1.values()))
    norm2 = math.sqrt(sum(weight ** 2 for weight in vec2.values()))
    denominator = norm1 * norm2

    # Guard against division by zero for empty / all-zero vectors.
    if not denominator:
        return 0.0
    return numerator / denominator
def text_to_vector(text):
    """Convert *text* into a bag-of-words count vector.

    Args:
        text: The string to tokenize.

    Returns:
        A ``Counter`` mapping each token matched by ``WORD`` to the number
        of times it occurs in *text*. Tokens are counted exactly as they
        appear, so matching is case-sensitive.
    """
    return Counter(WORD.findall(text))
# Custom synonym table: each key is rewritten to its value before comparison,
# so both spellings end up as the same token.
synonyms = {
    "India": "Hindustan",
    "USA": "America",
}


def map_synon(text):
    """Return *text* with every word that is a key of ``synonyms`` replaced.

    Words are located as runs of word characters, the same notion of a
    token that the vectorizer's pattern uses. A word followed by
    punctuation ("India.") or separated by tabs/newlines is therefore
    still mapped; splitting on single spaces would miss it. Everything
    between the words is left untouched.

    Args:
        text: The sentence to normalize.

    Returns:
        A copy of *text* in which each matching word is replaced by its
        entry in ``synonyms``; words without an entry are kept unchanged.
        Lookup is case-sensitive.
    """

    def _swap(match):
        word = match.group()
        return synonyms.get(word, word)

    return re.sub(r"\w+", _swap, text)
# Reference sentence, normalized so its words use the canonical synonyms.
text2 = "I live in India"
text2 = map_synon(text2)
sentences = [
    "He belongs to USA",
    "Hindustan is synonym of my country name",
    "USA and America is same",
    "You live in a great country.",
    "All countries are great to live",
]

# The reference vector does not depend on the loop variable: build it once.
vector2 = text_to_vector(text2)

# Collect (score, original sentence) pairs; the sentence is normalized only
# for scoring, so the reported text is the untouched input.
cosinetolist = []
for i in sentences:
    vector1 = text_to_vector(map_synon(i))
    cosine = get_cosine(vector1, vector2)
    cosinetolist.append((cosine, i))
l = cosinetolist
print(l)
output:
[(0.0, 'He belongs to USA'),
(0.1889822365046136, 'Hindustan is synonym of my country name'),
(0.0, 'USA and America is same'),
(0.4082482904638631, 'You live in a great country.'),
(0.20412414523193154, 'All countries are great to live')]