Python: Cosine similarity between sentences with synonyms

How can I calculate cosine similarity when two sentences share a word only in the form of synonyms? For example:

sent1 = "You are a good coder."

sent2 = "I am new programmer"

Assume that "coder" is a synonym of "programmer" here. If these two words are not treated as synonyms, I get a cosine score of zero (0). If they are treated as synonyms, the score should be greater than zero. Please suggest an approach, or show how to modify my sample code below. Please use a custom synonym dictionary or list rather than any API-based dictionary.

import math
import re
from collections import Counter

# Matches runs of word characters; text_to_vector uses it to split text into tokens.
WORD = re.compile(r"\w+")
def get_cosine(vec1, vec2):
    """Return the cosine similarity of two term-frequency mappings.

    Args:
        vec1: Mapping from term to numeric weight (e.g. a ``Counter``).
        vec2: Mapping of the same kind to compare against ``vec1``.

    Returns:
        The dot product over the terms present in both mappings divided
        by the product of the two Euclidean norms, or ``0.0`` when that
        product is zero (for instance when either mapping is empty).
    """
    shared_terms = set(vec1.keys()).intersection(set(vec2.keys()))
    dot_product = sum(vec1[term] * vec2[term] for term in shared_terms)

    squared_norm1 = sum(weight ** 2 for weight in vec1.values())
    squared_norm2 = sum(weight ** 2 for weight in vec2.values())
    norm_product = math.sqrt(squared_norm1) * math.sqrt(squared_norm2)

    # A zero norm product would make the division below undefined.
    if not norm_product:
        return 0.0
    return float(dot_product) / norm_product
def text_to_vector(text):
    """Build a bag-of-words vector for ``text``.

    Every match of the module-level ``WORD`` pattern is one token, kept
    exactly as matched (no case folding or other normalization).

    Args:
        text: The string to tokenize.

    Returns:
        A ``Counter`` mapping each token to its number of occurrences.
    """
    return Counter(WORD.findall(text))

# Custom synonym table. It is defined here but not applied by the code
# below, so synonym pairs are still compared as different words.
synonyms = {
    "India": "Hindustan",
    "USA": "America",
}
text2 = "I live in India"
sentences = [
    "India",
    "He belongs to USA",
    "Hindustan is synonym of my country name",
    "USA and America is same",
    "You live in a great country.",
    "All countries are great to live",
]

# The query vector is the same for every comparison, so build it once.
query_vector = text_to_vector(text2)

# Pair each similarity score with the sentence it was computed for.
cosinetolist = [
    (get_cosine(text_to_vector(candidate), query_vector), candidate)
    for candidate in sentences
]
l = cosinetolist
print(l)

Solution

Try replacing each word in the sentence with its synonym before computing the similarity, like so:

# Swap each space-separated word for its synonym when it is a key of `synonyms`.
text = ' '.join(synonyms.get(w, w) for w in text.split(' '))

This splits the sentence on spaces and, for each word, substitutes the synonym value if the word is a key of the `synonyms` dictionary; otherwise the word is kept unchanged.

Thus giving:

import math
import re
from collections import Counter

# Matches runs of word characters; text_to_vector uses it to split text into tokens.
WORD = re.compile(r"\w+")
def get_cosine(vec1, vec2):
    """Return the cosine similarity of two term-frequency mappings.

    Args:
        vec1: Mapping from term to numeric weight (e.g. a ``Counter``).
        vec2: Mapping of the same kind to compare against ``vec1``.

    Returns:
        The dot product over the terms present in both mappings divided
        by the product of the two Euclidean norms, or ``0.0`` when that
        product is zero (for instance when either mapping is empty).
    """
    shared_terms = set(vec1.keys()).intersection(set(vec2.keys()))
    dot_product = sum(vec1[term] * vec2[term] for term in shared_terms)

    squared_norm1 = sum(weight ** 2 for weight in vec1.values())
    squared_norm2 = sum(weight ** 2 for weight in vec2.values())
    norm_product = math.sqrt(squared_norm1) * math.sqrt(squared_norm2)

    # A zero norm product would make the division below undefined.
    if not norm_product:
        return 0.0
    return float(dot_product) / norm_product
def text_to_vector(text):
    """Build a bag-of-words vector for ``text``.

    Every match of the module-level ``WORD`` pattern is one token, kept
    exactly as matched (no case folding or other normalization).

    Args:
        text: The string to tokenize.

    Returns:
        A ``Counter`` mapping each token to its number of occurrences.
    """
    return Counter(WORD.findall(text))

# Custom synonym table: each key is rewritten to its value, so both
# spellings become the same token before the vectors are compared.
synonyms = {
    "India": "Hindustan",
    "USA": "America",
}


def map_synon(text, mapping=None):
    """Replace every word of ``text`` that has an entry in the synonym table.

    The text is split on single spaces. A chunk that is itself a key is
    replaced by its value. Any other chunk is scanned word by word, so a
    key with punctuation attached (``"India."``, ``"USA,"``) is still
    replaced and the punctuation is kept.

    Args:
        text: The sentence to normalize.
        mapping: Optional dict from word to replacement. Defaults to the
            module-level ``synonyms`` table.

    Returns:
        ``text`` with every matching word replaced; spacing and
        punctuation are otherwise preserved.
    """
    if mapping is None:
        mapping = synonyms

    def swap_word(match):
        word = match.group()
        return mapping.get(word, word)

    def swap_chunk(chunk):
        # An exact match of the whole chunk wins, which also covers keys
        # that contain non-word characters themselves.
        if chunk in mapping:
            return mapping[chunk]
        # Otherwise substitute word by word, so attached punctuation does
        # not hide a key from the lookup.
        return re.sub(r"\w+", swap_word, chunk)

    return ' '.join(swap_chunk(chunk) for chunk in text.split(' '))
# Normalize the query sentence so it uses the same synonym spelling as
# the candidates it is compared with.
text2 = map_synon("I live in India")

sentences = [
    "He belongs to USA",
    "Hindustan is synonym of my country name",
    "USA and America is same",
    "You live in a great country.",
    "All countries are great to live",
]

# The query vector is the same for every comparison, so build it once.
query_vector = text_to_vector(text2)

# Score the synonym-mapped form of each sentence, but report the
# original sentence text next to its score.
cosinetolist = [
    (get_cosine(text_to_vector(map_synon(candidate)), query_vector), candidate)
    for candidate in sentences
]
l = cosinetolist
print(l)

output:

[(0.0, 'He belongs to USA'),
(0.1889822365046136, 'Hindustan is synonym of my country name'),
(0.0, 'USA and America is same'),
(0.4082482904638631, 'You live in a great country.'), 
(0.20412414523193154, 'All countries are great to live')]