Search code examples
python nlp python-re cosine-similarity

Python: Cosine similarity between sentences with synonyms


How can I calculate cosine similarity when two sentences share a word only in the form of synonyms? For example:

sent1 = "You are a good coder."

sent2 = "I am new programmer"

Consider "coder" to be a synonym of "programmer" here. If these two words are not treated as synonyms, I get a cosine score of zero (0). If they are treated as synonyms, the score should be non-zero. Please suggest an approach, or a modification to my sample code below. Please use a custom synonym dictionary or list rather than any API-based dictionary.

import math
import re
from collections import Counter

# Compiled pattern matching one run of word characters; text_to_vector uses
# it to split a sentence into tokens.
WORD = re.compile(r"\w+")
def get_cosine(vec1, vec2):
    """Return the cosine similarity of two term-count mappings.

    The score is the dot product over the keys both mappings share,
    divided by the product of their Euclidean norms. It is 0.0 when
    either mapping has a zero norm (for example, when it is empty).
    """
    shared_terms = set(vec1) & set(vec2)
    dot_product = sum(vec1[term] * vec2[term] for term in shared_terms)

    norm1 = math.sqrt(sum(vec1[term] ** 2 for term in vec1))
    norm2 = math.sqrt(sum(vec2[term] ** 2 for term in vec2))
    magnitude = norm1 * norm2

    # Guard the division: a zero norm would otherwise raise ZeroDivisionError.
    if magnitude:
        return float(dot_product) / magnitude
    return 0.0
def text_to_vector(text):
    """Return a Counter of the tokens that the WORD pattern finds in *text*."""
    return Counter(WORD.findall(text))

# Custom synonym table. Nothing below consults it yet, so synonymous words
# are still counted as different tokens.
synonyms = {"India": "Hindustan",
            "USA": "America",}
text2 = "I live in India"
sentences = ["India",
            "He belongs to USA",
            "Hindustan is synonym of my country name",
            "USA and America is same",
            "You live in a great country.",
            "All countries are great to live",]
# The reference sentence never changes, so vectorise it once instead of on
# every loop iteration.
vector2 = text_to_vector(text2)
# Collects (score, sentence) pairs in the same order as `sentences`.
cosinetolist = []
for i in sentences:
    vector1 = text_to_vector(i)
    cosine = get_cosine(vector1, vector2)
    cosinetolist.append((cosine, i))
l = cosinetolist
print(l)

Solution

  • Try replacing each word with its synonym in the sentence before computing the similarity, like so:

    # Swap every space-separated word that is a key of `synonyms` for its value.
    text = ' '.join(synonyms.get(w, w) for w in text.split(' '))
    

    This splits the sentence, and for each word in the sentence it takes the synonym value if the word is in the keys.

    Thus giving:

    import math
    import re
    from collections import Counter
    
    # Compiled pattern matching one run of word characters; text_to_vector
    # uses it to split a sentence into tokens.
    WORD = re.compile(r"\w+")
    def get_cosine(vec1, vec2):
        """Return the cosine similarity of two term-count mappings.

        The score is the dot product over the keys both mappings share,
        divided by the product of their Euclidean norms. It is 0.0 when
        either mapping has a zero norm (for example, when it is empty).
        """
        shared_terms = set(vec1) & set(vec2)
        dot_product = sum(vec1[term] * vec2[term] for term in shared_terms)

        norm1 = math.sqrt(sum(vec1[term] ** 2 for term in vec1))
        norm2 = math.sqrt(sum(vec2[term] ** 2 for term in vec2))
        magnitude = norm1 * norm2

        # Guard the division: a zero norm would otherwise raise
        # ZeroDivisionError.
        if magnitude:
            return float(dot_product) / magnitude
        return 0.0
    def text_to_vector(text):
        """Return a Counter of the tokens that the WORD pattern finds in *text*."""
        return Counter(WORD.findall(text))
    
    # Custom synonym table: map_synon rewrites each key to its value, so both
    # spellings end up as the same token before the vectors are compared.
    synonyms = {"India": "Hindustan",
                "USA": "America",}    
    
    def map_synon(text, mapping=None):
        """Return *text* with every word found in the synonym table replaced.

        The text is split on single spaces. A piece that is itself a key of
        the table is replaced outright. Otherwise each run of word characters
        inside the piece is looked up on its own, so a word followed by
        punctuation ("India.") is still replaced while the punctuation is kept.

        Args:
            text: The sentence to normalise.
            mapping: Optional word -> replacement dict. When omitted, the
                module-level ``synonyms`` table is used.

        Returns:
            The sentence with replacements applied; spacing is unchanged.
        """
        if mapping is None:
            mapping = synonyms

        def swap_word(match):
            word = match.group()
            return mapping.get(word, word)

        def swap_piece(piece):
            # Exact match first, so keys that contain punctuation still work.
            if piece in mapping:
                return mapping[piece]
            return re.sub(r"\w+", swap_word, piece)

        return ' '.join(swap_piece(piece) for piece in text.split(' '))
    text2 = "I live in India"
    text2 = map_synon(text2)

    sentences = ["He belongs to USA",
                "Hindustan is synonym of my country name",
                "USA and America is same",
                "You live in a great country.",
                "All countries are great to live",]
    # The reference sentence never changes, so vectorise it once instead of
    # on every loop iteration.
    vector2 = text_to_vector(text2)
    # Collects (score, original sentence) pairs in the order of `sentences`.
    cosinetolist = []

    for i in sentences:
        # Normalise synonyms only for scoring; the untouched sentence is
        # what gets stored next to the score.
        vector1 = text_to_vector(map_synon(i))
        cosine = get_cosine(vector1, vector2)
        cosinetolist.append((cosine, i))
    l = cosinetolist
    print(l)
    

    output:

    [(0.0, 'He belongs to USA'),
    (0.1889822365046136, 'Hindustan is synonym of my country name'),
    (0.0, 'USA and America is same'),
    (0.4082482904638631, 'You live in a great country.'), 
    (0.20412414523193154, 'All countries are great to live')]