I am novice to NLP to be honest and I am trying to use GLOVE vectors for finding the similarity between two statements and I am getting a key error. Please let me know where I am wrong. Thanks in advance for your help and if there are other better ways of measuring the similarity between the statements,please let me know.
gloveFile = "/content/glove.6B.50d.txt"
import numpy as np
def loadGloveModel(gloveFile):
print ("Loading Glove Model")
with open(gloveFile, encoding="utf8" ) as f:
content = f.readlines()
print(content)
model = {}
for line in content:
splitLine = line.split()
word = splitLine[0]
embedding = np.array([float(val) for val in splitLine[1:]])
model[word] = embedding
print ("Done.",len(model)," words loaded!")
return model
import re
from nltk.corpus import stopwords
import pandas as pd
def preprocess(raw_text):
# keep only words
letters_only_text = re.sub("[^a-zA-Z]", " ", raw_text)
# convert to lower case and split
words = letters_only_text.lower().split()
# remove stopwords
stopword_set = set(stopwords.words("english"))
cleaned_words = list(set([w for w in words if w not in stopword_set]))
return cleaned_words
def cosine_distance_wordembedding_method(s1, s2):
import scipy
vector_1 = np.mean([model[word] for word in preprocess(s1)],axis=0)
vector_2 = np.mean([model[word] for word in preprocess(s2)],axis=0)
cosine = scipy.spatial.distance.cosine(vector_1, vector_2)
print('Word Embedding method with a cosine distance asses that our two sentences are similar to',round((1-cosine)*100,2),'%')
model = loadGloveModel(gloveFile)
for i in list121:
cosine_distance_wordembedding_method(str4,i)
And then I got the error like:
<ipython-input-54-d463b41223c3> in cosine_distance_wordembedding_method(s1, s2)
36 import scipy
37 vector_1 = np.mean([model[word] for word in preprocess(s1)],axis=0)
---> 38 vector_2 = np.mean([model[word] for word in preprocess(s2)],axis=0)
39 cosine = scipy.spatial.distance.cosine(vector_1, vector_2)
40 print('Word Embedding method with a cosine distance asses that our two sentences are similar to',round((1-cosine)*100,2),'%')
<ipython-input-54-d463b41223c3> in <listcomp>(.0)
36 import scipy
37 vector_1 = np.mean([model[word] for word in preprocess(s1)],axis=0)
---> 38 vector_2 = np.mean([model[word] for word in preprocess(s2)],axis=0)
39 cosine = scipy.spatial.distance.cosine(vector_1, vector_2)
40 print('Word Embedding method with a cosine distance asses that our two sentences are similar to',round((1-cosine)*100,2),'%')
KeyError: 'vehcile'
I have found my mistake and I am just keeping this question so that somebody may get help. The mistake I did is I have typed a wrong spelling like "Vehcile" instead of "vehicle".