I just got a hold of Google's word2vec model and am quite new to the concept. i am trying to extract the main feature of a paragraph using the following method.
from gensim.models.keyedvectors import KeyedVectors
model = KeyedVectors.load_word2vec_format('../../usr/myProject/word2vec/GoogleNews-vectors-negative300.bin', binary=True)
...
for para in paragraph_array:
para_name = "para_"+ file_name + '{0}'
sentence_array = d[para_name.format(number_of_paragraphs)] = []
# Split Paragraph on basis of '.' or ? or !.
for l in re.split(r"\.|\?|\!", para):
# Split line into list using space.
sentence_array.append(l)
#sentence_array.append(l.split(" "))
print (model.wv.most_similar(positive=para, topn = 1))
But am getting the following error where it says that the paragraph checked is not a word in the vocabulary.
KeyError: 'word \'The Republic of Ghana is a country in West Africa. It borders Côte d\'Ivoire (also known as Ivory Coast) to the west, Burkina Faso to the north, Togo to the east, and the Gulf of Guinea to the south. The word "Ghana" means "Warrior King", Jackson, John G. Introduction to African Civilizations, 2001. Page 201. and was the source of the name "Guinea" (via French Guinoye) used to refer to the West African coast (as in Gulf of Guinea).\' not in vocabulary'
Now I am aware that the most_similar()
function expects a single array. But I would like to know how this can be translated to extract one main feature or word that displays the main concept of the paragraph using the word2vec model.
Modified
I modified the above code to pass the word_array into the most_similar()
method and I' getting the following error.
Traceback (most recent call last): File "/home/manuelanayantarajeyaraj/PycharmProjects/ChatbotWord2Vec/new_approach.py", line 108, in print(model.wv.most_similar(positive=word_array, topn=1)) File "/home/manuelanayantarajeyaraj/usr/myProject/my_project/lib/python3.5/site-packages/gensim/models/keyedvectors.py", line 361, in most_similar for word, weight in positive + negative: ValueError: too many values to unpack (expected 2)
Modified Implementation
for sentence in sentence_array:
if sentence:
for w in re.split(r"\.|\?|\!|\@|\#|\$|\%|\^|\&|\*|\(|\)|\-", sentence):
split_word = w.split(" ")
if split_word:
word_array.append(split_word)
print(model.wv.most_similar(positive=word_array, topn=1))
Any suggestions in this regard are much appreciated.
I reworked the entire code adding checkpoints to avoid the storage of empty strings to objects at each level starting from paragraphs, sentences down to words.
Working Version
for file_name in files:
file_identifier = file_name
file_array = file_dictionary[file_identifier] =[]
#file_array = file_dictionary[file_name.format((file_count))] = []
file_path = directory_path+'/'+file_name
with open(file_path) as f:
#Level 2 Intents : Each file's main intent (One for each file)
first_line = f.readline()
print ()
print("Level 2 Intent for ", c, " : ", first_line)
#Level 3 Intents : Each paragraph's main intent (one for each para)
paragraph_count = 0
data = f.read()
splat = data.split("\n")
paragraph_array = []
for number, paragraph in enumerate(splat, 1):
paragraph_identifier = file_name + "_paragraph_" + str(paragraph_count)
#print(paragraph_identifier)
paragraph_array = paragraph_dictionary[paragraph_identifier.format(paragraph_count)] = []
if paragraph :
paragraph_array.append(paragraph)
paragraph_count += 1
if len(paragraph_array) >0 :
file_array.append(paragraph_array)
# Level 4 Intents : Each sentence's main intent (one for each sentence)
sentence_count = 0
sentence_array = []
for sentence in paragraph_array:
for line in re.split(r"\.|\?|\!", sentence):
sentence_identifier = paragraph_identifier + "_sentence_" + str(sentence_count)
sentence_array = sentence_dictionary[sentence_identifier.format(sentence_count)] = []
if line :
sentence_array.append(line)
sentence_count += 1
# Level 5 Intents : Each word with a certain level of prominance (one for each prominant word)
word_count = 0
word_array = []
for words in sentence_array:
for word in re.split(r" ", words):
word_identifier = sentence_identifier + "_word_" + str(word_count)
word_array = word_dictionary[word_identifier.format(word_count)] = []
if word :
word_array.append(word)
word_count += 1
Code to access dictionary items
#Accessing any paragraph array can be done as follows
print (paragraph_dictionary['S08_set4_a5.txt.clean_paragraph_4'])
#Accessing any sentence corresponding to a paragraph
print (sentence_dictionary['S08_set4_a5.txt.clean_paragraph_4_sentence_1'])
#Accessing any word corresponding to a sentence
print (word_dictionary['S08_set4_a5.txt.clean_paragraph_4_sentence_1_word_3'])
Output
['Celsius was born in Uppsala in Sweden. He was professor of astronomy at Uppsala University from 1730 to 1744, but traveled from 1732 to 1735 visiting notable observatories in Germany, Italy and France.']
[' He was professor of astronomy at Uppsala University from 1730 to 1744, but traveled from 1732 to 1735 visiting notable observatories in Germany, Italy and France']
['of']