I have an older Google Colab notebook that uses spaCy 2.2.4 and successfully finds the most similar words for each word in a list:
# Notebook setup: install and load the large English model, then define the
# search terms that the functions below operate on.
import spacy
import spacy.cli
# Download the model package first; the import on the next line needs it installed.
spacy.cli.download("en_core_web_lg")
import en_core_web_lg
# Shared pipeline object; most_similar() and get_similar_words() read its vocab.
nlp = en_core_web_lg.load()
import numpy as np
import pandas as pd
# Print the installed spaCy version (the question compares 2.2.4 with 3.0.6).
print(spacy.__version__)
# Words for which similar terms are looked up.
all_search_terms = ["technology", "internet", "smartphone"]
# define a function to get the x most similar words to a word
def most_similar(word, topn=2, min_prob=-15):
    """Return the ``topn`` vocabulary entries most similar to ``word``.

    Candidates are the lexemes of the loaded vocabulary that share the query's
    ``is_lower`` flag, have a non-zero vector and a log-probability of at
    least ``min_prob``. Entries whose lower-cased text equals the query's are
    never returned.

    Args:
        word: The query word; converted with ``str()`` before the lookup.
        topn: Maximum number of similar words to return.
        min_prob: Lowest ``prob`` value a candidate may have. Pass ``None``
            to skip this filter, e.g. when the probability table is not
            loaded and every lexeme reports the same default value.

    Returns:
        A list of ``(lowercased_word, similarity)`` tuples, best match first.
    """
    print(word)
    query = nlp.vocab[str(word)]
    print(query.prob)
    query_text = query.lower_
    # Score each candidate once. The query itself is excluded *before* the
    # list is truncated so that exactly ``topn`` results come back whether or
    # not the query word passes the candidate filters.
    scored = [
        (query.similarity(w), w)
        for w in query.vocab
        if w.is_lower == query.is_lower
        and (min_prob is None or w.prob >= min_prob)
        and np.count_nonzero(w.vector)
        and w.lower_ != query_text
    ]
    scored.sort(key=lambda pair: pair[0], reverse=True)
    return [(w.lower_, score) for score, w in scored[:topn]]
# create function to receive a list of words and return the
# top 2 similar words for each word in the list
def get_similar_words(list_of_words, topn=2):
    """Collect the words most similar to each entry of ``list_of_words``.

    Query words that have no vector in the loaded vocabulary are skipped.

    Args:
        list_of_words: Iterable of query words.
        topn: Number of similar words requested per query word; forwarded to
            ``most_similar``.

    Returns:
        The unique similar words found, sorted alphabetically.
    """
    unique_words = set()
    for word in list_of_words:
        if not nlp.vocab[str(word)].has_vector:
            continue
        # Only the word text is needed here; the similarity score is dropped.
        unique_words.update(text for text, _score in most_similar(word, topn=topn))
    # A set removes duplicates; sorted() gives the alphabetical order.
    return sorted(unique_words)
# run the function on the search terms entered by the user
new_search_terms = get_similar_words(all_search_terms)
# bare expression: as the last line of a notebook cell it displays the list
new_search_terms
The output is:
technology
-10.063644409179688
internet
-8.897857666015625
smartphone
-12.11159896850586
['handset', 'online', 'smartphones', 'technological', 'technologies', 'web']
THE PROBLEM: I've just tried running the same code in a different environment within RStudio (i.e. NOT using Google Colab) where the version of spacy is 3.0.6 and the list of similar words (new_search_terms) is empty. I've also noticed that the word probabilities are all the same (-20).
The output with spacy 3.0.6:
technology
-20.0
internet
-20.0
smartphone
-20.0
[]
What do I need to do differently in this new version of spacy to get the same output as before?
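In spaCy v3 the token probabilities are no longer loaded by default, so you have to load the `lexeme_prob` lookup table yourself (it is provided by the `spacy-lookups-data` package) and add it to the vocab: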
import spacy
from spacy.lookups import load_lookups

# Load the large model used in the question: the small model ships without
# real word vectors, so the similarity search would have nothing to rank.
nlp = spacy.load("en_core_web_lg")
# Fetch the word-probability table (requires the spacy-lookups-data package
# to be installed) and attach it to the vocab so that the .prob values are
# read from the table instead of all being the same default.
lookups = load_lookups("en", ["lexeme_prob"])
nlp.vocab.lookups.add_table("lexeme_prob", lookups.get_table("lexeme_prob"))
After this your code should work, though I am not sure why you are using `.prob` here.