I want to compute the Jaccard similarity between each row of my dataframe and a base list. The issue is that I need it for 500+ rows, and I either get the error "too many values to unpack" or "'Series' object has no attribute 'iterrows'", or the function compares the base with the dataframe as a whole.
Alternative A:
sentences = pd.Series(df.sentence)
sentences = sentences.str.replace('[^A-Za-z ]', '', regex=True).str.replace(' +', ' ', regex=True).str.strip()
splitwords = [nltk.word_tokenize(str(sentence)) for sentence in sentences]
print(splitwords)
sentence = df.sentence
def Jaccard_Similarity(base, sentence):
    for i, row in sentence.iterrows():  # fails: a Series has no iterrows()
        a = set(word for word in base)
        b = set(word for word in df.sentence())  # fails: df.sentence is a Series, not callable
        c = a.intersection(b)
    return (float(len(c)) / (len(a) + len(b) - len(c)), a, b)
Jaccard_Similarity(base, sentence)
Alternative B:
df = df.apply(lambda row: nltk.word_tokenize(row['sentence']), axis=1)
print(df)
def Jaccard_Similarity(base, df):
    for row in df.iterrows(df):  # fails: iterrows takes no argument, and df is a Series after the apply above
        a = set(word for word in base)
        b = set(word for word in df)
        c = a.intersection(b)
    return (float(len(c)) / (len(a) + len(b) - len(c)), a, b)
Jaccard_Similarity(base, df)
Data:
base = ['Tom', 'eats', 'apple']
df = pd.DataFrame([["Tom eats an apple"],
                   ["Tom eats a pineapple"],
                   ["Eva eats an apple"],
                   ["Eva eats a pineapple"]],
                  columns=['sentence'])
EDIT:
base = set(base.lower().split())  # this assumes base is a plain string, not the list above
df = set(df.lower().split())      # likewise assumes one string, not a dataframe
def Jaccard_Similarity(base, df):
    intersection = base.intersection(df)
    union = base.union(df)
    return float(len(intersection)) / len(union)
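A minimal sketch of applying a pairwise function like this to every row, assuming base is first joined into one string (the helper name jaccard_similarity is illustrative, not from the original code):

import pandas as pd

def jaccard_similarity(a, b):
    # both arguments are plain strings; compare whitespace-split token sets
    a = set(a.lower().split())
    b = set(b.lower().split())
    return len(a & b) / len(a | b)

base = ['Tom', 'eats', 'apple']
base_text = ' '.join(base)  # the pairwise function expects a string
df = pd.DataFrame({'sentence': ["Tom eats an apple",
                                "Tom eats a pineapple",
                                "Eva eats an apple",
                                "Eva eats a pineapple"]})
df['jaccard'] = df.sentence.apply(lambda s: jaccard_similarity(base_text, s))
print(df)

Note this keeps stopwords like "an" and "a", so the scores differ from the answer below, which removes them first.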
Try this - I'll add the explanation later, as I have some work to do.
import nltk
import numpy as np
import pandas as pd
from nltk.corpus import stopwords  # to remove stopwords

base = ['Tom', 'eats', 'apple']
base = [item.lower() for item in base]
stop_words = set(stopwords.words('english'))

list1 = [["Tom eats an apple"],
         ["Tom eats a pineapple"],
         ["Eva eats an apple"],
         ["Eva eats a pineapple"]]
df = pd.DataFrame(list1, columns=['sentence'])

# tokenize each sentence, then lowercase and drop stopwords
df = df.sentence.apply(nltk.word_tokenize)
df = df.apply(
    lambda x: [item.lower() for item in x if item.lower() not in stop_words]
)

# Jaccard per row: |a & b| / (|a| + |b| - |a & b|)
b = df.apply(set)                          # one token set per row
a = set(base)                              # the base set
c = b.apply(lambda x: a.intersection(x))   # per-row intersection with the base
len_a_b = b.apply(lambda x: len(x) + len(a))
len_c = c.apply(lambda x: len(x))

dict1 = {'length': len_c / (len_a_b - len_c), 'b': b, 'c': c}
df = pd.DataFrame(dict1)
df['a'] = np.nan                           # placeholder column
df['a'] = df.a.apply(lambda x: a)          # then fill every row with the base set
print(df)
Output:
length b c a
0 1.0 {apple, eats, tom} {apple, eats, tom} {apple, eats, tom}
1 0.5 {eats, tom, pineapple} {eats, tom} {apple, eats, tom}
2 0.5 {apple, eats, eva} {apple, eats} {apple, eats, tom}
3 0.2 {eats, pineapple, eva} {eats} {apple, eats, tom}
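The length column is the Jaccard similarity, |a ∩ b| / (|a| + |b| - |a ∩ b|), which equals |a ∩ b| / |a ∪ b|. The same steps can be wrapped in one reusable function; a sketch (the name jaccard_to_base is mine, and it assumes the punkt and stopwords NLTK data are already downloaded):

import nltk
import pandas as pd
from nltk.corpus import stopwords

def jaccard_to_base(base, sentences):
    # base: list of words; sentences: iterable of raw strings
    stop_words = set(stopwords.words('english'))
    a = {w.lower() for w in base}
    def score(sentence):
        tokens = nltk.word_tokenize(sentence)
        b = {t.lower() for t in tokens if t.lower() not in stop_words}
        return len(a & b) / len(a | b)
    return pd.Series(sentences).apply(score)

print(jaccard_to_base(['Tom', 'eats', 'apple'],
                      ["Tom eats an apple",
                       "Tom eats a pineapple",
                       "Eva eats an apple",
                       "Eva eats a pineapple"]))
# 0    1.0
# 1    0.5
# 2    0.5
# 3    0.2

This avoids building the intermediate b, c, and length objects by hand and returns one score per row, which scales fine to 500+ rows.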