python, pandas, tokenize

How to iterate a function with strings over a pandas dataframe


I want to get the Jaccard similarity between each row of my dataframe and the base. The issue is I need it for 500+ rows, and I either get the error "too many values to unpack", the error "'Series' object has no attribute 'iterrows'", or the function compares the base with the dataframe as a whole instead of row by row.

Alternative A:

sentences = pd.Series(df.sentence)
sentences = sentences.str.replace('[^A-z ]','').str.replace(' +',' ').str.strip()
splitwords = [ nltk.word_tokenize( str(sentence) ) for sentence in sentences ]
print(splitwords)
sentence = df.sentence
def Jaccard_Similarity(base, sentence):
    for i, row in sentence.iterrows():
        a = set(word for word in base)
        b = set(word for word in df.sentence())
        c = a.intersection(b)
        return(float(len(c)) / (len(a) + len(b) - len(c)), a, b)
Jaccard_Similarity(base, sentence)

Alternative B:

df = df.apply(lambda row: nltk.word_tokenize(row['sentence']), axis=1)
print(df)

def Jaccard_Similarity(bas, df):
    for row in df.iterrows(df):
        a = set(word for word in base)
        b = set(word for word in df)
        c = a.intersection(b)
        return(float(len(c)) / (len(a) + len(b) - len(c)), a, b)
Jaccard_Similarity(base, df)

Data:

base = ['Tom', 'eats', 'apple']    
df = pd.DataFrame([["Tom eats an apple"],
                   ["Tom eats a pineapple"],
                   ["Eva eats an apple"],
                   ["Eva eats a pineapple"]],
                  columns=['sentence'])

EDIT:

def Jaccard_Similarity(base, sentence):
    base = set(base.lower().split())
    sentence = set(sentence.lower().split())
    intersection = base.intersection(sentence)
    union = base.union(sentence)
    return float(len(intersection)) / len(union)
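
This works when I pass it two plain strings (joining the base list into one string), e.g.:

Jaccard_Similarity('Tom eats apple', 'Tom eats an apple')  # returns 0.75

but I can't work out how to apply it to every row of the dataframe.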

Solution

  • Try this - tokenize each sentence, lowercase the tokens, drop the English stopwords, and then compare each row's token set with the base set (Jaccard = size of intersection / size of union):

    import nltk
    import numpy as np
    import pandas as pd
    from nltk.corpus import stopwords  # to remove stopwords
    # if needed on first run: nltk.download('punkt'); nltk.download('stopwords')

    base = ['Tom', 'eats', 'apple']
    base = [item.lower() for item in base]
    stop_words = set(stopwords.words('english'))
    list1 = [["Tom eats an apple"],
             ["Tom eats a pineapple"],
             ["Eva eats an apple"],
             ["Eva eats a pineapple"]]
    df = pd.DataFrame(list1, columns=['sentence'])

    # tokenize each sentence, lowercase the tokens and drop stopwords
    df = df.sentence.apply(nltk.word_tokenize)
    df = df.apply(
        lambda x: [item.lower() for item in x if item.lower() not in stop_words]
    )

    # b: set of tokens per row, a: set of tokens in the base
    b = df.apply(set)
    a = set(base)
    # c: intersection of the base set with each row's set
    c = b.apply(lambda x: a.intersection(x))

    # Jaccard similarity = |c| / (|a| + |b| - |c|), i.e. intersection / union
    len_a_b = b.apply(lambda x: len(x) + len(a))
    len_c = c.apply(lambda x: len(x))
    dict1 = {'length': len_c / (len_a_b - len_c), 'b': b, 'c': c}
    df = pd.DataFrame(dict1)
    df['a'] = np.nan
    df['a'] = df.a.apply(lambda x: a)  # repeat the base set in every row for reference
    print(df)
    
    

    output -

       length                       b                   c                   a
    0     1.0      {apple, eats, tom}  {apple, eats, tom}  {apple, eats, tom}
    1     0.5  {eats, tom, pineapple}         {eats, tom}  {apple, eats, tom}
    2     0.5      {apple, eats, eva}       {apple, eats}  {apple, eats, tom}
    3     0.2  {eats, pineapple, eva}              {eats}  {apple, eats, tom}
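
    If you prefer one named helper over the chained apply calls, the same length column can be computed row by row. This is a small sketch reusing the a (base set) and b (per-row token sets) from above - the jaccard helper is just a plain function for illustration, not a pandas method:

    def jaccard(row_set, base_set):
        # Jaccard similarity = |intersection| / |union|
        intersection = row_set & base_set
        union = row_set | base_set
        return len(intersection) / len(union)

    scores = b.apply(lambda s: jaccard(s, a))
    print(scores)  # 1.0, 0.5, 0.5, 0.2 - same values as the length column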