Could you please explain why `verbs`, which I create with `defaultdict`, is reported as not defined in the following code? Googling tells me that the syntax is fine. Please help! Thanks in advance :)
import spacy
from collections import defaultdict
nlp = spacy.load("en_core_web_sm")
!pip install pandas==0.24.2 --user
import pandas as pd
def calculate_the_word_types(df):
# Asker's snippet as posted; its original indentation is not preserved here.
# NOTE(review): the default factory passed here is this function itself, so
# a missing key would call calculate_the_word_types() with no arguments.
verbs = defaultdict(calculate_the_word_types)
for i, row in df.iterrows():
doc = nlp(row["text"])
# Tally the lemma of every token tagged AUX or VERB.
for v in map(lambda x: x.lemma_, filter(lambda x: (x.pos_ == 'AUX') | (x.pos_ == 'VERB'), doc)):
verbs[v] += 1
# NOTE(review): .at is invoked with parentheses here; pandas documents it as
# an indexer used with square brackets -- confirm against the pandas docs.
df.at(i, "nr_verb", len(list(map(lambda x: x.text,
filter(lambda x: (x.pos_ == 'AUX') | (x.pos_ == 'VERB'), doc)))))
return df
verbs
NameError Traceback (most recent call last)
<ipython-input-32-7e7c626bb331> in <module>()
13
14 for v in map(lambda x: x.lemma_, filter(lambda x: (x.pos_ == 'AUX') | (x.pos_ == 'VERB'), doc)):
---> 15 verbs[v] += 1
16 df.at(i, "nr_verb", len(list(map(lambda x: x.text,
17 filter(lambda x: (x.pos_ == 'AUX') | (x.pos_ == 'VERB'), doc)))))
NameError: name 'verbs' is not defined
The `set_value()` function is deprecated.
As a replacement, use the `.at` indexer with square brackets, like this: `df.at["YOURINDEX", "YOURCOLUMN"] = "YOURVALUE"`.
Also, there is an issue on this line: `verbs = defaultdict(calculate_the_word_types)`.
The default factory must return 0 (for example `defaultdict(int)` or `defaultdict(lambda: 0)`), since the dictionary acts as a counter.
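And fix your indentation: the traceback shows the `for` loop running at module level (`in <module>()`), outside the function, where the function-local name `verbs` does not exist. That is what raises the `NameError`.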
working code -
import spacy
from collections import defaultdict
nlp = spacy.load("en_core_web_sm")
!pip install pandas==0.24.2 --user
import pandas as pd
def calculate_the_word_types(df):
    """Annotate each row of *df* with the number of verbs in its text.

    Every value in the ``"text"`` column is parsed with the module-level
    spaCy pipeline ``nlp``. Tokens whose ``pos_`` is ``"VERB"`` or ``"AUX"``
    are treated as verbs, and the per-row total is written to the
    ``"nr_verb"`` column.

    Args:
        df: DataFrame with a ``"text"`` column. It is modified in place.

    Returns:
        The same DataFrame object, now carrying the ``"nr_verb"`` column.
    """
    # Lemma -> number of occurrences across the whole frame. ``int`` is the
    # idiomatic zero-producing factory for a counting defaultdict. The tally
    # is local to this call and is not returned.
    verbs = defaultdict(int)
    for i, row in df.iterrows():
        doc = nlp(row["text"])
        # Collect the verb lemmas once and reuse them for both the global
        # tally and the per-row count, instead of filtering the doc twice.
        lemmas = [token.lemma_ for token in doc if token.pos_ in ("AUX", "VERB")]
        for lemma in lemmas:
            verbs[lemma] += 1
        # ``.at`` is an indexer: assign through square brackets, not a call.
        df.at[i, "nr_verb"] = len(lemmas)
    return df
# Sample input: one short sentence per row in the "text" column
df = pd.DataFrame(
    {
        'text': [
            'hello there',
            'I love Tatooine',
            'I hate sands',
        ],
    }
)
# Run the verb counter and show the annotated frame
print(calculate_the_word_types(df))
Output -
text nr_verb
0 hello there 0.0
1 I love Tatooine 1.0
2 I hate sands 1.0