The idea is that I'm using Named Entity Recognition (NER) on a tokenised text which is also tagged.
def make_tag_lists(sents):
    """Flatten tagged sentences into three parallel lists.

    Args:
        sents: Iterable of sentences, each an iterable of token objects
            exposing ``text``, ``pos_`` and ``ent_type_`` attributes
            (presumably spaCy tokens -- confirm against the caller).

    Returns:
        A ``(tokens, pos, ner)`` tuple of equal-length lists holding, for
        every token in reading order, its text, its ``pos_`` value and its
        ``ent_type_`` value.
    """
    tokens = []
    pos = []
    ner = []
    for sent in sents:
        for t in sent:
            # Append to all three lists together so index i always refers
            # to the same token in each of them.
            tokens.append(t.text)
            pos.append(t.pos_)
            ner.append(t.ent_type_)
    # Return only after every sentence has been visited.
    return tokens, pos, ner
# Build the parallel token / POS / NER lists for the whole sample.
(tokens, pos, ner) = make_tag_lists(sample)
def extract_entities(tokenlist, taglist, tagtype):
    """Count the entities of one tag type in a tagged token sequence.

    Consecutive tokens whose tag equals ``tagtype`` are joined with single
    spaces into one entity string. Every such run is counted, including a
    run that extends to the very end of the input.

    Args:
        tokenlist: Sequence of token strings.
        taglist: Sequence of tags, parallel to ``tokenlist``. The two are
            paired with ``zip``, so extra items in the longer one are
            ignored.
        tagtype: The tag value that marks tokens belonging to an entity.

    Returns:
        A dict mapping each entity string to the number of times it occurs.
    """
    entities = {}
    current = []  # tokens of the entity currently being assembled

    for token, tag in zip(tokenlist, taglist):
        if tag == tagtype:
            current.append(token)
            continue
        if current:
            # A different tag closes the entity that was being built.
            name = " ".join(current)
            entities[name] = entities.get(name, 0) + 1
            current = []

    if current:
        # The input ended while still inside an entity; without this final
        # flush the last entity would be silently dropped.
        name = " ".join(current)
        entities[name] = entities.get(name, 0) + 1

    return entities
# Count every PERSON entity, rank the (entity, count) pairs by count from
# highest to lowest, and keep the first 20.
people = extract_entities(tokens, ner, "PERSON")
ranked_people = sorted(people.items(), key=operator.itemgetter(1), reverse=True)
top_people = ranked_people[:20]
print(top_people)
What I should be receiving is a list of the top 20 most commonly referred to people, but my output is currently an empty list. There are no syntax errors and I'm not sure where I've gone wrong.
I suggest you try skipping the first block of your code and checking the remaining execution flow.
# tokens,pos,ner = make_tag_lists(sample)
# Hand-written (token, tag) pairs stand in for the tagger output so the rest
# of the flow can be checked on its own.
_tagged = [
    ('Hi', 'MISC'),
    ('FOO', 'PERSON'),
    ('BAR', 'PERSON'),
    ("it's", 'MISC'),
    ('ME', 'PERSON'),
]
tokens = [token for token, _ in _tagged]
ner = [tag for _, tag in _tagged]
def extract_entities(tokenlist, taglist, tagtype):
    """Count the entities of one tag type in a tagged token sequence.

    Consecutive tokens whose tag equals ``tagtype`` are joined with single
    spaces into one entity string. An entity is recorded only when a token
    with a different tag follows it, so a run that extends to the end of
    the input is NOT counted.

    Args:
        tokenlist: Sequence of token strings.
        taglist: Sequence of tags, parallel to ``tokenlist``. The two are
            paired with ``zip``, so extra items in the longer one are
            ignored.
        tagtype: The tag value that marks tokens belonging to an entity.

    Returns:
        A dict mapping each recorded entity string to its occurrence count.
    """
    entities = {}
    entity = None  # None means "not currently inside an entity"
    for token, tag in zip(tokenlist, taglist):
        if tag == tagtype:
            # Start a new entity or extend the one in progress.
            entity = token if entity is None else entity + " " + token
        elif entity is not None:
            # A different tag closes the entity in progress.
            entities[entity] = entities.get(entity, 0) + 1
            entity = None
    # NOTE: nothing is flushed here, so an entity still open when the loop
    # ends is dropped.
    return entities
# Run the counter on the sample data, rank the (entity, count) pairs by
# count from highest to lowest, and keep the first 20.
people = extract_entities(tokens, ner, "PERSON")
ranked_people = sorted(people.items(), key=operator.itemgetter(1), reverse=True)
top_people = ranked_people[:20]
print(top_people)
The outcome of this example is [('FOO BAR', 1)].
Furthermore, please note that you are missing the last PERSON entity ('ME'): the input ends while the loop is still inside that entity, so it is never added to the entities dictionary.