I'm working on the sentence "Hall is a Tony Award winner and Grammy nominee" and would like to extract the awards won (`Tony Award`) using spaCy's rule-based Matcher, but I can't seem to tell spaCy to look for the words that come before `winner`. Is that possible? If so, how could one go about it?
# Load the English pipeline used by everything below.
# NOTE(review): assumes `import en_core_web_sm` appears earlier in the script — it is not shown here.
nlp = en_core_web_sm.load()
# Token pattern handed to spaCy's Matcher: optional leading tokens, optional
# punctuation, the trigger word, then at least one trailing token.
awards_lexical = [
    # Raw string: '\s' in a plain literal is an invalid escape sequence and
    # triggers a warning on recent Python versions; the value is unchanged.
    # NOTE(review): `\s*` can match the empty string, so this presumably
    # accepts any token rather than only whitespace — confirm intent.
    {'TEXT': {'REGEX': r'\s*'}, 'OP': '*'},
    {'IS_PUNCT': True, 'OP': '*'},
    # The trigger token must be exactly "winner" or "recipient".
    {'TEXT': {'REGEX': '^(winner|recipient)$'}},
    # Wildcard: one or more tokens of any kind after the trigger word.
    {'OP': '+'},
]
def matching(doc, pattern):
    """Return the text of the last pattern match in each sentence of *doc*.

    Args:
        doc: A spaCy ``Doc`` with sentence boundaries; it is walked one
            sentence at a time via ``doc.sents``.
        pattern: A single Matcher token pattern (a list of token-attribute
            dicts).

    Returns:
        A list of strings, one for every sentence that produced at least one
        match. Sentences without a match contribute nothing.
    """
    # The matcher does not depend on the sentence, so build it once instead
    # of once per sentence.
    matcher = Matcher(nlp.vocab)
    matcher.add("matching", None, pattern)

    result = []
    for sent in doc.sents:
        # as_doc() copies the sentence's existing tokens into a standalone
        # Doc, so the match offsets line up with `sent` and the pipeline is
        # not run a second time on the same text.
        matches = matcher(sent.as_doc())
        if matches:
            # Only the last match reported for the sentence is kept.
            _, start, end = matches[-1]
            result.append(sent[start:end].text)
    return result
limit = 500  # maximum number of CSV rows to process
count = 0    # rows actually processed; stays 0 when the CSV has no rows

# Both files are opened in one `with` so they are closed even if a row fails.
# Opening the output with mode 'w' already truncates it, so no separate
# truncate step is needed. The output is written as UTF-8 to match the input.
# NOTE(review): each record is written as a Python dict repr (str(result)),
# not as JSON, despite the .jl extension — confirm the consumer expects this.
with open('Matheus_Schmitz_hw02_bios.csv', encoding='utf-8', newline='') as bios_file, \
        open('hw2_lexical.jl', 'w', encoding='utf-8') as hw2_lexical:
    csv_reader = csv.reader(bios_file)
    # Each row is expected to hold exactly two columns: (url, bio).
    for count, (url, bio) in tqdm(enumerate(csv_reader, start=1), total=limit):
        result = {'url': url, 'awards': matching(nlp(bio), awards_lexical)}
        hw2_lexical.write(str(result) + '\n')
        if count >= limit:
            break

print(count)
From my code, I'd think that spaCy would include any text before the chosen word, but all the variations I've tried only give me the text from `winner|won|awarded` onwards, not the text before it, which is where the prize name most often is.
Your idea seems valid: you can extract one or more proper nouns (typically capitalized words) followed by `winner` or `recipient` using:
import spacy
from spacy.matcher import Matcher
text = "Hall is a Tony Award winner and Grammy nominee"

nlp = spacy.load("en_core_web_lg")

# One or more proper-noun tokens immediately followed by the trigger word.
award_pattern = [
    {'POS': 'PROPN', 'OP': '+'},
    {'TEXT': {'REGEX': '(?i)^(?:winner|recipient)$'}},
]
matcher = Matcher(nlp.vocab)
matcher.add("Winner", None, award_pattern)

doc = nlp(text)
matches = matcher(doc)

# Turn every (match_id, start, end) triple into the Span it covers.
spans = []
for _match_id, start, end in matches:
    spans.append(doc[start:end])

# filter_spans drops overlapping candidates, preferring the longest one.
for award in spacy.util.filter_spans(spans):
    print(award.text)
# => Tony Award winner
The `(?i)^(?:winner|recipient)$` regex, used as the right-hand token in the pattern, matches a whole `winner` or `recipient` token in a case-insensitive way.