Search code examples
pythonregexnlpspacy

Spacy find text before specified word


I'm working on the sentence "Hall is a Tony Award winner and Grammy nominee" and would like to extract the awards won (here, "Tony Award") using spaCy's rule-based Matcher, but I can't seem to tell spaCy to look for the words that come before "winner". Is that possible? If so, how could one go about it?

# Load the spaCy pipeline shipped as the `en_core_web_sm` package; the
# `matching` function below uses it for its vocab and to re-parse sentences.
nlp = en_core_web_sm.load()

# Token pattern for the spaCy Matcher, one dict per token position.
# NOTE: positions are matched left to right, so the mandatory wildcard in the
# last position consumes tokens that come AFTER "winner"/"recipient"; the
# first two positions are optional and only match whitespace-like text or
# punctuation in front of the keyword.
awards_lexical = [
    # Raw string: '\s' is not a valid Python string escape.
    {'TEXT': {'REGEX': r'\s*'}, 'OP': '*'},
    {'IS_PUNCT': True, 'OP': '*'},
    {'TEXT': {'REGEX': '^(winner|recipient)$'}},
    {'OP': '+'},
]
def matching(doc, pattern):
    """Return the text of the last pattern match found in each sentence.

    Args:
        doc: Parsed spaCy document; its sentences are read from ``doc.sents``.
        pattern: Token pattern (list of token-attribute dicts) registered
            with the Matcher.

    Returns:
        A list of strings, one for every sentence that produced at least one
        match. Sentences without a match contribute nothing.
    """
    # The matcher depends only on the vocab and the pattern, so build it once
    # rather than once per sentence.
    matcher = Matcher(nlp.vocab)
    matcher.add("matching", None, pattern)

    result = []
    for sent in doc.sents:
        # The sentence is re-parsed as a standalone document, so the match
        # offsets count from the start of the sentence and can be used to
        # slice `sent` (assuming the re-parse tokenizes it identically).
        matches = matcher(nlp(str(sent)))
        if matches:
            # Only the last reported match of the sentence is kept.
            _, start, end = matches[-1]
            result.append(sent[start:end].text)

    return result

import json

limit = 500  # maximum number of CSV rows to process
count = 0    # rows actually processed; printed once the loop ends

# Both files are managed by `with`, so they are closed even if a row fails.
# Opening the output in 'w' mode already truncates it, so no separate
# truncation step is needed. `newline=''` is what the csv module asks for
# when reading, so quoted fields containing line breaks are parsed correctly.
with open('Matheus_Schmitz_hw02_bios.csv', encoding='utf-8', newline='') as bios_file:
    csv_reader = csv.reader(bios_file)
    with open('hw2_lexical.jl', 'w', encoding='utf-8') as hw2_lexical:
        # Every row is unpacked as exactly two fields: url and bio.
        for url, bio in tqdm(csv_reader, total=limit):
            count += 1
            result = {
                'url': url,
                'awards': matching(nlp(bio), awards_lexical),
            }
            # One JSON object per line; str(dict) would emit a Python repr
            # (single quotes), which JSON parsers reject.
            hw2_lexical.write(json.dumps(result, ensure_ascii=False) + '\n')
            if count >= limit:
                break
print(count)

From my code, I'd expect spaCy to include any text before the chosen word, but all the variations I've tried only give me the text from winner|won|awarded onwards, not the text before it, which is where the prize name most often is.


Solution

  • Your idea seems valid: you can extract one or more capitalized words followed by winner or recipient using

    import spacy
    from spacy.matcher import Matcher
    
    # Sentence to analyse and the pipeline used to parse it.
    text = "Hall is a Tony Award winner and Grammy nominee"
    nlp = spacy.load("en_core_web_lg")
    # One or more proper-noun tokens, then a token that is exactly
    # "winner" or "recipient".
    award_pattern = [
        {'POS': 'PROPN', 'OP': '+'},
        {'TEXT': {'REGEX': '(?i)^(?:winner|recipient)$'}},
    ]
    matcher = Matcher(nlp.vocab)
    matcher.add("Winner", None, award_pattern)
    doc = nlp(text)
    # Each match is a (match_id, start, end) triple of token offsets into doc.
    candidate_spans = [doc[begin:stop] for _match_id, begin, stop in matcher(doc)]
    # filter_spans drops overlapping candidates (per spaCy's docs the longest
    # span wins), so only the full award phrase is printed.
    for award_span in spacy.util.filter_spans(candidate_spans):
        print(award_span.text)
    # => Tony Award winner
    

    The (?i)^(?:winner|recipient)$ regex, used for the right-hand token in the pattern, matches a token that is exactly winner or recipient, case-insensitively.