I would like to match a string (an n-gram) in a text and get its offsets:
# Phrase (n-gram) to look for inside `text`.
string_to_match = "many workers are very underpaid"
# Text that is searched for the phrase.
text = "The new york times claimed in a report that many workers are very underpaid in some africans countries."
As a result, I want to get a tuple like this: ("matched", 44, 75),
where 44 is the start offset and 75 is the end offset of the occurrence.
Here is the code I have built, but it only works for unigrams.
def extract_offsets(line, _len=len):
    """Return the character offsets of every whitespace-separated word.

    Args:
        line: Text to tokenize with ``str.split()``.
        _len: Callable used to measure each word; defaults to the builtin
            ``len``.

    Returns:
        A list of ``("matched", start, end)`` tuples in order of appearance,
        where ``start`` is the index of the word's first character and
        ``end`` is the index of its last character (inclusive).
    """
    offsets = []
    running_offset = 0
    for word in line.split():
        # Search from the end of the previous word so that a repeated word
        # maps to its next occurrence instead of always the first one.
        word_offset = line.index(word, running_offset)
        running_offset = word_offset + _len(word)
        offsets.append(("matched", word_offset, running_offset - 1))
    return offsets
def get_entities(offsets, label="string_to_match"):
    """Select the offset tuples whose first element equals ``label``.

    Args:
        offsets: Iterable of tuples whose first element is compared with
            ``label`` (for example ``(label, start, end)``).
        label: Value the first element must equal. Defaults to the literal
            ``"string_to_match"`` that was previously hard-coded here.

    Returns:
        A new list with the matching tuples, in their original order.
    """
    return [elm for elm in offsets if elm[0] == label]
# Compute per-word offsets for `text`, then filter them with get_entities.
# NOTE(review): extract_offsets labels every tuple "matched" while
# get_entities compares against the literal "string_to_match", so as written
# this filter selects nothing - confirm which label is intended.
offsets = extract_offsets(text)
entities = get_entities(offsets) # [("matched", start, end)]
Any tips to make this work for a sequence of strings or n-grams?
You can use re.finditer()
and call the span()
method on each match object to get the beginning and ending indices of the matched substring:
def m(string_to_match="many workers are very underpaid",
      text="The new york times claimed in a report that many workers are very underpaid in some africans countries."):
    """Find every literal occurrence of ``string_to_match`` in ``text``.

    Each occurrence is printed as the matched substring followed by its
    ``(start, end)`` span, and is also collected into the returned list.

    Args:
        string_to_match: Phrase (n-gram) to look for. It is matched
            literally, not interpreted as a regular expression.
        text: Text to search.

    Returns:
        A list of ``("matched", start, end)`` tuples, one per
        non-overlapping occurrence, where ``start`` is the index of the
        first matched character and ``end`` is one past the last.
    """
    import re  # local import: the snippet has no module-level import block

    if not string_to_match:
        # An empty pattern would match at every position; report nothing.
        return []

    # Escape the phrase so characters such as "." or "(" match literally.
    pattern = re.escape(string_to_match)
    spans = []
    for match in re.finditer(pattern, text):
        # match.span() is the (start, end) index pair of the matched substring.
        print(match.group(0), match.span())
        spans.append(("matched",) + match.span())
    return spans