import regex
# Minimal reproduction: both patterns contain the same two-word phrase and
# differ only in which single-word alternative is listed first.
product_detail = "yyy target1 target2 xxx".lower()
p1 = r"\btarget1\b|\btarget1 target2\b"
p2 = r"\btarget2\b|\btarget1 target2\b"
for pattern in [p1, p2]:
    # `overlapped=True` is an extension of the third-party `regex` module;
    # the standard-library `re.findall` does not accept it.
    # NOTE: alternatives are tried left to right and an overlapped search
    # reports at most one match per start position, so in p1 the shorter
    # `target1` alternative wins at the position where the phrase also starts.
    matches = regex.findall(pattern, product_detail, overlapped=True)
    print(matches)
Why do the matches from p1 only give ['target1'] as output, without 'target1 target2', while the matches from p2 successfully give ['target1 target2', 'target2'] as output?
Also, if you can provide a fix, how do I generalise it? I have a list of 10,000 target words, and it's not going to be feasible to hard-code them.
Here is an example of what I had in mind with my comment on building a list of patterns separating common prefixes:
import regex # I'm actually using re (don't have regex)
from itertools import accumulate, groupby, zip_longest


def build_patterns(keywords):
    """Split *keywords* into alternation patterns that keep prefixes apart.

    A keyword that starts with another keyword (``"target1 target2"`` starts
    with ``"target1"``) can never be reported by the same non-overlapping
    ``findall`` pass as its prefix, because both would begin at the same
    position.  Such keywords are therefore spread over separate patterns:
    the first pattern holds the shortest member of every prefix family, the
    second pattern the next member of each family, and so on.

    Args:
        keywords: iterable of literal keyword strings.  It is not mutated;
            empty strings and duplicates are ignored.

    Returns:
        A list of regex pattern strings of the form ``\\bk1\\b|\\bk2\\b|...``
        (an empty list when there are no keywords).

    Limitations:
        * Only prefix relationships are separated.  Keywords that overlap in
          another way (e.g. ``"a b"`` and ``"b c"`` in the text ``"a b c"``)
          share a pattern and can still hide each other.
        * The ``\\b`` anchors only behave as intended for keywords that begin
          and end with a word character.
    """
    # Sorting places every keyword directly after the keywords it extends.
    ordered = sorted({kw for kw in keywords if kw})

    # For each keyword, carry forward the "leader" of its prefix family: the
    # current leader while the keyword still starts with it, otherwise the
    # keyword itself becomes the leader of a new family.
    leaders = accumulate(
        ordered, lambda lead, kw: lead if kw.startswith(lead) else kw
    )
    families = [
        [kw for _, kw in members]
        for _, members in groupby(zip(leaders, ordered), key=lambda pair: pair[0])
    ]

    # Transpose: the n-th pattern takes the n-th member of every family, so
    # no pattern contains two keywords from the same family.  Keywords are
    # escaped so regex metacharacters in them are matched literally.
    return [
        r"\b"
        + r"\b|\b".join(regex.escape(kw) for kw in tier if kw is not None)
        + r"\b"
        for tier in zip_longest(*families)
    ]


product_detail = "yyy target1 target2 xxx".lower()
keywords = ["target1", "target2", "target1 target2", "target3"]
patterns = build_patterns(keywords)
# [r'\btarget1\b|\btarget2\b|\btarget3\b', r'\btarget1\ target2\b']
for pattern in patterns:
    matches = regex.findall(pattern, product_detail)
    print(matches)
output:
['target1', 'target2']
['target1 target2']