I want to extract the contract length from free text and convert it to a term in months. The free-text fields range from:
"2 x 5 year terms",
"3 further x 4 years",
"two(2) further terms of five(5) years each",
"Two (2) Years + Two (2) Years + Two (2) Years",
"1 years + 1 years + 1 years" ,
"2 x 3 years",
"1 year and 6 months",
"
I'd like the output to be:
120 months,
144 months,
120 months,
72 months,
36 months
72 months
18 months
import re
# Tokens that matter for the arithmetic: a number, a unit, or a connector.
# The letter lookarounds stop "x", "of" and "and" from matching inside longer
# words (e.g. "next", "office") while still accepting "2x5" and "5years".
_DURATION_TOKEN = re.compile(
    r'(\d+)'
    r'|(?<![a-z])(years?|months?)(?![a-z])'
    r'|((?<![a-z])(?:x|of|and)(?![a-z])|\+)'
)


def calculate_duration(term):
    """Return the total contract length described by *term*, in months.

    The text is scanned for numbers, the units "year(s)" / "month(s)" and the
    connectors "x" / "of" (multiply) and "+" / "and" (add); every other word
    is ignored.  A year unit scales the running product by 12, so
    "2 x 5 year terms" is 2 * 5 * 12 = 120 and "1 year and 6 months" is
    12 + 6 = 18.  Matching is case-insensitive.

    Args:
        term: Free-text description such as "Two (2) Years + Two (2) Years".

    Returns:
        The number of months as an int, or None when the text contains no
        number followed by a unit, or when two numbers appear back to back
        with nothing to say how they combine.
    """
    total = 0             # sum of the additive groups already closed
    product = None        # running product of the group being read
    connector = None      # '*' or '+' waiting for its right-hand number
    after_number = False  # True only while the previous token was a number
    seen_unit = False     # at least one number was followed by a unit

    for number, unit, joiner in _DURATION_TOKEN.findall(term.lower()):
        if number:
            value = int(number)
            if product is None:
                product = value
            elif connector == '*':
                product *= value
            elif connector == '+' or not after_number:
                # Explicit "+" / "and", or a new quantity right after a unit
                # ("1 year 6 months"): close the group and start another.
                total += product
                product = value
            else:
                # Two bare numbers in a row are ambiguous.
                return None
            connector = None
            after_number = True
        elif unit:
            # A unit only counts when it directly follows a number; stray
            # unit words elsewhere in the text are ignored.
            if after_number:
                if unit.startswith('year'):
                    product *= 12
                seen_unit = True
            after_number = False
        elif product is not None:
            # Connectors before the first number ("further terms of 5 years")
            # have no left operand and are skipped.
            connector = '+' if joiner in ('+', 'and') else '*'
            after_number = False

    if product is None or not seen_unit:
        return None
    return total + product
# Example usage: run each sample phrase through calculate_duration and print
# the phrase next to the value it returned.
terms = [
    "2 x 5 year terms",
    "3 further x 4 YEAR terms",
    "Two (2) Years + Two (2) Years + Two (2) Years",
    "1 years + 1 years + 1 years",
    "2 x 3 years",
]
# map() is lazy, so each phrase is still converted right before it is printed.
for term, duration in zip(terms, map(calculate_duration, terms)):
    print(f"{term}: {duration} months")
"... I want to extract contract length from text to term in months. ..."
Utilize the eval built-in function.
Traverse the text, appending the corresponding values: numbers and operators.
When a "year" value is encountered, adjust the previous value accordingly, i.e. multiply it by 12.
From here, produce a mathematical expression, by concatenating the values.
Here is an example.
import re
def parse(s: str) -> list:
    """Turn a free-text contract term into a flat list of operands and operators.

    The text is lower-cased and split on whitespace.  Each word is handled as
    follows:

    * a word containing digits becomes an int built from those digits
      ("two(2)" -> 2); if the same word also contains "year" ("5-year") the
      value is converted to months straight away;
    * a word containing "year" multiplies the preceding number by 12;
    * "x" / "of" become '*', and "+" / "and" become '+';
    * anything else is ignored.

    Args:
        s: Free-text description such as "2 x 5 year terms".

    Returns:
        A list of ints and the strings '*' / '+', in reading order, e.g.
        [2, '*', 60].
    """
    e = []
    for word in s.lower().split():
        digits = re.sub(r'\D', '', word)
        if digits:
            value = int(digits)
            if 'year' in word:
                value *= 12
            e.append(value)
        elif 'year' in word:
            # Scale only a real number.  With nothing before it, or with an
            # operator before it, indexing or multiplying e[-1] would raise
            # or silently repeat the operator string twelve times.
            if e and isinstance(e[-1], int):
                e[-1] *= 12
        elif word in ('x', 'of'):
            e.append('*')
        elif word in ('+', 'and'):
            e.append('+')
    return e
# Sample contract-term phrasings to run through parse.
text = [
    '2 x 5 year terms',
    '3 further x 4 years',
    'two(2) further terms of five(5) years each',
    'Two (2) Years + Two (2) Years + Two (2) Years',
    '1 years + 1 years + 1 years',
    '2 x 3 years',
    '1 year and 6 months',
]
# NOTE: eval only ever sees text assembled from the items parse returned
# (numbers and '*' / '+' markers), never the raw input string.  Do not pass
# unparsed user text to eval.
for string in text:
    exp = ' '.join(str(part) for part in parse(string))
    print(exp, '=', eval(exp))
Output
2 * 60 = 120
3 * 48 = 144
2 * 60 = 120
24 + 24 + 24 = 72
12 + 12 + 12 = 36
2 * 36 = 72
12 + 6 = 18
Edit
A similar approach would be to remove every token that takes no part in the calculation, i.e., keeping only "years", "of", "and", digits, and the characters 'x' and '+'.
import re
# Deletes parentheses and every word that takes no part in the arithmetic.
# A word survives only if it starts with "year", "of", "and", "x" or a digit.
a = re.compile(r'(?i)\b(?!(?:years?)|of|and|\d|x|\+)\w+|[()]')
# Deletes whitespace runs that start the string or follow other whitespace,
# and trailing spaces, left behind once the words above are removed.
b = re.compile(r'(?<!\S)[ \t]+| +$')


def parse(s: str) -> str:
    """Reduce a free-text contract term to an arithmetic expression in months.

    Irrelevant words and parentheses are stripped first, then the remaining
    tokens are read left to right: numbers are kept, "year" / "years"
    multiplies the preceding number by 12, "x" / "of" become '*', and
    "+" / "and" become '+'.  Hyphens are treated as spaces so "5-year" is
    read as "5 year".

    Args:
        s: Free-text description such as "two(2) further terms of five(5)
            years each".

    Returns:
        The tokens joined by single spaces, e.g. '2 * 60'.  The string is
        empty when nothing usable was found.
    """
    s = b.sub('', a.sub('', s).lower()).replace('-', ' ')
    e = []
    for x in s.split():
        if x.isdecimal():
            e.append(int(x))
        elif x in ('year', 'years'):
            # Scale only a real number.  With nothing before it, or with an
            # operator before it, indexing or multiplying e[-1] would raise
            # or silently repeat the operator string twelve times.
            if e and isinstance(e[-1], int):
                e[-1] *= 12
        elif x in ('x', 'of'):
            e.append('*')
        elif x in ('+', 'and'):
            e.append('+')
    return ' '.join(map(str, e))
# Sample contract-term phrasings to run through parse.
text = [
    '2 x 5 year terms',
    '3 further x 4 years',
    'two(2) further terms of five(5) years each',
    'Two (2) Years + Two (2) Years + Two (2) Years',
    '1 years + 1 years + 1 years',
    '2 x 3 years',
    '1 year and 6 months',
]
# NOTE: eval is given the string returned by parse, never the raw input.
# Do not pass unparsed user text to eval.
# map() is lazy, so each phrase is still parsed right before it is printed.
for string, exp in zip(text, map(parse, text)):
    print(exp, '=', eval(exp))