Search code examples
python, python-3.x, regex, regex-group, regexp-replace

Extract the year and month from these strings and then replace the patterns found with others


import re, datetime
from calendar import monthrange

# Example inputs. Each assignment overwrites the previous one, so only the
# last uncommented example (example 5) is actually processed when the script runs.
input_text_substring = "los juegos se jugaran durante el transcurso del mes de octubre" #example 1
input_text_substring = "empiezan durante el transcurso del mes de febrero del año 2020" #example 2
input_text_substring = "empiezan durante el periodo del mes de septiembre" #example 3
input_text_substring = "empezaran durante el transcurso del mes de febrero del 2023" #example 4
input_text_substring = "creo que empezarian durante el transcurso del mes de diciembre 2021" #example 5
# Maps each Spanish month name to its zero-padded two-digit month number (as a string).
es_month_dict = {"enero": "01", "febrero": "02", "marzo": "03", "abril": "04", "mayo": "05", "junio": "06", "julio": "07", "agosto": "08", "septiembre": "09", "octubre": "10", "noviembre": "11", "diciembre": "12"}

# Assume the current year when the text does not state one explicitly.
# A year counts as "stated" when the text contains "del"/"de el" + "año"/"ano"
# (optionally followed by digits) or when the text ends with four digits.
# "[\s|]*" matches any run of whitespace and/or literal "|" characters.
if not re.search(r"(?:(?:del|de el)[\s|]*(?:año|ano)[\s|]*\d*|.*\d{4}$)", input_text_substring):
    input_text_substring += " de " + datetime.datetime.today().strftime('%Y') + " "

# Identify the year and the month in the text.
# NOTE: the two assignments below are unfinished placeholders (nothing follows
# the "="), so the script does not parse until they are filled in.
identified_year = # TODO: extract the year from input_text_substring
identified_month = # TODO: extract the month number from input_text_substring
# calendar.monthrange returns (weekday of the first day, number of days in the month);
# index 1 is therefore the last day of the month for that year.
last_day_in_this_month = (monthrange(int(identified_year), int(identified_month)))[1]

# Replacement text of the form "[01 -- <last day>] de <month>".
# NOTE(review): the expected outputs listed after the script show no "de" and
# include the year after the month -- confirm the intended format.
time_period_in_this_month = "[01 -- " + str(last_day_in_this_month) + "] de " + str(identified_month)

# Alternation of all Spanish month names, spliced into the pattern below.
months = r"enero|febrero|marzo|abril|mayo|junio|julio|agosto|septiembre|octubre|noviembre|diciembre"
# Two alternatives:
#   1) "en|durante" + "el|los" + "transcurso|trancurso|periodo|dias" + "del|de" + "mes de|mes" + <month>
#   2) "durante" + optional "el mes de" / "el mes" + <month>
# The pattern ends at the month name, so any year text that follows is not consumed.
pattern_to_replace = r"(?:(?:en|durante)[\s|]*(?:el|los)[\s|]*(?:transcurso|trancurso|periodo|dias)[\s|]*(?:del|de)[\s|]*(?:mes[\s|]*de|mes)[\s|]*(?:" + months + r")|durante[\s|]*(?:el[\s|]*mes[\s|]*de|el[\s|]*mes|)[\s|]*(?:" + months + r"))"

# Replace every match of the pattern with the computed day range for the month.
input_text_substring = re.sub(pattern_to_replace, time_period_in_this_month, input_text_substring)


print(repr(input_text_substring)) # show the resulting string, quotes and trailing spaces included

The correct outputs that must be obtained for each of the examples:

input_text_substring = "los juegos se jugaran [01 -- 31] 10 2022" #example 1
input_text_substring = "empiezan [01 -- 29] 02 2020" #example 2
input_text_substring = "empiezan [01 -- 30] 09 2022" #example 3
input_text_substring = "empezaran [01 -- 28] 02 2023" #example 4
input_text_substring = "creo que empezarian [01 -- 31] 12 2021" #example 5

How should I extract the month and the year so that I can pass them to monthrange(int(identified_year), int(identified_month)), have it return the number of days of that month in that year, and then replace the matched pattern in the original string to obtain these outputs?


Solution

  • To find the month, you can simply check which key of the month dictionary occurs in the string. To find the year, you can use a regex to extract it from the string.

    Example:

    # Month: number of the first Spanish month name (a key of es_month_dict)
    # that occurs anywhere in the text; None when no month name is present.
    identified_month = next(
        (number for name, number in es_month_dict.items() if name in input_text_substring),
        None,
    )
    # Year: first standalone four-digit number in the text. Requiring digits
    # avoids picking up arbitrary four-character tokens that merely contain "20".
    year_match = re.search(r"\b\d{4}\b", input_text_substring)
    identified_year = year_match.group() if year_match else None
    # Fail with a clear message instead of an unbound-name error further down.
    if identified_month is None or identified_year is None:
        raise ValueError("could not identify month and year in: %r" % input_text_substring)
    # Python 3 print call; output format is unchanged ("MM   YYYY").
    print(identified_month, ' ', identified_year)
    

    The preceding code produces the following outputs:

    10   2022
    02   2020
    09   2022
    02   2023
    12   2021