def find_valid_dates(dt):
result = re.findall("\d{1,2}-\d{2}-\d{2,4}|\d{1,2}\s(?:januari|februari|maart|april|mei|juni|juli|augustus|september|oktober|november|december)\s\d{1,4}", dt)
return result
SaaOne_msi_vervangen['valid_dates'] = SaaOne_msi_vervangen['Oplossingstekst'].apply(lambda dt : find_valid_dates(dt))
test = SaaOne_msi_vervangen['valid_dates'].date.apply(lambda str_list : [item.dt.strftime("%d-%m-%Y") for item in eval(str_list)])
I try to achieve the following:
Here is some data example:
SaaOne_msi_vervangen = pd.DataFrame({"valid_dates": [['8-10-2019', '08-10-2019', '08-10-2019', '09-10-2019', '09-10-2019', '09-10-19', '21-10-19', '23-10-19', '23-10-2019', '23-10-2019', '23-10-2019', '23-10-2019', '24-10-19', '23 oktober 2019', '23 oktober 2019', '23 oktober 2019'],['31-10-19', '19-11-01', '06-11-19', '29-11-2019', '03-12-19', '03-12-19', '5-12-2019', '04-12-19', '05-12-2019', '05-12-2019', '05-12-2019', '05-12-2019', '10-12-19', '5 december 2019']]})
You can try:
m = {
'januari': 'jan',
'februari': 'feb',
'maart': 'mar',
'april': 'apr',
'mei': 'may',
'juni': 'jun',
'juli': 'jul',
'augustus': 'aug',
'september': 'sep',
'oktober': 'oct',
'november': 'nov',
'december': 'dec',
}
pat = '|'.join(m)
SaaOne_msi_vervangen = SaaOne_msi_vervangen.explode('valid_dates')
SaaOne_msi_vervangen['valid_dates'] = SaaOne_msi_vervangen['valid_dates'].str.replace(pat, lambda g: m[g[0]], regex=True)
SaaOne_msi_vervangen['valid_dates'] = pd.to_datetime(SaaOne_msi_vervangen['valid_dates'], dayfirst=True)
print(SaaOne_msi_vervangen)
Prints:
valid_dates
0 2019-10-08
0 2019-10-08
0 2019-10-08
0 2019-10-09
0 2019-10-09
0 2019-10-09
0 2019-10-21
0 2019-10-23
0 2019-10-23
0 2019-10-23
0 2019-10-23
0 2019-10-23
0 2019-10-24
0 2019-10-23
0 2019-10-23
0 2019-10-23
1 2019-10-31
1 2001-11-19
1 2019-11-06
1 2019-11-29
1 2019-12-03
1 2019-12-03
1 2019-12-05
1 2019-12-04
1 2019-12-05
1 2019-12-05
1 2019-12-05
1 2019-12-05
1 2019-12-10
1 2019-12-05
To get back to a list form, then:
SaaOne_msi_vervangen = SaaOne_msi_vervangen.groupby(level=0).agg(list)
print(SaaOne_msi_vervangen)
Prints:
valid_dates
0 [2019-10-08 00:00:00, 2019-10-08 00:00:00, 2019-10-08 00:00:00, 2019-10-09 00:00:00, 2019-10-09 00:00:00, 2019-10-09 00:00:00, 2019-10-21 00:00:00, 2019-10-23 00:00:00, 2019-10-23 00:00:00, 2019-10-23 00:00:00, 2019-10-23 00:00:00, 2019-10-23 00:00:00, 2019-10-24 00:00:00, 2019-10-23 00:00:00, 2019-10-23 00:00:00, 2019-10-23 00:00:00]
1 [2019-10-31 00:00:00, 2001-11-19 00:00:00, 2019-11-06 00:00:00, 2019-11-29 00:00:00, 2019-12-03 00:00:00, 2019-12-03 00:00:00, 2019-12-05 00:00:00, 2019-12-04 00:00:00, 2019-12-05 00:00:00, 2019-12-05 00:00:00, 2019-12-05 00:00:00, 2019-12-05 00:00:00, 2019-12-10 00:00:00, 2019-12-05 00:00:00]