I am trying to find and replace partial month and day in a pandas dataframe with the complete month and date format. But no change in the string
Code
import pandas as pd
data = {'text':['event mon and nov', 'no event on friday', 'december is good', 'welcome jan again']}
df = pd.DataFrame(data)
month = {"jan":"january","feb":"february","mar":"march","apr":"april","may":"may","jun":"june",
"jul":"july","aug":"august","sep":"september","oct":"october","nov":"november","dec":"december"}
day = {"sun":"sunday","mon":"monday","tue":"tuesday","wed":"wednesday","thu":"thursday","fri":"friday",
"sat":"saturday"}
df["text_new"] = df["text"].apply(lambda x : re.compile(r"(?:(?:(?:j|J)an)|(?:(?:f|F)eb)| \
(?:(?:m|M)ar)|(?:(?:a|A)pr)|(?:(?:m|M)ay)|(?:(?:j|J)un)|(?:(?:j|J)ul)|(?:(?:a|A)ug)|(?:(?:s|S)ep)|(?:(?:o|O)ct)| \
(?:(?:n|N)ov)|(?:(?:d|D)ec))(?:\s)".join(month)).sub(lambda m: month.get(m.group()), x))
Expected output dataframe
0 event monday and november
1 no event on friday
2 december is good
3 welcome january again
Thanks in advance
You should use pandas regex capabilities (with str.replace
):
import re
dm = month.copy()
dm.update(day)
# or for python ≥3.9
# dm = day|month
regex = fr'\b({"|".join(dm)})\b'
# if match, replace with dict value
df["text_new"] = df['text'].str.replace(regex, lambda x: dm.get(x.group(), x),
regex=True, # use regex
flags=re.I) # case insensitive
output:
text text_new
0 event mon and nov event monday and november
1 no event on friday no event on friday
2 december is good december is good
3 welcome jan again welcome january again