Search code examples
pythonpandaspython-re

Matching and replacing month and day in a pandas dataframe


I am trying to find and replace partial month and day in a pandas dataframe with the complete month and date format. But no change in the string

Code

import pandas as pd
data = {'text':['event mon and nov', 'no event on friday', 'december is good', 'welcome jan again']}
df = pd.DataFrame(data)
month = {"jan":"january","feb":"february","mar":"march","apr":"april","may":"may","jun":"june",
        "jul":"july","aug":"august","sep":"september","oct":"october","nov":"november","dec":"december"}
day = {"sun":"sunday","mon":"monday","tue":"tuesday","wed":"wednesday","thu":"thursday","fri":"friday",
      "sat":"saturday"}
df["text_new"] = df["text"].apply(lambda x : re.compile(r"(?:(?:(?:j|J)an)|(?:(?:f|F)eb)| \
(?:(?:m|M)ar)|(?:(?:a|A)pr)|(?:(?:m|M)ay)|(?:(?:j|J)un)|(?:(?:j|J)ul)|(?:(?:a|A)ug)|(?:(?:s|S)ep)|(?:(?:o|O)ct)| \
(?:(?:n|N)ov)|(?:(?:d|D)ec))(?:\s)".join(month)).sub(lambda m: month.get(m.group()), x))

Expected output dataframe

0   event monday and november
1   no event on friday
2   december is good
3   welcome january again

Thanks in advance


Solution

  • You should use pandas regex capabilities (with str.replace):

    import re
    
    dm = month.copy()
    dm.update(day)
    # or for python ≥3.9
    # dm = day|month
    
    regex = fr'\b({"|".join(dm)})\b'
    
                                            # if match, replace with dict value
    df["text_new"] = df['text'].str.replace(regex, lambda x: dm.get(x.group(), x),
                                            regex=True, # use regex
                                            flags=re.I) # case insensitive
    

    output:

                     text                   text_new
    0   event mon and nov  event monday and november
    1  no event on friday         no event on friday
    2    december is good           december is good
    3   welcome jan again      welcome january again