Search code examples
pythonregexgmail

Regular expression not stripping text from quoted reply


I'm trying to parse out the text of an email reply and drop the quoted text (and anything that follows it, including the signature)

This code is returning: message tests On Tue, Jun 25, 2013 at 10:01 PM, Catie Brand <

I want it to return simply message tests

What regex am I missing?

def format_mail_plain(value, from_address):
    res = [re.compile(r'From:\s*' + re.escape(from_address), re.IGNORECASE),
           re.compile('<' + re.escape(from_address) + '>', re.IGNORECASE),
           re.compile(r'\s+wrote:', re.IGNORECASE  | re.MULTILINE),
           re.compile(r'On.*?wrote:.*?', re.IGNORECASE | re.MULTILINE | re.DOTALL),
           re.compile(r'-+original\s+message-+\s*$', re.IGNORECASE),
           re.compile(r'from:\s*$', re.IGNORECASE),
           re.compile(r'^>.*$', re.IGNORECASE | re.MULTILINE)]

    whitespace_re = re.compile(r'\s+')

    lines = list(line.rstrip() for line in value.split('\n'))

    result = ''
    for line_number, line in zip(range(len(lines)), lines):
        for reg_ex in res:
            if reg_ex.search(line):
                return result

        if not whitespace_re.match(line):
            if '' is result:
                result += line
            else:
                result += '\n' + line

    return result




************************ Sample Text *****************************
message tests 
On Tue, Jun 25, 2013 at 10:01 PM, XXXXX XXXX < 
[email protected]> wrote: 
> ** 
>    [image: Krow] <http://www.krow.com/>


************************ Result **********************************
message tests
On Tue, Jun 25, 2013 at 10:01 PM, XXXXX XXXX <

I'd rather the result be:

************************ Result **********************************
message tests

Solution

  • In your sample input, On.*?wrote does not match, because On ... wrote: spans two lines.

    I changed your code to substitute On.*wrote:\s* to empty string.

    def format_mail_plain(value, from_address):
        value = re.compile(r'^On.*?wrote:\s*', re.IGNORECASE | re.MULTILINE | re.DOTALL).sub('', value)
        res = [re.compile(r'From:\s*' + re.escape(from_address), re.IGNORECASE),
               re.compile('<' + re.escape(from_address) + '>', re.IGNORECASE),
               re.compile(r'-+original\s+message-+\s*$', re.IGNORECASE),
               re.compile(r'^from:', re.IGNORECASE),
               re.compile(r'^>')]
    
        lines = filter(None, [line.rstrip() for line in value.split('\n')])
    
        result = []
        for line in lines:
            result.append(line)
            for reg_ex in res:
                if reg_ex.search(line):
                    result.pop()
                    break
    
        return '\n'.join(filter(None, result))