Search code examples
python email-parsing

Extract an attachment that is itself of type MSG from an email in Python


I need to extract an MSG-type attachment from an email and save that attachment to a location, using Python.

The script I wrote works for nearly all file types except Outlook items:

def parse_attachment(message_part):
    """Describe ``message_part`` as an attachment, or return None.

    A part is treated as an attachment when it has a Content-Disposition
    header whose disposition type is ``attachment`` or ``inline``
    (case-insensitive, surrounding whitespace ignored).

    Returns a dict with the keys ``data``, ``content_type`` and ``size``,
    plus ``name``, ``creation-date`` and ``modification-date`` when the
    header carries the matching parameters. A ``size`` parameter in the
    header replaces the measured size with the header's string value.
    Returns None when the part is not an attachment.
    """
    content_disposition = message_part.get("Content-Disposition", None)
    if not content_disposition:
        return None

    # NOTE: a plain split on ";" mis-parses quoted values that themselves
    # contain ";" -- kept as in the original parsing approach.
    dispositions = content_disposition.strip().split(";")

    # Whitespace is allowed before the ";", so strip before comparing.
    if dispositions[0].strip().lower() not in ("attachment", "inline"):
        return None

    file_data = message_part.get_payload(decode=True)
    debug(message_part)  # helper defined elsewhere in the file

    if file_data is None:
        # get_payload(decode=True) returns None for container parts. An
        # attached email (message/rfc822) holds the embedded message as the
        # first payload item, so serialise that instead of failing on len().
        nested = message_part.get_payload()
        if (message_part.get_content_type() == "message/rfc822"
                and isinstance(nested, list) and nested):
            file_data = nested[0].as_string()

    attachment = {}
    attachment['data'] = file_data
    attachment['content_type'] = message_part.get_content_type()
    attachment['size'] = len(file_data) if file_data is not None else 0

    for param in dispositions[1:]:
        # partition() keeps "=" characters inside the value intact and lets
        # us skip empty or malformed segments (e.g. after a trailing ";").
        name, separator, value = param.partition("=")
        if not separator:
            continue
        name = name.lower().strip()
        value = value.strip().strip("\"")

        if name == "filename":
            attachment['name'] = value
        elif name in ("creation-date", "modification-date", "size"):
            attachment[name] = value

    return attachment

Solution

  • An attached email has to be handled separately. If we use walk() — an all-purpose generator that iterates over all the parts and subparts of a message object tree in depth-first traversal order — we end up parsing the contents of the attached email as well.

    So we have to use get_payload() instead to get each individual part of the email. Here is how we can parse the email attachments:

    def get_subject(msgobj):
        """Return the decoded Subject of ``msgobj`` with newlines removed.

        Each fragment produced by ``decode_header`` that carries a charset
        is converted to text with that charset and re-encoded as UTF-8
        (``unicode`` is the Python 2 builtin); fragments without a charset
        are used unchanged. Returns None when the Subject is None.
        """
        raw_subject = msgobj['Subject']
        if raw_subject is None:
            return None

        pieces = [
            unicode(text, charset).encode('utf8', 'replace') if charset else text
            for text, charset in decode_header(raw_subject)
        ]
        # Drop newlines so the subject comes back as a single line.
        return re.sub('\n', '', ''.join(pieces))
    
    def get_msg_file_as_attachment(message_part):
        """Build an attachment dict for a part that wraps an embedded email.

        The first payload item of ``message_part`` is taken as the embedded
        message and serialised with ``as_string(unixfrom=True)``.

        Returns a dict with ``data`` (the serialised message),
        ``content_type`` (of ``message_part``), ``name`` (the embedded
        message's subject plus ``.eml``) and ``size`` (length of ``data``).
        """
        # Fetch the embedded message once instead of once per field.
        embedded = message_part.get_payload()[0]
        data = embedded.as_string(unixfrom=True)

        # get_subject() returns None when there is no Subject header, which
        # previously made the ".eml" concatenation raise TypeError. Fall back
        # to a generic base name for a missing (or empty) subject.
        base_name = get_subject(embedded) or 'attachment'

        return {
            'data': data,
            'content_type': message_part.get_content_type(),
            'name': base_name + '.eml',
            'size': len(data),
        }
    
    def parse_attachment(message_part):
        """Describe ``message_part`` as an attachment, or return None.

        A part is treated as an attachment when it has a Content-Disposition
        header whose disposition type is ``attachment`` or ``inline``
        (case-insensitive, surrounding whitespace ignored).

        Parts of type ``message/rfc822`` are delegated to
        ``get_msg_file_as_attachment`` (defined elsewhere in this file).
        For every other part the result is a dict with ``data`` (the decoded
        payload), ``content_type``, ``size`` and ``name`` (from
        ``get_filename()``). ``data`` is None and ``size`` is 0 when the
        decoded payload is unavailable, as for multipart containers.
        """
        content_disposition = message_part.get("Content-Disposition", None)
        if not content_disposition:
            return None

        # Whitespace may precede the ";" that separates the disposition type
        # from its parameters, so strip before comparing.
        disposition_type = content_disposition.split(";", 1)[0].strip().lower()
        if disposition_type not in ("attachment", "inline"):
            return None

        content_type = message_part.get_content_type()
        if content_type.lower().strip() == 'message/rfc822':
            return get_msg_file_as_attachment(message_part)

        file_data = message_part.get_payload(decode=True)
        return {
            'data': file_data,
            'content_type': content_type,
            # get_payload(decode=True) yields None for multipart parts;
            # report size 0 instead of raising TypeError on len(None).
            'size': len(file_data) if file_data is not None else 0,
            'name': message_part.get_filename(),
        }