Search code examples
javascriptnode.jsmongodbemail-parsing

Find Email-adresses in the mailbody with Mailparser


I'm quite new to the topic and i'm still having some issues with my mailparser. Though searching and finding emails in the email header (mail.from) does work, it doesn't work in the email body. Does anybody have some experience with that and is willing to help? You can find the function i'm talking about under the "// Check for other addresses in Mail-Body (Doesn't work yet)"-comment. I think, that my Regex is correct. Also if the matchAll-Function give back an array and it can't be saved in the the subscriber.email-object, it shall be at least logged to the console. Also i checked manually in the inbox if there are mails with email adresses in the mail body. There are at least two, which shall be found..

The part of the App.js, that does the mailparsing:

const simpleParser = require('mailparser').simpleParser;
//const htmlparser = require("htmlparser2");
var fs = require('fs');

var config = require('./config');

var Imap = require('imap');
var imap = new Imap(config.imap);

var blacklistString = '';

String.prototype.matchAll = function(regexp) {
    var matches = [];
    this.replace(regexp, function() {
        var arr = ([]).slice.call(arguments, 0);
        var extras = arr.splice(-2);
        arr.index = extras[0];
        arr.input = extras[1];
        matches.push(arr);
    });
    return matches.length ? matches : null;
 };

function openInbox(subbox,cb) {
    imap.openBox('INBOX.'+subbox, true, cb);
}

function getBoxes(cb) {
    imap.getBoxes(cb);
}
function showBoxes(boxes) {
    imap.end();
}

function logArrayElements(element) {
    if(element[1].indexOf('placeholder.de')==-1){
        addToBlacklistString(element[1]);
    }
}
 function addToBlacklistString(str) {
    blacklistString += str+"\n";
}
function writeBlacklistFile() {
    fs.appendFile('data/data.csv', blacklistString, function (err) {
        if (err) throw err;
        console.log('Saved!');
    });
}



function search(searchArray, regex){
     imap.search(searchArray, function(err, results) {
        if (err) throw err;
        var temp = 0;
        var mailtemp = [];
        var f = imap.fetch(results, { bodies: '' });
        f.on('message', function(msg, seqno) {
            console.log('Message #%d', seqno);
            var prefix = '(#' + seqno + ') ';
             msg.on('body', function(stream, info) {
                simpleParser(stream, (err, mail)=>{
                    //console.log(temp);
                    //console.log(mail.subject);
                      /*fs.writeFile('data/'+seqno+'.txt',mail.text, function(err){
                        console.log(err);
                     });*/

                    //var text = mail.text;

                    // New Subscriber Object
                     var subscr = new Subscriber({nr: '', mailIdent: '', from: '', emails: '', text:'', uLink: '', anwalt: false });
                     subscr.nr = seqno;

                    //Check for From-Address

                    if(!!mail.from) {
                         //console.log(mail.from.value);
                        for(var i = 0; i < mail.from.value.length; i++) {
                            mailtemp = mail.from.value[i].address.matchAll(regex);

                             mailtemp.forEach(function(element){
                            /*fs.appendFile('data/data.csv', element[0] + "\n", function(error){
                                console.log(error);
                            });*/

                            subscr.from = element[0];

                        });
                        if(!!mailtemp) {
                            mailtemp.forEach(logArrayElements);
                        }

                    }

                }else{
                    //console.log(mail.text);
                }

                // Message-ID

                if(!!mail.messageId) {
                    subscr.mailIdent = mail.messageId;
                }
                console.log(mail.messageId);

                // Check for other addresses in Mail-Body (Doesn't work yet)

                var regexEmails = new RegExp('/([\w\.\-\_\#\+]+@[\w\.\-\_äüö]+\.[a-zA-Z]+)/g');
                if(!!mail.text){
                    if(mail.text.matchAll(regexEmails)!=null) {
                        subscr.emails = mail.text.matchAll(regexEmails);
                        console.log(subscr.emails);
                    }
                }

                /* Split mail.text at substrings in substr-array. Extend if necessary..
                 *
                 * Also check for 'Anwalt'-Expression in splitted Substring
                 *
                 * If mail.text doesn't exist -> Check for html body and convert it to text-format
                 */

                //var  regexLink = new RegExp('\.de\/(unsubscribe|austragen)\/([^\"]+)');
                var  regexAnwalt = new RegExp('nwalt|echtsanwalt|rechtlicher');

                if(!!mail.text) {
                    var substr = ["schrieb pplaceholder.de", "Von: \"placeholder.de", "Von: pplaceholder.de", "From: placeholder.de", "Ursprüngliche Nachricht"];
                    for (var i = 0; i<substr.length; i++) {
                        if(mail.text.indexOf(substr[i]) > -1) {
                            var textTemp = mail.text;
                            var arr = textTemp.split(substr[i]);
                            if(arr[0].matchAll(regexAnwalt)!=null) {
                                subscr.anwalt = true;
                            };

                            subscr.text = arr[0];
                            break;
                        } else {
                            subscr.text = mail.text;
                        }
                    }


                    //console.log(arr);

                }
                else
                {
                    var html = mail.html;
                    var text = htmlToText.fromString(html, {
                        noLinkBrackets: true,
                        ignoreImage: true,
                        uppercaseHeadings: false,
                        preserveNewlines: false,
                        wordwrap:130,
                        format: {
                            heading: function (node, fn, options) {
                                var h = fn(node.children, options);
                                return '\n==== ' + h + ' ====\n\n';
                            }
                        }
                    });
                    subscr.text = text;
                }


                mail.headers.forEach(function(value, key) {
                    //console.log(value);
                });

                subscr.save();
                //console.log(subscr);
                temp++;
            });
        });
        msg.once('end', function() {
            console.log(prefix + 'Finished');
        });
    });
    f.once('error', function(err) {
        console.log('Fetch error: ' + err);
    });
    f.once('end', function() {
        console.log('Done fetching all messages!');
        //writeBlacklistFile();
        imap.end();
    });
});
}

imap.once('ready', function() {
 openInbox('Test',function(err, box) {
    var searchArray = [['FROM', '@']];
    search(searchArray,/([\w\.\-\_\#\+]+@[\w\.\-\_äüö]+\.[a-zA-Z]+)/g);
    });
 });
imap.once('error', function(err) {
console.log(err);
});

imap.once('end', function() {
    console.log('Connection ended');
});

imap.connect();

app.listen(2700, function(){
  console.log("Listening on Port 2700")
});

module.exports = app;

subscriber.js

const mongoose = require('mongoose');

var subscriberSchema = mongoose.Schema({

    nr: Number,
    mailIdent: String,
    from: String,
    emails: String,
    text: String,
    uLink: String,
    anwalt: Boolean
});

var Subscriber = module.exports = mongoose.model('Subscriber', subscriberSchema);

//get Subscriber
module.exports.getSubscribers = function(callback, limit){
Subscriber.find(callback).limit(limit);
};

module.exports.getSubscriberByID = function(_id, callback){
    Subscriber.findById(_id, callback);
};

Solution

  • The Regex for the Emails was a little bit wrong. Also i didn't noticed that the matchAll-Fct. is giving back a two-dimensional Array. Here is the changed part of the code:

    var regexEmails = new RegExp("([\\w\\.\\-\\_\\#\\+]+@[\\w\\.\\-\\_äüö]+\\.[a-zA-Z]+)");
    
    var temp1 = mail.text.matchAll(regexEmails);
                        if(!!temp1){
                            //console.log(temp1);
                            for(var i =0; i<temp1.length; i++) {
                                if(temp1[0][i]!=='[email protected]' && temp1[0][i] !== "[email protected]"){
                                    subscr.emails += temp1[0][i];
                                }
                            }
    
                        }