Search code examples
javascriptregexperformanceemeditor

Optimised EmEditor Macro to Element Split, Sort, Oldest to Longest Date, & Extract


I currently have a separated file in this format (2 columns tab "\t" separated), and “;” separates all the elements within the columns).

    User\tDate

    Alice Cooper;John Smith\t07/11/2019

    Alice Cooper\t23/11/2018

    Alice Cooper\t21/11/2018

    Alice Cooper\t26/11/2018

    Alice Cooper\t26/11/2018

    Alice Cooper;John Smith\t09/12/2018

    Alice Cooper;John Smith\t09/12/2018

    Alice Cooper;John Smith\t04/12/2018

    Alice Cooper\t07/12/2018

    Alice Cooper\t07/12/2018

I would like any thoughts on an optimized macro (ideally javascript) to create the following OUTPUT file:

    User\tEarliest\tLatest\tDates_with_Most_Occurences\tMost_Occurence_Number

    Alice Cooper\t21/11/2018\t07/11/2019\t26/11/2018;07/12/2018\t2

    John Smith\t04/12/2018\t07/11/2019\t09/12/2018\t1

So the intermediate steps (I am currently carrying out manually, but would like to roll into a macro):

Step 1: Separate out the Name elements in Columns 1

(giving something like this):

    User\tDate

    Alice Cooper\t07/11/2019

    John Smith\t07/11/2019

    Alice Cooper\t23/11/2018

    Alice Cooper\t21/11/2018

    Alice Cooper\t26/11/2018

    Alice Cooper\t26/11/2018

    Alice Cooper\t09/12/2018

    John Smith\t09/12/2018

    Alice Cooper\t09/12/2018

    John Smith\t09/12/2018

    Alice Cooper\t04/12/2018

    John Smith\t04/12/2018

    Alice Cooper\t07/12/2018

    Alice Cooper\t07/12/2018

Step 2: Sort Col1 A-Z, and Col 2 Oldest to Newest. Now combine Column 2 elements based on Column 1 (gives something like this):

    User\tDate


    Alice Cooper\t21/11/2018;23/11/2018;26/11/2018;26/11/2018;04/12/2018;07/12/2018;07/12/2018;09/12/2018;09/12/2018;07/11/2019;

    John Smith\t04/12/2018;09/12/2018;09/12/2018;07/11/2019;

Step 3: Now take the date information in Col2 for each line and create these 4 New Columns: EarliestDate, LatestDate, Dates_with_Most_Occurences, Most_Occurence_Number (gives something like this):

    User\tDate

    Alice Cooper\t21/11/2018;23/11/2018;26/11/2018;26/11/2018;04/12/2018;07/12/2018;07/12/2018;09/12/2018;09/12/2018;07/11/2019;

    John Smith\t04/12/2018;09/12/2018;09/12/2018;07/11/2019;

Step 4: Remove Col2 (Date): Giving the final output:

    User\tEarliestDate\tLatestDate\tDates_with_Most_Occurences\tMost_Occurence_Number

    Alice Cooper\t21/11/2018\t07/11/2019\t26/11/2018;07/12/2018\t2

    John Smith\t04/12/2018\t07/11/2019\t09/12/2018\t1

I only need the macro to create the final output, the in-between (Steps 1,2,3 above) are just showing the logic of what I’m trying to do. The real source files will be thousands of lines, so if this could be EmEditor optimized in any way, that would be fantastic.


Solution

  • Assuming your data file doesn't contain empty lines, here is the script.

    document.ConvertCsv(2); // This assumes your Tab format is the second one on the CSV/Sort bar
    
    function parseDate(s) {
        var split = s.split('/');
        return new Date(split[2], split[1] - 1, split[0]);
    }
    
    var data = [];
    
    // Read the file
    var numberOfLines = document.GetLines();
    if (numberOfLines >= 2 && document.GetLine(2) === '') {
        numberOfLines = 1; // CSV document only has header without data
    }
    
    for (var line = 1; line < numberOfLines; line++) {
        var rowData = [
            document.GetCell(line + 1, 1, eeCellIncludeNone),
            parseDate(document.GetCell(line + 1, 2, eeCellIncludeNone)),
        ];
        data.push(rowData);
    }
    
    // Separate combined users
    var separated = [];
    for (var row = 0; row < data.length; row++) {
        var split = data[row][0].split(';');
        for (var i = 0; i < split.length; i++) {
            separated.push([split[i], data[row][1]]);
        }
    }
    
    // Group by user
    // {[key: string]: {data: [[]], earliest: Date, latest: Date, mostOccurrence: Date, occurrence: number}}
    var users = {};
    
    for (var row = 0; row < separated.length; row++) {
        if (!(separated[row][0] in users)) {
            users[separated[row][0]] = {data: []};
        }
    
        users[separated[row][0]].data.push(separated[row]);
    }
    
    // At this point, we have parsed the file into useful data.
    // alert(JSON.stringify(users, null, '  ')); // To check
    
    // Data analysis
    for (var userKey in users) {
        var sorted = users[userKey].data.sort(function(a, b) {
            return a[1].getTime() - b[1].getTime();
        });
        users[userKey].earliest = sorted[0][1];
        users[userKey].latest = sorted[sorted.length - 1][1];
    
        // Count dates
        var dates = {}; // {[key: number]: number}
        for (var i = 0; i < sorted.length; ++i) {
            if (!(sorted[i][1].getTime() in dates)) {
                dates[sorted[i][1].getTime()] = 0;
            }
    
            dates[sorted[i][1].getTime()] += 1;
        }
    
        var mostOccurrence = {date: [], occurrence: -1};
        for (var k in dates) {
            if (dates[k] > mostOccurrence.occurrence) {
                mostOccurrence = {date: [k], occurrence: dates[k]}
            } else if (dates[k] === mostOccurrence.occurrence) {
                mostOccurrence.date.push(k);
            }
        }
    
        users[userKey].mostOccurrence = [];
        for (var i = 0; i < mostOccurrence.date.length; i++) {
            var date = new Date();
            date.setTime(mostOccurrence.date[i]);
            users[userKey].mostOccurrence.push(date);
        }
        users[userKey].occurrence = mostOccurrence.occurrence;
    }
    
    // Format the numbers and output to document
    editor.NewFile();
    document.selection.Text = 'User\tEarliestDate\tLatestDate\tDates_with_Most_Occurences\tMost_Occurence_Number';
    for (var _ in users) {
        document.selection.Text += '\r\n';
    }
    document.ConvertCsv(2);
    
    function formatDate(d) {
        return d.getDate() + '/' + (d.getMonth() + 1) + '/' + d.getFullYear();
    }
    
    var line = 2;
    for (var userKey in users) {
        document.SetCell(line, 1, userKey, eeAutoQuote);
        document.SetCell(line, 2, formatDate(users[userKey].earliest), eeAutoQuote);
        document.SetCell(line, 3, formatDate(users[userKey].latest), eeAutoQuote);
        var mostOccurrenceStr = '';
        for (var i = 0; i < users[userKey].mostOccurrence.length; i++) {
            mostOccurrenceStr += formatDate(users[userKey].mostOccurrence[i]) + ';';
        }
        document.SetCell(line, 4, mostOccurrenceStr.substring(0, mostOccurrenceStr.length - 1), eeAutoQuote);
        document.SetCell(line, 5, users[userKey].occurrence, eeAutoQuote);
        line++;
    }
    

    Hopefully it works but if it doesn't, let me know.