I currently have a separated file in this format (2 columns tab "\t" separated), and “;” separates all the elements within the columns).
User\tDate
Alice Cooper;John Smith\t07/11/2019
Alice Cooper\t23/11/2018
Alice Cooper\t21/11/2018
Alice Cooper\t26/11/2018
Alice Cooper\t26/11/2018
Alice Cooper;John Smith\t09/12/2018
Alice Cooper;John Smith\t09/12/2018
Alice Cooper;John Smith\t04/12/2018
Alice Cooper\t07/12/2018
Alice Cooper\t07/12/2018
I would like any thoughts on an optimized macro (ideally javascript) to create the following OUTPUT file:
User\tEarliest\tLatest\tDates_with_Most_Occurences\tMost_Occurence_Number
Alice Cooper\t21/11/2018\t07/11/2019\t26/11/2018;07/12/2018\t2
John Smith\t04/12/2018\t07/11/2019\t09/12/2018\t1
So the intermediate steps (I am currently carrying out manually, but would like to roll into a macro):
Step 1: Separate out the Name elements in Columns 1
(giving something like this):
User\tDate
Alice Cooper\t07/11/2019
John Smith\t07/11/2019
Alice Cooper\t23/11/2018
Alice Cooper\t21/11/2018
Alice Cooper\t26/11/2018
Alice Cooper\t26/11/2018
Alice Cooper\t09/12/2018
John Smith\t09/12/2018
Alice Cooper\t09/12/2018
John Smith\t09/12/2018
Alice Cooper\t04/12/2018
John Smith\t04/12/2018
Alice Cooper\t07/12/2018
Alice Cooper\t07/12/2018
Step 2: Sort Col1 A-Z, and Col 2 Oldest to Newest. Now combine Column 2 elements based on Column 1 (gives something like this):
User\tDate
Alice Cooper\t21/11/2018;23/11/2018;26/11/2018;26/11/2018;04/12/2018;07/12/2018;07/12/2018;09/12/2018;09/12/2018;07/11/2019;
John Smith\t04/12/2018;09/12/2018;09/12/2018;07/11/2019;
Step 3: Now take the date information in Col2 for each line and create these 4 New Columns: EarliestDate, LatestDate, Dates_with_Most_Occurences, Most_Occurence_Number (gives something like this):
User\tDate
Alice Cooper\t21/11/2018;23/11/2018;26/11/2018;26/11/2018;04/12/2018;07/12/2018;07/12/2018;09/12/2018;09/12/2018;07/11/2019;
John Smith\t04/12/2018;09/12/2018;09/12/2018;07/11/2019;
Step 4: Remove Col2 (Date): Giving the final output:
User\tEarliestDate\tLatestDate\tDates_with_Most_Occurences\tMost_Occurence_Number
Alice Cooper\t21/11/2018\t07/11/2019\t26/11/2018;07/12/2018\t2
John Smith\t04/12/2018\t07/11/2019\t09/12/2018\t1
I only need the macro to create the final output, the in-between (Steps 1,2,3 above) are just showing the logic of what I’m trying to do. The real source files will be thousands of lines, so if this could be EmEditor optimized in any way, that would be fantastic.
Assuming your data file doesn't contain empty lines, here is the script.
document.ConvertCsv(2); // This assumes your Tab format is the second one on the CSV/Sort bar
function parseDate(s) {
var split = s.split('/');
return new Date(split[2], split[1] - 1, split[0]);
}
var data = [];
// Read the file
var numberOfLines = document.GetLines();
if (numberOfLines >= 2 && document.GetLine(2) === '') {
numberOfLines = 1; // CSV document only has header without data
}
for (var line = 1; line < numberOfLines; line++) {
var rowData = [
document.GetCell(line + 1, 1, eeCellIncludeNone),
parseDate(document.GetCell(line + 1, 2, eeCellIncludeNone)),
];
data.push(rowData);
}
// Separate combined users
var separated = [];
for (var row = 0; row < data.length; row++) {
var split = data[row][0].split(';');
for (var i = 0; i < split.length; i++) {
separated.push([split[i], data[row][1]]);
}
}
// Group by user
// {[key: string]: {data: [[]], earliest: Date, latest: Date, mostOccurrence: Date, occurrence: number}}
var users = {};
for (var row = 0; row < separated.length; row++) {
if (!(separated[row][0] in users)) {
users[separated[row][0]] = {data: []};
}
users[separated[row][0]].data.push(separated[row]);
}
// At this point, we have parsed the file into useful data.
// alert(JSON.stringify(users, null, ' ')); // To check
// Data analysis
for (var userKey in users) {
var sorted = users[userKey].data.sort(function(a, b) {
return a[1].getTime() - b[1].getTime();
});
users[userKey].earliest = sorted[0][1];
users[userKey].latest = sorted[sorted.length - 1][1];
// Count dates
var dates = {}; // {[key: number]: number}
for (var i = 0; i < sorted.length; ++i) {
if (!(sorted[i][1].getTime() in dates)) {
dates[sorted[i][1].getTime()] = 0;
}
dates[sorted[i][1].getTime()] += 1;
}
var mostOccurrence = {date: [], occurrence: -1};
for (var k in dates) {
if (dates[k] > mostOccurrence.occurrence) {
mostOccurrence = {date: [k], occurrence: dates[k]}
} else if (dates[k] === mostOccurrence.occurrence) {
mostOccurrence.date.push(k);
}
}
users[userKey].mostOccurrence = [];
for (var i = 0; i < mostOccurrence.date.length; i++) {
var date = new Date();
date.setTime(mostOccurrence.date[i]);
users[userKey].mostOccurrence.push(date);
}
users[userKey].occurrence = mostOccurrence.occurrence;
}
// Format the numbers and output to document
editor.NewFile();
document.selection.Text = 'User\tEarliestDate\tLatestDate\tDates_with_Most_Occurences\tMost_Occurence_Number';
for (var _ in users) {
document.selection.Text += '\r\n';
}
document.ConvertCsv(2);
function formatDate(d) {
return d.getDate() + '/' + (d.getMonth() + 1) + '/' + d.getFullYear();
}
var line = 2;
for (var userKey in users) {
document.SetCell(line, 1, userKey, eeAutoQuote);
document.SetCell(line, 2, formatDate(users[userKey].earliest), eeAutoQuote);
document.SetCell(line, 3, formatDate(users[userKey].latest), eeAutoQuote);
var mostOccurrenceStr = '';
for (var i = 0; i < users[userKey].mostOccurrence.length; i++) {
mostOccurrenceStr += formatDate(users[userKey].mostOccurrence[i]) + ';';
}
document.SetCell(line, 4, mostOccurrenceStr.substring(0, mostOccurrenceStr.length - 1), eeAutoQuote);
document.SetCell(line, 5, users[userKey].occurrence, eeAutoQuote);
line++;
}
Hopefully it works but if it doesn't, let me know.