I currently have a separated file in this format (3 columns tab "\t" separated), and “;” separates all the elements within the columns).
COL1\tCOL2\tCOL3
abc\t123;1q\tapple\t
dfg\t234;2w\tapple;apple\t
hij\t345;3e\tbanana;apple;cherry;\t
klm\t456;4r\tapple;banana;cherry;banana;cherry;\t
nop\t567;5t\t;;apple;banana;cherry;banana;;cherry;;\t
I would like any thoughts on an optimized macro (ideally javascript) to manipulate the file to OUTPUT this: Column 3 is now sorted (any extra/unrequired delimiters are removed as well)and duplicates removed. New column 4 is the deduplicated element count.
abc\t123;1q\tapple\t1
dfg\t234;2w\tapple\t1
hij\t345;3e\tapple;banana;cherry\t3
klm\t456;4r\tapple;banana;cherry\t3
nop\t567;5t\tapple;banana;cherry\t3
I have been trying similar to the below, but this method could be quicker I think.
for( iRow = 2; iRow <= totalLines; iRow++ ) { //traverse eash row, start at 2nd row
str = document.GetCell(iRow, 2, eeCellIncludeQuotes);
var count = (str.match(/;/g) || []).length;
var numOfElements = count + 1;
document.SetCell( iRow, 3, numOfElements, eeAutoQuote );
}
So the user should select the column they want to run this on (Column 3 in this example) and the macro would run on that column only, and output the count to a new column to the right.
The real source files will be millions of lines, so if this could be EmEditor optimized in any way, that would be great.
I optimized your macro by creating a function to count semicolons in a string rather than using a regular expression (Second version), and also used GetColumn
and SetColumn
methods to increase the speed (Third version). The third version will insert a column rather than overwrite the existing column.
Original macro (modified for correctness and timing)
var start = new Date().getTime();
var totalLines = document.GetLines();
for( iRow = 2; iRow <= totalLines; iRow++ ) { //traverse eash row, start at 2nd row
str = document.GetCell(iRow, 3, eeCellIncludeQuotes);
var count = (str.match(/;/g) || []).length;
var numOfElements = count + 1;
document.SetCell( iRow, 4, numOfElements, eeAutoQuote );
}
var end = new Date().getTime();
var time = end - start;
alert( "Execution time: " + time + " ms" );
Second version
function CountSemiColon( str )
{
var count = 0;
for( var index = -1; ; ) {
index = str.indexOf( ';', index + 1 );
if( index == -1 ) {
break;
}
++count;
}
return count;
}
var start = new Date().getTime();
var totalLines = document.GetLines();
for( iRow = 2; iRow <= totalLines; iRow++ ) { //traverse eash row, start at 2nd row
var str = document.GetCell(iRow, 3, eeCellIncludeQuotes);
document.SetCell( iRow, 4, CountSemiColon( str ) + 1, eeAutoQuote );
}
var end = new Date().getTime();
var time = end - start;
alert( "Execution time: " + time + " ms" );
Third version
function CountSemiColon( str )
{
var count = 0;
for( var index = -1; ; ) {
index = str.indexOf( ';', index + 1 );
if( index == -1 ) {
break;
}
++count;
}
return count;
}
var start = new Date().getTime();
var totalLines = document.GetLines();
s1 = document.GetColumn( 3, "\n", eeCellIncludeQuotesAndDelimiter, 2, totalLines - 1 );
sLines = s1.split( "\n" );
s2 = "";
nTotal = sLines.length;
for( y = 0; y < nTotal; y++ ) {
s2 += CountSemiColon( sLines[y] ) + 1 + "\n";
}
x = s2.length;
if( x > 0 ) s2 = s2.substr( 0, x - 1 );
document.InsertColumn( 4, s2, "\n", eeDontQuote, 2 );
var end = new Date().getTime();
var time = end - start;
alert( "Execution time: " + time + " ms" );
Fourth version (returns 0 for empty cell)
function CountElements( str )
{
if( str.length == 0 || str == '\t' ) { // if empty string or delimiter only, return 0
return 0;
}
var count = 0;
for( var index = -1; ; ) {
index = str.indexOf( ';', index + 1 );
if( index == -1 ) {
break;
}
++count;
}
return count + 1; // add 1 to the Count
}
var start = new Date().getTime();
var totalLines = document.GetLines();
s1 = document.GetColumn( 3, "\n", eeCellIncludeQuotesAndDelimiter, 2, totalLines - 1 );
sLines = s1.split( "\n" );
s2 = "";
nTotal = sLines.length;
for( y = 0; y < nTotal; y++ ) {
s2 += CountElements( sLines[y] ) + "\n";
}
x = s2.length;
if( x > 0 ) s2 = s2.substr( 0, x - 1 );
document.InsertColumn( 4, s2, "\n", eeDontQuote, 2 );
var end = new Date().getTime();
var time = end - start;
alert( "Execution time: " + time + " ms" );
Test results:
1 million line, 52 MB CSV file.
If this is not fast enough, or "Out of Memory" error occurs, I will think about other methods or optimize even further, so please let me know.