Search code examples
javascriptperformanceemeditor

Optimised EmEditor Macro to Element Sort, Split, and Dedupe a single Column and Extract Count


I currently have a separated file in this format (3 columns tab "\t" separated), and “;” separates all the elements within the columns).

    COL1\tCOL2\tCOL3
    abc\t123;1q\tapple\t
    dfg\t234;2w\tapple;apple\t
    hij\t345;3e\tbanana;apple;cherry;\t
    klm\t456;4r\tapple;banana;cherry;banana;cherry;\t
    nop\t567;5t\t;;apple;banana;cherry;banana;;cherry;;\t

I would like any thoughts on an optimized macro (ideally javascript) to manipulate the file to OUTPUT this: Column 3 is now sorted (any extra/unrequired delimiters are removed as well)and duplicates removed. New column 4 is the deduplicated element count.

    abc\t123;1q\tapple\t1
    dfg\t234;2w\tapple\t1
    hij\t345;3e\tapple;banana;cherry\t3
    klm\t456;4r\tapple;banana;cherry\t3
    nop\t567;5t\tapple;banana;cherry\t3

I have been trying similar to the below, but this method could be quicker I think.

    for( iRow = 2; iRow <= totalLines; iRow++ ) { //traverse eash row, start at 2nd row
      str = document.GetCell(iRow, 2, eeCellIncludeQuotes);
      var count = (str.match(/;/g) || []).length;
      var numOfElements = count + 1;
      document.SetCell( iRow, 3, numOfElements, eeAutoQuote );
    }

So the user should select the column they want to run this on (Column 3 in this example) and the macro would run on that column only, and output the count to a new column to the right.

The real source files will be millions of lines, so if this could be EmEditor optimized in any way, that would be great.


Solution

  • I optimized your macro by creating a function to count semicolons in a string rather than using a regular expression (Second version), and also used GetColumn and SetColumn methods to increase the speed (Third version). The third version will insert a column rather than overwrite the existing column.

    1. Original macro (modified for correctness and timing)

      var start = new Date().getTime();
      
      var totalLines = document.GetLines();
      for( iRow = 2; iRow <= totalLines; iRow++ ) { //traverse eash row, start at 2nd row
          str = document.GetCell(iRow, 3, eeCellIncludeQuotes);
          var count = (str.match(/;/g) || []).length;
          var numOfElements = count + 1;
          document.SetCell( iRow, 4, numOfElements, eeAutoQuote );
      }
      
      var end = new Date().getTime();
      var time = end - start;
      alert( "Execution time: " + time + " ms" );
      
    2. Second version

      function CountSemiColon( str )
      {
          var count = 0;
          for( var index = -1; ; ) {
              index = str.indexOf( ';', index + 1 );
              if( index == -1 ) {
                  break;
              }
              ++count;
          }
          return count;
      }
      
      var start = new Date().getTime();
      
      var totalLines = document.GetLines();
      for( iRow = 2; iRow <= totalLines; iRow++ ) { //traverse eash row, start at 2nd row
          var str = document.GetCell(iRow, 3, eeCellIncludeQuotes);
          document.SetCell( iRow, 4, CountSemiColon( str ) + 1, eeAutoQuote );
      }
      
      var end = new Date().getTime();
      var time = end - start;
      alert( "Execution time: " + time + " ms" );
      
    3. Third version

      function CountSemiColon( str )
      {
          var count = 0;
          for( var index = -1; ; ) {
              index = str.indexOf( ';', index + 1 );
              if( index == -1 ) {
                  break;
              }
              ++count;
          }
          return count;
      }
      
      var start = new Date().getTime();
      
      var totalLines = document.GetLines();
      s1 = document.GetColumn( 3, "\n", eeCellIncludeQuotesAndDelimiter, 2, totalLines - 1 );
      sLines = s1.split( "\n" );
      s2 = "";
      nTotal = sLines.length;
      for( y = 0; y < nTotal; y++ ) {
          s2 += CountSemiColon( sLines[y] ) + 1 + "\n";
      }
      x = s2.length;
      if( x > 0 ) s2 = s2.substr( 0, x - 1 );
      document.InsertColumn( 4, s2, "\n", eeDontQuote, 2 );
      
      var end = new Date().getTime();
      var time = end - start;
      alert( "Execution time: " + time + " ms" );
      
    4. Fourth version (returns 0 for empty cell)

      function CountElements( str )
      {
          if( str.length == 0 || str == '\t' ) {   // if empty string or delimiter only, return 0
              return 0;
          }
          var count = 0;
          for( var index = -1; ; ) {
              index = str.indexOf( ';', index + 1 );
              if( index == -1 ) {
                  break;
              }
              ++count;
          }
          return count + 1;   // add 1 to the Count
      }
      
      var start = new Date().getTime();
      
      var totalLines = document.GetLines();
      s1 = document.GetColumn( 3, "\n", eeCellIncludeQuotesAndDelimiter, 2, totalLines - 1 );
      sLines = s1.split( "\n" );
      s2 = "";
      nTotal = sLines.length;
      for( y = 0; y < nTotal; y++ ) {
          s2 += CountElements( sLines[y] ) + "\n";
      }
      x = s2.length;
      if( x > 0 ) s2 = s2.substr( 0, x - 1 );
      document.InsertColumn( 4, s2, "\n", eeDontQuote, 2 );
      
      var end = new Date().getTime();
      var time = end - start;
      alert( "Execution time: " + time + " ms" );
      

    Test results:

    1. 10429 ms
    2. 8496 ms
    3. 1803 ms
    4. 1890 ms

    1 million line, 52 MB CSV file.

    If this is not fast enough, or "Out of Memory" error occurs, I will think about other methods or optimize even further, so please let me know.