Search code examples
shell, batch-file, cmd, cygwin

Converting unicode entities in Unicode text


I have a text file full of Unicode escape sequences ("unicode entities"). Is there a way to convert all of those entities to their equivalent text using cmd/batch or Cygwin? I can't seem to find much information about this.

e.g. journal\u0027s would become journal's


Solution

  • Save this with a .bat extension, e.g. decodeStrings.bat:

    0</* :
    @echo off
    rem Batch and JScript hybrid header. The JScript parser treats everything from
    rem the comment opener on the first line to the closer on the exit line as one
    rem block comment, so these lines are only acted on by the batch interpreter.
    
        rem Run this same file under cscript with the JScript engine, forwarding all arguments.
        cscript /nologo /E:jscript "%~f0" %*
    
    rem Hand the exit code of cscript back to the caller and stop batch processing here.
    exit /b %errorlevel% */0;
    
    
        // Single-character escapes: the letter that follows the backslash mapped to
        // the control character it stands for. decodeJsEscape consults this table
        // for escapes that are neither \u, \x nor octal.
        var jsEscapes = {
          'n': '\n',
          'r': '\r',
          't': '\t',
          'f': '\f',
          // Written as \x0B on purpose: classic JScript (the engine started by
          // cscript /E:jscript) does not recognise the \v escape and would store
          // the plain letter "v" here instead of a vertical tab.
          'v': '\x0B',
          'b': '\b'
        };
    
    
        //string evaluation
        //http://stackoverflow.com/questions/24294265/how-to-re-enable-special-character-sequneces-in-javascript
    
        // Replacement callback for the escape-matching regex in decodeJsString.
        //   _      - the whole matched escape (unused)
        //   hex0   - four hex digits captured after \u, if any
        //   hex1   - two hex digits captured after \x, if any
        //   octal  - octal digits captured after the backslash, if any
        //   other  - any other single character captured after the backslash
        // Returns the text the escape stands for; an escape letter that is not in
        // jsEscapes is returned as the bare character.
        function decodeJsEscape(_, hex0, hex1, octal, other) {
          var hexDigits = hex0 || hex1;
          var charCode;

          if (hexDigits) {
            charCode = parseInt(hexDigits, 16);
          } else if (octal) {
            charCode = parseInt(octal, 8);
          } else {
            var mapped = jsEscapes[other];
            return mapped ? mapped : other;
          }

          return String.fromCharCode(charCode);
        }
    
        // Expands JavaScript-style backslash escapes found in s and returns the
        // resulting string. Each match is handed to decodeJsEscape.
        function decodeJsString(s) {
          // Capture groups: 1 = four hex digits after \u, 2 = two hex digits after \x,
          // 3 = octal digits (at most three, value no larger than 0377),
          // 4 = any other single character following the backslash.
          var escapePattern = /\\(?:u([0-9A-Fa-f]{4})|x([0-9A-Fa-f]{2})|([0-3][0-7]{0,2}|[4-7][0-7]?)|(.))/g;
          var decoded = s.replace(escapePattern, decodeJsEscape);
          return decoded;
        }
    
        var ARGS = WScript.Arguments;

        // Echo the decoded form of every command-line argument, one Echo call each.
        var argIndex = 0;
        while (argIndex < ARGS.Length) {
            WScript.Echo(decodeJsString(ARGS.Item(argIndex)));
            argIndex++;
        }
    

    And use it like this:

    call decodeStrings.bat  journal\u0027s journal\u0027s
    

    output:

    journal's
    journal's
    

    Here's a script that evaluates files (just pass as many files as you want):

    0</* :
    @echo off
    rem Batch and JScript hybrid header. The JScript parser treats everything from
    rem the comment opener on the first line to the closer on the exit line as one
    rem block comment, so these lines are only acted on by the batch interpreter.
    
        rem Run this same file under cscript with the JScript engine, forwarding all arguments.
        cscript /nologo /E:jscript "%~f0" %*
    
    rem Hand the exit code of cscript back to the caller and stop batch processing here.
    exit /b %errorlevel% */0;
    
        var ARGS = WScript.Arguments;

        // At least one argument is required; otherwise report and exit with code 1.
        if (ARGS.Length < 1) {
            WScript.Echo("Wrong arguments");
            WScript.Quit(1);
        }

        // A first argument of -h or -help (case-insensitive) prints usage and exits with code 0.
        var firstArg = ARGS.Item(0).toLowerCase();
        if (firstArg == "-h" || firstArg == "-help") {
            WScript.Echo("Evaluates unicode/special sequences in file");
            WScript.Echo(WScript.ScriptName + " path_to_file [path_to_file]");
            WScript.Quit(0);
        }
    
    
        // Single-character escapes: the letter that follows the backslash mapped to
        // the control character it stands for. decodeJsEscape consults this table
        // for escapes that are neither \u, \x nor octal.
        var jsEscapes = {
          'n': '\n',
          'r': '\r',
          't': '\t',
          'f': '\f',
          // Written as \x0B on purpose: classic JScript (the engine started by
          // cscript /E:jscript) does not recognise the \v escape and would store
          // the plain letter "v" here instead of a vertical tab.
          'v': '\x0B',
          'b': '\b'
        };
    
    
        //string evaluation
        //http://stackoverflow.com/questions/24294265/how-to-re-enable-special-character-sequneces-in-javascript
    
        // Replacement callback for the escape-matching regex in decodeJsString.
        //   _      - the whole matched escape (unused)
        //   hex0   - four hex digits captured after \u, if any
        //   hex1   - two hex digits captured after \x, if any
        //   octal  - octal digits captured after the backslash, if any
        //   other  - any other single character captured after the backslash
        // Returns the text the escape stands for; an escape letter that is not in
        // jsEscapes is returned as the bare character.
        function decodeJsEscape(_, hex0, hex1, octal, other) {
          var hexDigits = hex0 || hex1;
          var charCode;

          if (hexDigits) {
            charCode = parseInt(hexDigits, 16);
          } else if (octal) {
            charCode = parseInt(octal, 8);
          } else {
            var mapped = jsEscapes[other];
            return mapped ? mapped : other;
          }

          return String.fromCharCode(charCode);
        }
    
        // Expands JavaScript-style backslash escapes found in s and returns the
        // resulting string. Each match is handed to decodeJsEscape.
        function decodeJsString(s) {
          // Capture groups: 1 = four hex digits after \u, 2 = two hex digits after \x,
          // 3 = octal digits (at most three, value no larger than 0377),
          // 4 = any other single character following the backslash.
          var escapePattern = /\\(?:u([0-9A-Fa-f]{4})|x([0-9A-Fa-f]{2})|([0-3][0-7]{0,2}|[4-7][0-7]?)|(.))/g;
          var decoded = s.replace(escapePattern, decodeJsEscape);
          return decoded;
        }
    
    
    
    
      // Reads `file` through an ADODB text stream using the iso-8859-1 charset and
      // returns its content as a string in which no character code exceeds 255
      // (apart from any unexpected character, which is passed through unchanged).
      //   file - path of the file to read
      function getContent(file) {
            // :: http://www.dostips.com/forum/viewtopic.php?f=3&t=3855&start=15&p=28898  ::
            var ado = WScript.CreateObject("ADODB.Stream");
            ado.Type = 2;  // adTypeText = 2

            ado.CharSet = "iso-8859-1";  // code page with minimum adjustments for input
            ado.Open();
            ado.LoadFromFile(file);

            // Position i in this table holds the character that has to be turned
            // back into char code 128 + i. The entries match what Windows-1252
            // assigns to bytes 0x80-0x9F, presumably because the stream decodes
            // that byte range as Windows-1252 rather than as C1 control codes.
            var adjustment = "\u20AC\u0081\u201A\u0192\u201E\u2026\u2020\u2021" +
                             "\u02C6\u2030\u0160\u2039\u0152\u008D\u017D\u008F" +
                             "\u0090\u2018\u2019\u201C\u201D\u2022\u2013\u2014" +
                             "\u02DC\u2122\u0161\u203A\u0153\u009D\u017E\u0178" ;


            // The file size in bytes is used as the number of characters to read.
            var fs = new ActiveXObject("Scripting.FileSystemObject");
            var size = (fs.getFile(file)).size;

            var lnkBytes = ado.ReadText(size);
            ado.Close();
            var chars=lnkBytes.split('');
            // Iterate over the characters actually read, not the byte size, so a
            // shorter-than-expected read cannot index past the end of the array.
            for (var indx=0;indx<chars.length;indx++) {
                if ( chars[indx].charCodeAt(0) > 255 ) {
                   var pos = adjustment.indexOf(chars[indx]);
                   // Leave characters that are not in the table untouched instead
                   // of turning them into char code 127 (128 + -1).
                   if ( pos >= 0 ) {
                      chars[indx] = String.fromCharCode(128 + pos);
                   }
                }
            }
            return chars.join("");
       }
    
       // Writes `content` to `file` through an ADODB text stream using the
       // iso-8859-1 charset, replacing any existing file.
       //   file    - path of the file to write
       //   content - text to store
       // NOTE(review): characters above U+00FF presumably cannot be represented in
       // this charset - confirm how the stream handles them after decoding.
       function writeContent(file,content) {
            var AD_TYPE_TEXT = 2;
            var AD_SAVE_CREATE_OVERWRITE = 2;

            var stream = WScript.CreateObject("ADODB.Stream");
            stream.Type = AD_TYPE_TEXT;
            stream.CharSet = "iso-8859-1";  // right code page for output (no adjustments)
            stream.Open();

            stream.WriteText(content);
            stream.SaveToFile(file, AD_SAVE_CREATE_OVERWRITE);
            stream.Close();
       }
    
        // Rewrite every file named on the command line in place: read it, decode
        // its escape sequences, and save the result under the same path.
        for (var i = 0, total = ARGS.Length; i < total; i++) {
            var path = ARGS.Item(i);
            WScript.Echo("Processing: "+path);
            writeContent(path, decodeJsString(getContent(path)));
        }