Search code examples
javascriptdatabaseutf-8codepages

Properly parsing mixed codepage strings?


My database has got into a weird state: some text is in mixed encodings — partly plain windows-1252 and partly UTF-8 that was mis-decoded as windows-1252. Here's a sample:

"donc d'être transparent avec lui et surtout pas de minimiser à lâ€™oral pour le charger à l'écrit".

If I treat the string as UTF-8, the "â€™" correctly becomes an apostrophe, but then all the accented e's get converted to nonsense.

If I treat it as windows-1252, the accented e's stay in place, but then the "â€™" does not become an apostrophe.

Is there some way of explicitly converting only the utf-8 stuff?


Solution

    1. Get the windows-1252 code of each character by iterating over the string character by character (see function get8bitCodeArray).
    2. Build the output string (myNewStr): look for UTF-8 byte sequences in the array of windows-1252 codes and
    • if a sequence of at least 2 bytes is found, decode it (see TextDecoder) and use the decoded character;
    • otherwise, use the original character.

    // Reverse windows-1252 table: maps every windows-1252 character whose Unicode
    // code point is above 255 to its single-byte windows-1252 value.
    const cp1252dict = Object.freeze({'€': 128, '‚': 130, 'ƒ': 131, '„': 132, '…': 133, '†': 134, '‡': 135, 'ˆ': 136, '‰': 137, 'Š': 138, '‹': 139, 'Œ': 140, 'Ž': 142, '‘': 145, '’': 146, '“': 147, '”': 148, '•': 149, '–': 150, '—': 151, '˜': 152, '™': 153, 'š': 154, '›': 155, 'œ': 156, 'ž': 158, 'Ÿ': 159});

    /**
     * Convert a string into an array of 8-bit codes, one entry per UTF-16 code
     * unit, so that index i of the result corresponds to `aString.charAt(i)`.
     *
     * Code units below 256 are used as-is; anything else is looked up in
     * `cp1252dict`.
     *
     * @param {string} aString - Text to convert.
     * @returns {Array<number|undefined>} The 8-bit code of each code unit, or
     *     `undefined` for a character that is not present in `cp1252dict`.
     */
    function get8bitCodeArray(aString) {
        const codes = [];
        for (let ii = 0; ii < aString.length; ii++) {
            // Declared locally: assigning to undeclared names creates implicit
            // globals and throws a ReferenceError in strict mode / ES modules.
            const auxChar = aString.charAt(ii);
            const auxCode = auxChar.charCodeAt(0);
            codes.push(auxCode < 256 ? auxCode : cp1252dict[auxChar]);
        }
        return codes;
    }
    // Strict decoder: with `fatal: true`, decode() throws a TypeError on malformed
    // UTF-8 instead of silently substituting U+FFFD. That lets a byte run which
    // merely looks like UTF-8 (e.g. "À€" = C0 80) be detected and left untouched.
    const decoder = new TextDecoder('utf-8', { fatal: true });

    // Bit masks and expected values for the UTF-8 byte classes:
    //   [0] 0xxxxxxx  single byte (ASCII)
    //   [1] 110xxxxx  lead byte of a 2-byte sequence
    //   [2] 1110xxxx  lead byte of a 3-byte sequence
    //   [3] 11110xxx  lead byte of a 4-byte sequence
    //   [4] 10xxxxxx  continuation byte
    const mask = [128, 224, 240, 248, 192];
    const rslt = [0, 192, 224, 240, 128];

    /**
     * Return the length (2, 3 or 4) of the multi-byte UTF-8 pattern that starts
     * at `start` in `codes`, or 0 when there is none.
     *
     * Only the bit patterns are checked here; full validity is left to the
     * decoder. `undefined` entries never match, because `undefined & m` is 0.
     *
     * @param {Array<number|undefined>} codes - Output of get8bitCodeArray.
     * @param {number} start - Index of the candidate lead byte.
     * @returns {number} Sequence length in bytes, or 0.
     */
    function utf8SequenceLengthAt(codes, start) {
        for (let extra = 1; extra <= 3; extra++) {
            if ((codes[start] & mask[extra]) !== rslt[extra]) {
                continue;
            }
            // The whole sequence must fit inside the array.
            if (start + extra >= codes.length) {
                return 0;
            }
            for (let offset = 1; offset <= extra; offset++) {
                if ((codes[start + offset] & mask[4]) !== rslt[4]) {
                    return 0;
                }
            }
            return extra + 1;
        }
        return 0;
    }

    /**
     * Repair a string that mixes genuine windows-1252 text with UTF-8 that was
     * mis-decoded as windows-1252: every run of characters whose windows-1252
     * bytes form a valid multi-byte UTF-8 sequence is replaced by the decoded
     * character; every other character is copied unchanged.
     *
     * @param {string} text - The mixed-encoding text.
     * @returns {string} The repaired text.
     */
    function repairMixedUtf8(text) {
        const codes = get8bitCodeArray(text);
        let repaired = '';
        let ii = 0;
        while (ii < codes.length) {
            const seqLength = utf8SequenceLengthAt(codes, ii);
            let decoded = null;
            if (seqLength > 0) {
                try {
                    decoded = decoder.decode(new Uint8Array(codes.slice(ii, ii + seqLength)));
                } catch (err) {
                    if (!(err instanceof TypeError)) {
                        throw err;
                    }
                    // Matches the bit patterns but is not valid UTF-8 (overlong
                    // form, surrogate, out of range): keep the original text.
                }
            }
            if (decoded === null) {
                repaired += text.charAt(ii);
                ii += 1;
            } else {
                repaired += decoded;
                ii += seqLength;
            }
        }
        return repaired;
    }

    // Demo input. "\u00E2\u20AC\u2122" is "â€™", i.e. the UTF-8 bytes E2 80 99 of
    // the apostrophe U+2019 read as windows-1252; it is written with escapes so
    // that it survives tools that try to "fix" mojibake in source text.
    const myOldStr = "d'être … à l\u00E2\u20AC\u2122oral … à l'écrit";
    const myNewStr = repairMixedUtf8(myOldStr);
    console.log(myOldStr);
    console.log(myNewStr);

    Sorry, I'm a JavaScript noob so my code could look a bit artless.


    The key parts of the above code snippet were computed in Python as follows:

    mask and rslt arrays:

    # Bit layout of each UTF-8 byte class; 'x' marks a payload bit.
    # https://en.wikipedia.org/wiki/UTF-8#Encoding
    UTF8_BIN_PATTERNS = [
      '0xxxxxxx',  # single byte:   U+0000..U+007F
      '110xxxxx',  # 2-byte lead:   U+0080..U+07FF
      '1110xxxx',  # 3-byte lead:   U+0800..U+FFFF
      '11110xxx',  # 4-byte lead:   U+10000..U+10FFFF
      '10xxxxxx',  # continuation byte
    ]
    # Mask: every fixed bit becomes 1, every payload bit becomes 0.
    UTF8_BIN_MASKSS = [
        ''.join('0' if bit == 'x' else '1' for bit in pattern)
        for pattern in UTF8_BIN_PATTERNS
    ]
    # Expected value under the mask: fixed bits kept, payload bits zeroed.
    UTF8_BIN_VALUES = [
        ''.join('0' if bit == 'x' else bit for bit in pattern)
        for pattern in UTF8_BIN_PATTERNS
    ]
    INT_MASKSS = [int(bits, base=2) for bits in UTF8_BIN_MASKSS]
    INT_VALUES = [int(bits, base=2) for bits in UTF8_BIN_VALUES]
    # debugging print
    # print( UTF8_BIN_PATTERNS, UTF8_BIN_MASKSS, UTF8_BIN_VALUES, sep = '\n')
    print(INT_MASKSS, INT_VALUES, sep='\n')
    
    [128, 224, 240, 248, 192]
    [0, 192, 224, 240, 128]
    

    cp1252dict dictionary:

    # Map each windows-1252 character that lies above U+00FF to its byte value.
    cp1252dict = {}
    for byte_value in range(128, 256):
        decoded_chr = bytes([byte_value]).decode('cp1252', 'replace')
        code_point = ord(decoded_chr)
        # Skip characters that already fit in 8 bits, and U+FFFD (65533), which
        # the 'replace' error handler yields for bytes cp1252 does not define.
        if code_point <= 255 or code_point == 65533:
            continue
        cp1252dict[decoded_chr] = byte_value

    cp1252dict
    
    
    {'€': 128, '‚': 130, 'ƒ': 131, '„': 132, '…': 133, '†': 134, '‡': 135, 'ˆ': 136, '‰': 137, 'Š': 138, '‹': 139, 'Œ': 140, 'Ž': 142, '‘': 145, '’': 146, '“': 147, '”': 148, '•': 149, '–': 150, '—': 151, '˜': 152, '™': 153, 'š': 154, '›': 155, 'œ': 156, 'ž': 158, 'Ÿ': 159}