Search code examples
c++cunicodecharacter-encodingicu

ICU: ucnv_convertEx – detect encoding error on the fly


Is it possible to detect encoding errors with ICU at conversion time, or is it necessary to pre- or post-check the conversion?

Given the initialization where a conversion from UTF8 to UTF32 is set up:

#include <stdio.h>
#include "unicode/ucnv.h"     /* C Converter API */

static void eval(UConverter* from, UConverter* to);

/* Opens a UTF-8 converter and a UTF-32 converter, runs eval() on them and
 * releases both converters again.
 *
 * Returns 0 after eval() has run, 1 if either converter could not be opened. */
int main(int argc, char** argv)
{
    UConverter*  from;
    UConverter*  to;
    UErrorCode   status;

    /* Initialize converter from UTF8 to Unicode ___________________________*/
    status = U_ZERO_ERROR;
    from   = ucnv_open("UTF-8", &status);
    if( ! from || U_FAILURE(status) ) {
        if( from ) ucnv_close(from);
        return 1;
    }
    status = U_ZERO_ERROR;
    to     = ucnv_open("UTF32", &status);
    if( ! to || U_FAILURE(status) ) {
        /* Do not leak the converter(s) that were already opened. */
        if( to ) ucnv_close(to);
        ucnv_close(from);
        return 1;
    }
    /*______________________________________________________________________*/

    eval(from, to);

    /* Every converter obtained from ucnv_open() must be closed again. */
    ucnv_close(to);
    ucnv_close(from);
    return 0;
}

Then, applying the conversion using ucnv_convertEx via

/* Converts a fixed, malformed UTF8 byte sequence with ucnv_convertEx() from
 * the 'from' converter's encoding to the 'to' converter's encoding and prints
 * how far the source pointer advanced, the resulting status name and the
 * 32-bit units written to the output buffer. */
static void eval(UConverter* from, UConverter* to)
{
    /* UTF8 sequence with error in third byte ______________________________*/
    const char  source[]   = { "\xED\x8A\x0A\x0A" };
    const char* source_p   = source;
    /* One past the array's last element; the array includes the string's
     * terminating NUL, so that byte is part of the converted input. */
    const char* source_end = source + sizeof(source);

    uint32_t    drain[1024];
    uint32_t*   drain_p = drain;
    uint32_t*   p;
    UErrorCode  status  = U_ZERO_ERROR;

    ucnv_convertEx(to, from,
                   (char**)&drain_p, (char*)(drain + 1024),
                   &source_p, source_end,
                   NULL, NULL, NULL, NULL,
                   /* reset = */TRUE, /* flush = */TRUE,
                   &status);

    /* Print conversion result _____________________________________________*/
    printf("source_p: source + %i;\n", (int)(source_p - source));
    printf("status:   %s;\n", u_errorName(status));
    printf("drain:    (n=%i)[", (int)(drain_p - drain));
    p = drain;
    while( p != drain_p ) {
        printf("%06X ", (int)*p);
        ++p;
    }
    printf("]\n");
}

where source contains an inadmissible UTF8 code unit sequence, the function should somehow report an error. Storing the above fragments in "test.c" and compiling the above code with

$ gcc test.c $(icu-config --ldflags) -o test

The output of ./test is (surprisingly):

source_p: source + 5;
status:   U_ZERO_ERROR;
drain:    (n=5)[00FEFF 00FFFD 00000A 00000A 000000 ]

So, no obvious sign of a detected error. Can error detection be done more elegantly than manually checking the content?


Solution

  • As @Eljay suggests in the comments, you can use an error callback. You don't even need to write your own, since the built-in UCNV_TO_U_CALLBACK_STOP will do what you want (i.e., return a failure for any bad characters).

    int TestIt()
    {
      UConverter* utf8conv{};
      UConverter* utf32conv{};
      UErrorCode status{ U_ZERO_ERROR };
    
      utf8conv = ucnv_open("UTF8", &status);
    
      if (!U_SUCCESS(status))
      {
        return 1;
      }
    
      utf32conv = ucnv_open("UTF32", &status);
    
      if (!U_SUCCESS(status))
      {
        return 2;
      }
    
      const char source[] =  { "\xED\x8A\x0A\x0A" };
      uint32_t target[10]{ 0 };
    
      ucnv_setToUCallBack(utf8conv, UCNV_TO_U_CALLBACK_STOP, nullptr, 
        nullptr, nullptr, &status);
    
      if (!U_SUCCESS(status))
      {
        return 3;
      }
    
      auto sourcePtr = source;
      auto sourceEnd = source + ARRAYSIZE(source);
      auto targetPtr = target;
      auto targetEnd = reinterpret_cast<const char*>(target + ARRAYSIZE(target));
    
      ucnv_convertEx(utf32conv, utf8conv, reinterpret_cast<char**>(&targetPtr),
        targetEnd, &sourcePtr, sourceEnd, nullptr, nullptr, nullptr, nullptr, 
        TRUE, TRUE, &status);
    
      if (!U_SUCCESS(status))
      {
        return 4;
      }
    
      printf("Converted '%s' to '", source);
      for (auto start = target; start != targetPtr; start++)
      {
        printf("\\x%x", *start);
      }
      printf("'\r\n");
    
      return 0;
    }
    

    This should return 4 when the input contains an invalid byte sequence (such as the malformed UTF-8 above), and print out the UTF-32 values if it was successful. It's unlikely we'd get an error from ucnv_setToUCallBack, but we check just in case. In the example above, we pass nullptr for the previous action since we don't care what it was and don't need to reset it.