We have a C++ application deployed on RHEL that uses ICU.
We have a situation where we need to convert a UChar* string to a wchar_t* string on Linux. We use u_strToWCS to perform the conversion.
#include <iostream>
#include <wchar.h>
#include "unicode/ustring.h"

void convertUnicodeStringtoWideChar(const UChar* cuniszSource,
                                    const int32_t cunii32SourceLength,
                                    wchar_t*& rpwcharDestination,
                                    int32_t& destCapacity)
{
    UErrorCode uniUErrorCode = U_ZERO_ERROR;
    int32_t pDestLength = 0;
    rpwcharDestination = 0;
    destCapacity = 0;

    // First call (preflight): with destCapacity == 0, u_strToWCS only
    // reports the required length in pDestLength.
    u_strToWCS(rpwcharDestination,
               destCapacity,
               &pDestLength,
               cuniszSource,
               cunii32SourceLength,
               &uniUErrorCode);

    // The preflight call sets U_BUFFER_OVERFLOW_ERROR; reset it before
    // the real conversion.
    uniUErrorCode = U_ZERO_ERROR;
    rpwcharDestination = new wchar_t[pDestLength+1];
    if(rpwcharDestination)
    {
        destCapacity = pDestLength+1;

        // Second call: perform the actual conversion into the buffer.
        u_strToWCS(rpwcharDestination,
                   destCapacity,
                   &pDestLength,
                   cuniszSource,
                   cunii32SourceLength,
                   &uniUErrorCode);

        destCapacity = wcslen(rpwcharDestination);
    }
} //function ends
int main()
{
    // a ä Š € ( 王 )
    // The last two UChar code units (0xd87e, 0xdd29) form a surrogate pair
    // for a supplementary-plane character.
    UChar input[20] = { 0x0061, 0x00e4, 0x0160, 0x20ac, 0xd87e, 0xdd29, 0x0000 };

    wchar_t* output;
    int32_t outlen = 0;

    convertUnicodeStringtoWideChar( input, 6, output, outlen );

    for ( int i = 0; i < outlen; ++i )
    {
        std::cout << std::hex << output[i] << "\n";
    }
    return 0;
}
This works fine for characters up to U+FFFF (UChar is implemented as a 16-bit uint16_t internally on Linux). It fails to convert characters outside the Basic Multilingual Plane (e.g. CJK Unified Ideographs Extension B).
Any ideas on how to perform the conversion?
Update 1: OK, I was looking in the wrong direction. u_strToWCS works fine. The problem arises because I need to pass that wide string to a Java application on Windows using CORBA. Since wchar_t on Linux is 32-bit, I need to find a way to convert the 32-bit wchar_t to a 16-bit wchar_t.
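Since ICU is already linked in, one possible alternative is to let ICU do this reverse step: u_strFromWCS converts a platform wchar_t string back into 16-bit UTF-16 UChar units (including the surrogate pairs). A minimal sketch, using the same preflight-then-convert pattern as above; the helper name wideCharToUTF16 is only illustrative:
#include "unicode/ustring.h"

// Sketch: convert a 32-bit wchar_t string to 16-bit UTF-16 code units via ICU.
UChar* wideCharToUTF16(const wchar_t* source, int32_t sourceLength)
{
    UErrorCode status = U_ZERO_ERROR;
    int32_t utf16Length = 0;

    // Preflight to learn the required number of UTF-16 code units.
    u_strFromWCS(NULL, 0, &utf16Length, source, sourceLength, &status);

    status = U_ZERO_ERROR;
    UChar* utf16 = new UChar[utf16Length + 1];
    u_strFromWCS(utf16, utf16Length + 1, &utf16Length,
                 source, sourceLength, &status);

    if(!U_SUCCESS(status))
    {
        delete[] utf16;
        return NULL;
    }
    return utf16;
}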
Update 2: The code which I have used can be found here
The following is the code to convert UTF-32 encoded wide characters to UTF-16
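As a quick worked example of the arithmetic the function uses: the supplementary character in main() above is the surrogate pair 0xD87E 0xDD29, i.e. code point U+2F929. Encoding it gives 0x2F929 - 0x10000 = 0x1F929, high surrogate 0xD800 + (0x1F929 >> 10) = 0xD87E and low surrogate 0xDC00 + (0x1F929 & 0x3FF) = 0xDD29, so the pair round-trips exactly.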
// Function to convert a Unicode string from platform-specific "wide characters"
// (32-bit wchar_t, i.e. UTF-32) to UTF-16 stored in wchar_t units.
void ConvertUTF32ToUTF16(wchar_t* source,
                         const uint32_t sourceLength,
                         wchar_t*& destination,
                         uint32_t& destinationLength)
{
    wchar_t  wcharCharacter;
    uint32_t uniui32Counter = 0;
    wchar_t* pwszDestinationStart = destination;
    wchar_t* sourceStart = source;

    destinationLength = 0; // counts the UTF-16 code units written

    if(0 != destination)
    {
        while(uniui32Counter < sourceLength)
        {
            wcharCharacter = *source++;
            if(wcharCharacter <= 0x0000FFFF)
            {
                /* UTF-16 surrogate values are illegal in UTF-32;
                   0xFFFF and 0xFFFE are both reserved values */
                if(wcharCharacter >= 0xD800 &&
                   wcharCharacter <= 0xDFFF)
                {
                    *destination++ = 0x0000FFFD; /* REPLACEMENT CHARACTER */
                    destinationLength += 1;
                }
                else
                {
                    /* source is a BMP character: copy it unchanged */
                    destinationLength += 1;
                    *destination++ = wcharCharacter;
                }
            }
            else if(wcharCharacter > 0x0010FFFF)
            {
                /* U+10FFFF is the largest code point of the Unicode Character Set */
                *destination++ = 0x0000FFFD; /* REPLACEMENT CHARACTER */
                destinationLength += 1;
            }
            else
            {
                /* source is a character in the range 0x10000 - 0x10FFFF:
                   encode it as a surrogate pair */
                wcharCharacter -= 0x0010000UL;
                *destination++ = (wchar_t)((wcharCharacter >> 10) + 0xD800);
                *destination++ = (wchar_t)((wcharCharacter & 0x3FFUL) + 0xDC00);
                destinationLength += 2;
            }
            ++uniui32Counter;
        }
        destination = pwszDestinationStart;
        destination[destinationLength] = L'\0';
    }
    source = sourceStart;
} //function ends
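For completeness, this is roughly how the two functions can be wired together (a sketch assuming both definitions above are visible; the buffer size 2 * wideLen + 1 is a worst-case assumption, since every UTF-32 unit can expand to at most two UTF-16 units):
#include <cstdint>
#include "unicode/ustring.h"

int main()
{
    // Same sample string as above: a, ä, Š, €, and one surrogate pair.
    UChar input[] = { 0x0061, 0x00e4, 0x0160, 0x20ac, 0xd87e, 0xdd29, 0x0000 };

    wchar_t* wide = 0;       // UTF-32 on Linux, allocated and filled via u_strToWCS
    int32_t  wideLen = 0;
    convertUnicodeStringtoWideChar(input, 6, wide, wideLen);

    // Worst case: every UTF-32 unit becomes two UTF-16 units, plus a terminator.
    wchar_t* utf16 = new wchar_t[2 * wideLen + 1];
    uint32_t utf16Len = 0;
    ConvertUTF32ToUTF16(wide, wideLen, utf16, utf16Len);

    // utf16 now holds utf16Len UTF-16 code units (stored in wchar_t slots),
    // ready to be marshalled as 16-bit values for the CORBA/Java side.

    delete[] utf16;
    delete[] wide;
    return 0;
}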