Thursday, April 8, 2010

Conversion between Unicode UTF-16 and UTF-8

There are several possible representations of Unicode text, e.g. UTF-8, UTF-16, UTF-32, etc.

UTF-16 is the default Unicode encoding form used by Windows.

UTF-8 is a common encoding form used to exchange text data on the Internet.
One of the advantages of UTF-8 is that there is no endianness problem (i.e. big-endian vs. little-endian), because UTF-8 is interpreted simply as a sequence of bytes; for UTF-16 and UTF-32, by contrast, it is important to specify the correct endianness of the code units.


//----------------------------------------------------------------------------
// FUNCTION: ConvertUTF8ToUTF16
//
// DESC: Converts Unicode UTF-8 text to Unicode UTF-16 (Windows default).
//
// PARAMS:
//   szTextUTF8 - source text encoded as UTF-8. Embedded \0 bytes are
//                converted like any other character (no truncation).
//
// RETURNS: The converted UTF-16 text. An empty string is returned when the
//          input is empty, when the input length does not fit in the 'int'
//          expected by MultiByteToWideChar, or when the conversion fails
//          (e.g. invalid UTF-8, since MB_ERR_INVALID_CHARS is requested).
//          Failures are reported through OutputDebugStringA.
//----------------------------------------------------------------------------
wstring ConvertUTF8ToUTF16( IN const string& szTextUTF8 )
{
    //
    // Special case empty input string
    //
    if ( szTextUTF8.empty() )
    {
        return wstring();
    }

    //
    // MultiByteToWideChar takes the source length as an 'int': reject input
    // whose length cannot be represented instead of passing a wrapped value.
    //
    const int cbUTF8 = static_cast<int>( szTextUTF8.length() );
    if ( cbUTF8 <= 0 ||
         static_cast<string::size_type>( cbUTF8 ) != szTextUTF8.length() )
    {
        stringstream ssError;
        ssError << "Error in " << __FUNCTION__
                << ": input string is too long to convert\n";
        OutputDebugStringA( ssError.str().c_str() );
        return wstring();
    }

    //
    // Get size of destination UTF-16 buffer, in WCHAR's.
    // The explicit source length (without the end-of-string \0) is passed,
    // so the returned size does not include a terminator either.
    //
    const int cchUTF16 = ::MultiByteToWideChar(
        CP_UTF8,                // convert from UTF-8
        MB_ERR_INVALID_CHARS,   // error on invalid chars
        szTextUTF8.data(),      // source UTF-8 string
        cbUTF8,                 // length of source UTF-8 string, in CHAR's (= bytes)
        NULL,                   // unused - no conversion done in this step
        0                       // request size of destination buffer, in WCHAR's
        );

    assert( cchUTF16 != 0 );

    if ( cchUTF16 == 0 )
    {
        // Read the error code first, before any other call can overwrite it
        const DWORD dwError = ::GetLastError();

        stringstream ssError;
        ssError << "Error in " << __FUNCTION__
                << ": MultiByteToWideChar failed, GetLastError() = "
                << dwError << "\n";
        OutputDebugStringA( ssError.str().c_str() );
        return wstring();
    }

    //
    // Allocate the destination string and convert straight into its storage:
    // no manual new[]/delete[], so nothing can leak if an exception is thrown.
    //
    wstring strUTF16( static_cast<wstring::size_type>( cchUTF16 ), L'\0' );

    //
    // Do the conversion from UTF-8 to UTF-16
    //
    const int result = ::MultiByteToWideChar(
        CP_UTF8,                // convert from UTF-8
        MB_ERR_INVALID_CHARS,   // error on invalid chars
        szTextUTF8.data(),      // source UTF-8 string
        cbUTF8,                 // length of source UTF-8 string, in CHAR's (= bytes)
        &strUTF16[0],           // destination buffer
        cchUTF16                // size of destination buffer, in WCHAR's
        );

    assert( result != 0 );

    if ( result == 0 )
    {
        const DWORD dwError = ::GetLastError();

        stringstream ssError;
        ssError << "Error in " << __FUNCTION__
                << ": MultiByteToWideChar failed, GetLastError() = "
                << dwError << "\n";
        OutputDebugStringA( ssError.str().c_str() );

        // The buffer content is not meaningful after a failed conversion
        return wstring();
    }

    // Return resulting UTF-16 string
    return strUTF16;
}


//----------------------------------------------------------------------------
// FUNCTION: ConvertUTF16ToUTF8
//
// DESC: Converts Unicode UTF-16 (Windows default) text to Unicode UTF-8.
//
// PARAMS:
//   szTextUTF16 - source text encoded as UTF-16. Embedded \0 characters are
//                 converted like any other character (no truncation).
//
// RETURNS: The converted UTF-8 text. An empty string is returned when the
//          input is empty, when the input length does not fit in the 'int'
//          expected by WideCharToMultiByte, or when the conversion fails.
//          When built with WINVER >= 0x0600, WC_ERR_INVALID_CHARS is
//          requested, so invalid UTF-16 input makes the conversion fail.
//          Failures are reported through OutputDebugStringA.
//----------------------------------------------------------------------------
string ConvertUTF16ToUTF8( IN const wstring& szTextUTF16 )
{
    //
    // Special case of empty input string
    //
    if ( szTextUTF16.empty() )
    {
        // Return empty string
        return string();
    }

    //
    // WideCharToMultiByte takes the source length as an 'int': reject input
    // whose length cannot be represented instead of passing a wrapped value.
    //
    const int cchUTF16 = static_cast<int>( szTextUTF16.length() );
    if ( cchUTF16 <= 0 ||
         static_cast<wstring::size_type>( cchUTF16 ) != szTextUTF16.length() )
    {
        stringstream ssError;
        ssError << "Error in " << __FUNCTION__
                << ": input string is too long to convert\n";
        OutputDebugStringA( ssError.str().c_str() );
        return string();
    }

    //
    // WC_ERR_INVALID_CHARS flag is set to fail if invalid input character
    // is encountered.
    // This flag is supported on Windows Vista and later.
    // Don't use it on Windows XP and previous.
    //
#if (WINVER >= 0x0600)
    const DWORD dwConversionFlags = WC_ERR_INVALID_CHARS;
#else
    const DWORD dwConversionFlags = 0;
#endif

    //
    // Get size of destination UTF-8 buffer, in CHAR's (= bytes).
    // The explicit source length (without the end-of-string \0) is passed,
    // so the returned size does not include a terminator either.
    //
    const int cbUTF8 = ::WideCharToMultiByte(
        CP_UTF8,                // convert to UTF-8
        dwConversionFlags,      // specify conversion behavior
        szTextUTF16.data(),     // source UTF-16 string
        cchUTF16,               // source string length, in WCHAR's
        NULL,                   // unused - no conversion required in this step
        0,                      // request buffer size
        NULL, NULL              // unused
        );

    assert( cbUTF8 != 0 );

    if ( cbUTF8 == 0 )
    {
        // Read the error code first, before any other call can overwrite it
        const DWORD dwError = ::GetLastError();

        stringstream ssError;
        ssError << "Error in " << __FUNCTION__
                << ": WideCharToMultiByte failed, GetLastError() = "
                << dwError << "\n";
        OutputDebugStringA( ssError.str().c_str() );
        return string();
    }

    //
    // Allocate the destination string and convert straight into its storage:
    // no manual new[]/delete[], so nothing can leak if an exception is thrown.
    //
    string strUTF8( static_cast<string::size_type>( cbUTF8 ), '\0' );

    //
    // Do the conversion from UTF-16 to UTF-8
    //
    const int result = ::WideCharToMultiByte(
        CP_UTF8,                // convert to UTF-8
        dwConversionFlags,      // specify conversion behavior
        szTextUTF16.data(),     // source UTF-16 string
        cchUTF16,               // source string length, in WCHAR's
        &strUTF8[0],            // destination buffer
        cbUTF8,                 // destination buffer size, in bytes
        NULL, NULL              // unused
        );

    assert( result != 0 );

    if ( result == 0 )
    {
        const DWORD dwError = ::GetLastError();

        stringstream ssError;
        ssError << "Error in " << __FUNCTION__
                << ": WideCharToMultiByte failed, GetLastError() = "
                << dwError << "\n";
        OutputDebugStringA( ssError.str().c_str() );

        // The buffer content is not meaningful after a failed conversion
        return string();
    }

    // Return resulting UTF-8 string
    return strUTF8;
}

References: UTF-8, UTF-16, UTF-32 & BOM

No comments:

Post a Comment