There are several possible representations of Unicode text, e.g. UTF-8, UTF-16, UTF-32, etc.
UTF-16 is the default Unicode encoding form used by Windows.
UTF-8 is a common encoding form used to exchange text data on the Internet.
One of the advantages of UTF-8 is that there is no endianness problem (i.e. big-endian vs. little-endian), because UTF-8 is interpreted simply as a sequence of bytes. By contrast, for UTF-16 and UTF-32 it is important to specify the correct endianness of the code units.
//----------------------------------------------------------------------------
// FUNCTION: ConvertUTF8ToUTF16
// DESC: Converts Unicode UTF-8 text to Unicode UTF-16 (Windows default).
// PARAMS: szTextUTF8 - source text; its bytes are interpreted as UTF-8.
// RETURNS: The converted UTF-16 text. An empty string is returned when the
//          input is empty, when its length does not fit in an int, or when
//          MultiByteToWideChar fails (invalid UTF-8 is rejected because
//          MB_ERR_INVALID_CHARS is set). Failures are reported through
//          OutputDebugStringA; in debug builds they also trip an assert.
// NOTES: The whole input is converted, including any embedded \0 characters.
//----------------------------------------------------------------------------
wstring ConvertUTF8ToUTF16( IN const string& szTextUTF8)
{
    //
    // Special case empty input string
    //
    if ( szTextUTF8.empty() )
    {
        return L"";
    }

    //
    // MultiByteToWideChar takes the source length as an int: refuse inputs
    // whose length cannot be represented instead of silently truncating it.
    //
    const int cbUTF8 = static_cast<int>( szTextUTF8.length() );
    if ( cbUTF8 <= 0 ||
         static_cast<string::size_type>( cbUTF8 ) != szTextUTF8.length() )
    {
        stringstream ssError;
        ssError << "Error in " << __func__
                << ": input string is too long to convert\n";
        OutputDebugStringA( ssError.str().c_str() );
        return L"";
    }

    //
    // Get size of destination UTF-16 buffer, in WCHAR's.
    // An explicit source length is passed (no terminating \0), so the
    // returned count does not include a terminator either.
    //
    const int cchUTF16 = ::MultiByteToWideChar(
        CP_UTF8,                // convert from UTF-8
        MB_ERR_INVALID_CHARS,   // error on invalid chars
        szTextUTF8.data(),      // source UTF-8 string
        cbUTF8,                 // length of source UTF-8 string, in CHAR's (= bytes)
        NULL,                   // unused - no conversion done in this step
        0                       // request size of destination buffer, in WCHAR's
        );
    assert( cchUTF16 != 0 );
    if ( cchUTF16 == 0 )
    {
        // Read the error code first, before any other call can overwrite it
        const DWORD dwError = ::GetLastError();
        stringstream ssError;
        ssError << "Error in " << __func__
                << ": MultiByteToWideChar (size query) failed, GetLastError = "
                << dwError << "\n";
        OutputDebugStringA( ssError.str().c_str() );
        return L"";
    }

    //
    // Let the result string own the destination buffer: no manual
    // new[]/delete[], hence nothing to leak if an exception is thrown.
    //
    wstring strUTF16( static_cast<wstring::size_type>( cchUTF16 ), L'\0' );

    //
    // Do the conversion from UTF-8 to UTF-16
    //
    const int result = ::MultiByteToWideChar(
        CP_UTF8,                // convert from UTF-8
        MB_ERR_INVALID_CHARS,   // error on invalid chars
        szTextUTF8.data(),      // source UTF-8 string
        cbUTF8,                 // length of source UTF-8 string, in CHAR's (= bytes)
        &strUTF16[0],           // destination buffer
        cchUTF16                // size of destination buffer, in WCHAR's
        );
    assert( result != 0 );
    if ( result == 0 )
    {
        const DWORD dwError = ::GetLastError();
        stringstream ssError;
        ssError << "Error in " << __func__
                << ": MultiByteToWideChar (conversion) failed, GetLastError = "
                << dwError << "\n";
        OutputDebugStringA( ssError.str().c_str() );
        // Do not hand back a buffer the API never filled in
        return L"";
    }

    // Keep exactly the WCHAR's that were written
    strUTF16.resize( static_cast<wstring::size_type>( result ) );

    // Return resulting UTF-16 string
    return strUTF16;
}
//----------------------------------------------------------------------------
// FUNCTION: ConvertUTF16ToUTF8
// DESC: Converts Unicode UTF-16 (Windows default) text to Unicode UTF-8.
// PARAMS: szTextUTF16 - source text; its WCHAR's are interpreted as UTF-16.
// RETURNS: The converted UTF-8 text. An empty string is returned when the
//          input is empty, when its length does not fit in an int, or when
//          WideCharToMultiByte fails. Failures are reported through
//          OutputDebugStringA; in debug builds they also trip an assert.
// NOTES: The whole input is converted, including any embedded \0 characters.
//        When built with WINVER >= 0x0600, invalid UTF-16 input makes the
//        conversion fail (WC_ERR_INVALID_CHARS).
//----------------------------------------------------------------------------
string ConvertUTF16ToUTF8( IN const wstring& szTextUTF16 )
{
    //
    // Special case of empty input string
    //
    if ( szTextUTF16.empty() )
    {
        // Return empty string
        return "";
    }

    //
    // WideCharToMultiByte takes the source length as an int: refuse inputs
    // whose length cannot be represented instead of silently truncating it.
    //
    const int cchUTF16 = static_cast<int>( szTextUTF16.length() );
    if ( cchUTF16 <= 0 ||
         static_cast<wstring::size_type>( cchUTF16 ) != szTextUTF16.length() )
    {
        stringstream ssError;
        ssError << "Error in " << __func__
                << ": input string is too long to convert\n";
        OutputDebugStringA( ssError.str().c_str() );
        return "";
    }

    //
    // WC_ERR_INVALID_CHARS flag is set to fail if invalid input character
    // is encountered.
    // This flag is supported on Windows Vista and later.
    // Don't use it on Windows XP and previous.
    //
#if (WINVER >= 0x0600)
    const DWORD dwConversionFlags = WC_ERR_INVALID_CHARS;
#else
    const DWORD dwConversionFlags = 0;
#endif

    //
    // Get size of destination UTF-8 buffer, in CHAR's (= bytes).
    // An explicit source length is passed (no terminating \0), so the
    // returned count does not include a terminator either.
    //
    const int cbUTF8 = ::WideCharToMultiByte(
        CP_UTF8,                // convert to UTF-8
        dwConversionFlags,      // specify conversion behavior
        szTextUTF16.data(),     // source UTF-16 string
        cchUTF16,               // source string length, in WCHAR's
        NULL,                   // unused - no conversion required in this step
        0,                      // request buffer size
        NULL, NULL              // unused (must be NULL for CP_UTF8)
        );
    assert( cbUTF8 != 0 );
    if ( cbUTF8 == 0 )
    {
        // Read the error code first, before any other call can overwrite it
        const DWORD dwError = ::GetLastError();
        stringstream ssError;
        ssError << "Error in " << __func__
                << ": WideCharToMultiByte (size query) failed, GetLastError = "
                << dwError << "\n";
        OutputDebugStringA( ssError.str().c_str() );
        return "";
    }

    //
    // Let the result string own the destination buffer: no manual
    // new[]/delete[], hence nothing to leak if an exception is thrown.
    //
    string strUTF8( static_cast<string::size_type>( cbUTF8 ), '\0' );

    //
    // Do the conversion from UTF-16 to UTF-8
    //
    const int result = ::WideCharToMultiByte(
        CP_UTF8,                // convert to UTF-8
        dwConversionFlags,      // specify conversion behavior
        szTextUTF16.data(),     // source UTF-16 string
        cchUTF16,               // source string length, in WCHAR's
        &strUTF8[0],            // destination buffer
        cbUTF8,                 // destination buffer size, in bytes
        NULL, NULL              // unused (must be NULL for CP_UTF8)
        );
    assert( result != 0 );
    if ( result == 0 )
    {
        const DWORD dwError = ::GetLastError();
        stringstream ssError;
        ssError << "Error in " << __func__
                << ": WideCharToMultiByte (conversion) failed, GetLastError = "
                << dwError << "\n";
        OutputDebugStringA( ssError.str().c_str() );
        // Do not hand back a buffer the API never filled in
        return "";
    }

    // Keep exactly the bytes that were written
    strUTF8.resize( static_cast<string::size_type>( result ) );

    // Return resulting UTF-8 string
    return strUTF8;
}
References: UTF-8, UTF-16, UTF-32 & BOM
No comments:
Post a Comment