/* File: utf.c (code-viewer page header; "Font size:" control label) */
return (ui32)-1;
}
utf8_len -= code_len;
/*-----------------------------------------------------------------*/
/* Convert based on code length encoding. */
/*-----------------------------------------------------------------*/
switch (code_len)
{
/*---------------------------------------------------------------*/
/* 1-byte encoding. */
/*---------------------------------------------------------------*/
case 1:
{
/*-------------------------------------------------------------*/
/* This converts directly into a UTF16 code. */
/*-------------------------------------------------------------*/
*utf16 = byte1;
break;
}
/*---------------------------------------------------------------*/
/* 2-byte encoding. */
/*---------------------------------------------------------------*/
case 2:
{
/*-------------------------------------------------------------*/
/* Get the next byte and advance string pointer. */
/*-------------------------------------------------------------*/
byte2 = *utf8++;
/*-------------------------------------------------------------*/
/* Combine the 2 bytes to get a UTF16 code. */
/*-------------------------------------------------------------*/
*utf16 = ((byte1 & 0x1F) << 6) | (byte2 & 0x3F);
break;
}
/*---------------------------------------------------------------*/
/* 3-byte encoding. */
/*---------------------------------------------------------------*/
case 3:
{
/*-------------------------------------------------------------*/
/* Get the next 2 bytes and advance string pointer. */
/*-------------------------------------------------------------*/
byte2 = *utf8++;
byte3 = *utf8++;
/*-------------------------------------------------------------*/
/* Combine the 3 bytes to get a UTF16 code. */
/*-------------------------------------------------------------*/
*utf16 = ((byte1 & 0xF) << 12) | ((byte2 & 0x3F) << 6) |
(byte3 & 0x3F);
break;
}
/*---------------------------------------------------------------*/
/* 4-byte encoding. */
/*---------------------------------------------------------------*/
case 4:
{
/*-------------------------------------------------------------*/
/* Get the next 3 bytes and advance string pointer. */
/*-------------------------------------------------------------*/
byte2 = *utf8++;
byte3 = *utf8++;
byte4 = *utf8++;
/*-------------------------------------------------------------*/
/* This encoding translates into 2 UTF16 codes. */
/*-------------------------------------------------------------*/
u = ((byte1 & 0x7) << 2) | ((byte2 & 0x30) >> 4);
u -= 1;
x1 = ((byte2 & 0xF) << 2) | ((byte3 & 0x30) >> 4);
x2 = ((byte3 & 0xF) << 6) | (byte4 & 0x3F);
/*-------------------------------------------------------------*/
/* Set the first (surrogate) code of the UTF16 code. */
/*-------------------------------------------------------------*/
*utf16 = 0xD800 | (u << 6) | x1;
/*-------------------------------------------------------------*/
/* Check there is room for the additional code. */
/*-------------------------------------------------------------*/
if (++utf16_i >= max_utf16_len)
{
set_errno(ENOSPC);
return (ui32)-1;
}
/*-------------------------------------------------------------*/
/* Set the next code. */
/*-------------------------------------------------------------*/
*(++utf16) = 0xDC00 | x2;
break;
}
/*---------------------------------------------------------------*/
/* Shouldn't be here. */
/*---------------------------------------------------------------*/
default:
PfAssert(FALSE); /*lint !e506, !e774*/
set_errno(EINVAL);
return (ui32)-1;
}
}
return utf16_i;
}
/***********************************************************************/
/*  UTF16_UTF8: Convert a UTF16 encoded string to a UTF8 encoded one.  */
/*              Surrogate pairs become 4-byte UTF8 sequences; an       */
/*              unpaired or misordered surrogate fails with EINVAL.    */
/*                                                                     */
/*      Inputs: utf16 = original UTF16 encoded string                  */
/*              utf16_len = number of 16-bit words in utf16            */
/*              utf8 = place to store the conversion                   */
/*              max_utf8_len = size (in bytes) of the utf8 buffer      */
/*                                                                     */
/*     Returns: Number of UTF8 bytes stored, or (ui32)-1 after calling */
/*              set_errno() with EINVAL (bad argument or malformed     */
/*              input) or ENOSPC (output does not fit)                 */
/*                                                                     */
/*        Note: The result must be strictly shorter than max_utf8_len; */
/*              one byte always stays unused and no terminator is      */
/*              written by this routine.                               */
/*                                                                     */
/***********************************************************************/
ui32 UTF16_UTF8(const ui16 *utf16, ui32 utf16_len, ui8 *utf8,
                ui32 max_utf8_len)
{
  ui32 utf8_i = 0, u;
  ui16 word1, word2;

  /*-------------------------------------------------------------------*/
  /* Check the arguments are valid.                                    */
  /*-------------------------------------------------------------------*/
  if (utf16 == NULL || utf8 == NULL || max_utf8_len == 0)
  {
    set_errno(EINVAL);
    return (ui32)-1;
  }

  /*-------------------------------------------------------------------*/
  /* Go through UTF16 string code by code. A word is only read while   */
  /* utf16_len is non-zero, so nothing past the input is accessed.     */
  /* An empty UTF16 string falls straight through and returns 0.       */
  /*-------------------------------------------------------------------*/
  while (utf16_len)
  {
    word1 = *utf16++;
    --utf16_len;

    /*-----------------------------------------------------------------*/
    /* Surrogates (0xD800 to 0xDFFF) come in pairs that represent      */
    /* values >= 0x10000, which need the 4-byte UTF8 encoding.         */
    /*-----------------------------------------------------------------*/
    if (word1 >= 0xD800 && word1 <= 0xDFFF)
    {
      /*---------------------------------------------------------------*/
      /* Check there is enough room left in UTF8 string (subtraction   */
      /* cannot wrap because utf8_i is always below max_utf8_len).     */
      /*---------------------------------------------------------------*/
      if (max_utf8_len - utf8_i <= 4)
      {
        set_errno(ENOSPC);
        return (ui32)-1;
      }
      utf8_i += 4;

      /*---------------------------------------------------------------*/
      /* A pair must start with a high surrogate (0xD800 to 0xDBFF)    */
      /* and a second word must still be available.                    */
      /*---------------------------------------------------------------*/
      if (word1 > 0xDBFF || utf16_len == 0)
      {
        set_errno(EINVAL);
        return (ui32)-1;
      }

      /*---------------------------------------------------------------*/
      /* Read the second word; it must be a low surrogate (0xDC00 to   */
      /* 0xDFFF).                                                      */
      /*---------------------------------------------------------------*/
      word2 = *utf16++;
      --utf16_len;
      if (word2 < 0xDC00 || word2 > 0xDFFF)
      {
        set_errno(EINVAL);
        return (ui32)-1;
      }

      /*---------------------------------------------------------------*/
      /* Combine the two UTF16 words to get 4 UTF8 bytes. u holds the  */
      /* top 5 bits of the code point (stored minus one in word1).     */
      /*---------------------------------------------------------------*/
      u = ((ui32)(word1 & 0x3C0) >> 6) + 1;
      *utf8++ = (ui8)(0xF0 | ((u & 0x1C) >> 2));
      *utf8++ = (ui8)(0x80 | ((u & 0x3) << 4) | ((word1 & 0x3C) >> 2));
      *utf8++ = (ui8)(0x80 | ((word1 & 0x3) << 4) |
                      ((word2 & 0x3C0) >> 6));
      *utf8++ = (ui8)(0x80 | (word2 & 0x3F));
    }

    /*-----------------------------------------------------------------*/
    /* UTF16 codes up to 0x7F are 1-byte UTF8 codes.                   */
    /*-----------------------------------------------------------------*/
    else if (word1 <= 0x7F)
    {
      /*---------------------------------------------------------------*/
      /* Check there is enough room left in UTF8 string.               */
      /*---------------------------------------------------------------*/
      if (max_utf8_len - utf8_i <= 1)
      {
        set_errno(ENOSPC);
        return (ui32)-1;
      }
      utf8_i += 1;

      /*---------------------------------------------------------------*/
      /* Direct conversion into UTF8 code.                             */
      /*---------------------------------------------------------------*/
      *utf8++ = (ui8)word1;
    }

    /*-----------------------------------------------------------------*/
    /* UTF16 codes up to 0x7FF are 2-byte UTF8 codes.                  */
    /*-----------------------------------------------------------------*/
    else if (word1 <= 0x7FF)
    {
      /*---------------------------------------------------------------*/
      /* Check there is enough room left in UTF8 string.               */
      /*---------------------------------------------------------------*/
      if (max_utf8_len - utf8_i <= 2)
      {
        set_errno(ENOSPC);
        return (ui32)-1;
      }
      utf8_i += 2;

      /*---------------------------------------------------------------*/
      /* Form the 2-byte UTF8 encoding.                                */
      /*---------------------------------------------------------------*/
      *utf8++ = (ui8)(0xC0 | ((word1 & 0x7C0) >> 6));
      *utf8++ = (ui8)(0x80 | (word1 & 0x3F));
    }

    /*-----------------------------------------------------------------*/
    /* Everything else is a 3-byte UTF8 code.                          */
    /*-----------------------------------------------------------------*/
    else
    {
      /*---------------------------------------------------------------*/
      /* Check there is enough room left in UTF8 string.               */
      /*---------------------------------------------------------------*/
      if (max_utf8_len - utf8_i <= 3)
      {
        set_errno(ENOSPC);
        return (ui32)-1;
      }
      utf8_i += 3;

      /*---------------------------------------------------------------*/
      /* Form the 3-byte UTF8 encoding.                                */
      /*---------------------------------------------------------------*/
      *utf8++ = (ui8)(0xE0 | ((word1 & 0xF000) >> 12));
      *utf8++ = (ui8)(0x80 | ((word1 & 0xFC0) >> 6));
      *utf8++ = (ui8)(0x80 | (word1 & 0x3F));
    }
  }

  return utf8_i;
}
#endif /* UTF_ENABLED */
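/*
 * Code-viewer keyboard shortcuts (web page residue, not part of utf.c):
 *   Copy code:            Ctrl + C
 *   Search code:          Ctrl + F
 *   Full-screen mode:     F11
 *   Toggle theme:         Ctrl + Shift + D
 *   Show shortcuts:       ?
 *   Increase font size:   Ctrl + =
 *   Decrease font size:   Ctrl + -
 */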