📄 utf.c
字号:
/***********************************************************************/
/* */
/* Module: utf.c */
/* Release: 2004.5 */
/* Version: 2004.0 */
/* Purpose: Unicode encoding related functions */
/* */
/*---------------------------------------------------------------------*/
/* */
/* Copyright 2004, Blunk Microsystems */
/* ALL RIGHTS RESERVED */
/* */
/* Licensees have the non-exclusive right to use, modify, or extract */
/* this computer program for software development at a single site. */
/* This program may be resold or disseminated in executable format */
/* only. The source code may not be redistributed or resold. */
/* */
/***********************************************************************/
#include "../posix.h"
#include "../include/fsprivate.h"
#include "../include/libc/errno.h"
#if UTF_ENABLED
/***********************************************************************/
/* Variable Declarations */
/***********************************************************************/
/*
 * Encoded length, in bytes, of a UTF8 code, indexed by the high nibble
 * of its first byte:
 *   0x0 - 0x7 (0xxxxxxx) -> 1 byte
 *   0x8 - 0xB (10xxxxxx) -> 0, a continuation byte cannot start a code
 *   0xC - 0xD (110xxxxx) -> 2 bytes
 *   0xE       (1110xxxx) -> 3 bytes
 *   0xF       (1111xxxx) -> 4 bytes
 * Only the high nibble is examined, so first bytes 0xF8 - 0xFF also map
 * to 4; users of the table must range-check the first byte themselves.
 * The table is never modified, hence const.
 */
static const ui8 LengthLookup[] =
{
  1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 2, 2, 3, 4
};
/***********************************************************************/
/* Global Function Definitions */
/***********************************************************************/
/***********************************************************************/
/*   ValidUTF8: Check if a string is properly encoded in UTF8          */
/*                                                                     */
/*      Inputs: utf8 = string to check                                 */
/*              utf8_len = length (in bytes) of UTF8 encoded string    */
/*                                                                     */
/*     Returns: TRUE if string UTF8 encoded, FALSE otherwise. A NULL   */
/*              string is not valid; an empty string is valid.         */
/*                                                                     */
/*       Notes: Rejects stray continuation bytes, codes truncated by   */
/*              utf8_len, overlong encodings, surrogate values         */
/*              0xD800 - 0xDFFF and values above 0x10FFFF. Exactly     */
/*              utf8_len bytes are read, never more.                   */
/*                                                                     */
/***********************************************************************/
int ValidUTF8(const ui8 *utf8, ui32 utf8_len)
{
  ui32 code_len;
  ui8 byte1, byte2, byte3, byte4;

  /*-------------------------------------------------------------------*/
  /* Check the string is valid.                                        */
  /*-------------------------------------------------------------------*/
  if (utf8 == NULL)
    return FALSE;

  /*-------------------------------------------------------------------*/
  /* Look at the UTF8 encoded string a code at a time. The first byte  */
  /* of each code is fetched inside the loop, so nothing is read once  */
  /* all utf8_len bytes have been consumed. An empty string skips the  */
  /* loop and is reported as correctly encoded.                        */
  /*-------------------------------------------------------------------*/
  while (utf8_len)
  {
    byte1 = *utf8++;

    /*-----------------------------------------------------------------*/
    /* Determine how many bytes the next UTF8 encoded code has.        */
    /*-----------------------------------------------------------------*/
    code_len = LengthLookup[(byte1 & 0xF0) >> 4];

    /*-----------------------------------------------------------------*/
    /* If the byte does not start with 0xxx or 110x or 1110 or 1111    */
    /* it's not properly formatted.                                    */
    /*-----------------------------------------------------------------*/
    if (code_len == 0)
      return FALSE;

    /*-----------------------------------------------------------------*/
    /* If the code is encoded in more bytes than what is left, error.  */
    /*-----------------------------------------------------------------*/
    if (code_len > utf8_len)
      return FALSE;
    utf8_len -= code_len;

    /*-----------------------------------------------------------------*/
    /* Check code based on the number of bytes it is encoded in.       */
    /*-----------------------------------------------------------------*/
    switch (code_len)
    {
      /*---------------------------------------------------------------*/
      /* 4-byte encoding.                                              */
      /*---------------------------------------------------------------*/
      case 4:
      {
        /*-------------------------------------------------------------*/
        /* Get the next 3 bytes and advance string pointer.            */
        /*-------------------------------------------------------------*/
        byte2 = *utf8++;
        byte3 = *utf8++;
        byte4 = *utf8++;

        /*-------------------------------------------------------------*/
        /* Maximum Unicode value is 0x10FFFF, encoded F4 8F BF BF. A   */
        /* first byte above 0xF4, or 0xF4 followed by a second byte    */
        /* above 0x8F, encodes a larger value.                         */
        /*-------------------------------------------------------------*/
        if (byte1 > 0xF4)
          return FALSE;
        if (byte1 == 0xF4 && byte2 > 0x8F)
          return FALSE;

        /*-------------------------------------------------------------*/
        /* Minimum 4-byte encoding is 0x10000. Check if code conforms. */
        /*-------------------------------------------------------------*/
        if (byte1 == 0xF0 && byte2 < 0x90)
          return FALSE;

        /*-------------------------------------------------------------*/
        /* Check 2nd, 3rd and 4th bytes are in valid range.            */
        /*-------------------------------------------------------------*/
        if (byte2 < 0x80 || byte2 > 0xBF ||
            byte3 < 0x80 || byte3 > 0xBF ||
            byte4 < 0x80 || byte4 > 0xBF)
          return FALSE;
        break;
      }

      /*---------------------------------------------------------------*/
      /* 3-byte encoding.                                              */
      /*---------------------------------------------------------------*/
      case 3:
      {
        /*-------------------------------------------------------------*/
        /* Get the next 2 bytes and advance string pointer.            */
        /*-------------------------------------------------------------*/
        byte2 = *utf8++;
        byte3 = *utf8++;

        /*-------------------------------------------------------------*/
        /* If code represents values between 0xD800 and 0xDFFF it's an */
        /* illegal code. These are surrogate values, not valid stand   */
        /* alone codes.                                                */
        /*-------------------------------------------------------------*/
        if (byte1 == 0xED && byte2 >= 0xA0 && byte2 <= 0xBF)
          return FALSE;

        /*-------------------------------------------------------------*/
        /* Minimum 3-byte encoding is 0x800. Check if code conforms.   */
        /*-------------------------------------------------------------*/
        if (byte1 == 0xE0 && byte2 < 0xA0)
          return FALSE;

        /*-------------------------------------------------------------*/
        /* Check 2nd and 3rd bytes are in valid range.                 */
        /*-------------------------------------------------------------*/
        if (byte2 < 0x80 || byte2 > 0xBF || byte3 < 0x80 || byte3 > 0xBF)
          return FALSE;
        break;
      }

      /*---------------------------------------------------------------*/
      /* 2-byte encoding.                                              */
      /*---------------------------------------------------------------*/
      case 2:
      {
        /*-------------------------------------------------------------*/
        /* Get the next byte and advance string pointer.               */
        /*-------------------------------------------------------------*/
        byte2 = *utf8++;

        /*-------------------------------------------------------------*/
        /* Minimum 2-byte encoding is 0x80. First bytes 0xC0 and 0xC1  */
        /* can only encode smaller values, so they are rejected.       */
        /*-------------------------------------------------------------*/
        if (byte1 < 0xC2)
          return FALSE;

        /*-------------------------------------------------------------*/
        /* Check 2nd byte is in valid range.                           */
        /*-------------------------------------------------------------*/
        if (byte2 < 0x80 || byte2 > 0xBF)
          return FALSE;
        break;
      }

      /*---------------------------------------------------------------*/
      /* 1-byte encoding. Nothing to do.                               */
      /*---------------------------------------------------------------*/
      case 1:
        break;

      /*---------------------------------------------------------------*/
      /* Code cannot have length other than in 1 .. 4.                 */
      /*---------------------------------------------------------------*/
      default:
        PfAssert(FALSE); /*lint !e506, !e774*/
        return FALSE;
    }
  }

  return TRUE;
}
/***********************************************************************/
/* UTF8_UTF16: Convert a UTF8 encoded string to a UTF16 encoded one. */
/* The UTF8 string must be properly encoded. */
/* */
/* Inputs: utf8 = original UTF8 encoded string */
/* utf8_len = length of UTF8 encoded string */
/* utf16 = place to store the conversion */
/* max_utf16_len = length (in bytes) of utf16 string */
/* */
/*     Returns: Length of UTF16 string, (ui32)-1 on error (sets errno) */
/* */
/***********************************************************************/
ui32 UTF8_UTF16(const ui8 *utf8, ui32 utf8_len, ui16 *utf16,
ui32 max_utf16_len)
{
ui32 utf16_i = 0, code_len, x2;
ui8 byte1, byte2, byte3, byte4, u, x1;
/*-------------------------------------------------------------------*/
/* Check the arguments are valid. */
/*-------------------------------------------------------------------*/
if (utf8 == NULL || utf16 == NULL || max_utf16_len == 0)
{
set_errno(EINVAL);
return (ui32)-1;
}
/*-------------------------------------------------------------------*/
/* An empty UTF8 string converts to an empty UTF16 string. */
/*-------------------------------------------------------------------*/
if (utf8_len == 0)
return 0;
/*-------------------------------------------------------------------*/
/* Go through UTF8 string code by code, converting each one. */
/*-------------------------------------------------------------------*/
for (byte1 = *utf8++; utf8_len; byte1 = *utf8++, ++utf16_i, ++utf16)
{
/*-----------------------------------------------------------------*/
/* If we've reached the maximum UTF16 length, out of space. */
/*-----------------------------------------------------------------*/
if (utf16_i >= max_utf16_len)
{
set_errno(ENOSPC);
return (ui32)-1;
}
/*-----------------------------------------------------------------*/
/* Lookup the number of bytes for current code encoding. */
/*-----------------------------------------------------------------*/
code_len = LengthLookup[(byte1 & 0xF0) >> 4];
if (code_len == 0)
{
set_errno(EINVAL);
return (ui32)-1;
}
if (code_len > utf8_len)
{
set_errno(EINVAL);
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -