📄 utf.c

📁 ATMEL单片机可用的文件系统源代码
💻 C
📖 第 1 页 / 共 2 页
字号:
12 下一页
/***********************************************************************/
/*                                                                     */
/*   Module:  utf.c                                                    */
/*   Release: 2004.5                                                   */
/*   Version: 2004.0                                                   */
/*   Purpose: Unicode encoding related functions                       */
/*                                                                     */
/*---------------------------------------------------------------------*/
/*                                                                     */
/*               Copyright 2004, Blunk Microsystems                    */
/*                      ALL RIGHTS RESERVED                            */
/*                                                                     */
/*   Licensees have the non-exclusive right to use, modify, or extract */
/*   this computer program for software development at a single site.  */
/*   This program may be resold or disseminated in executable format   */
/*   only. The source code may not be redistributed or resold.         */
/*                                                                     */
/***********************************************************************/
#include "../posix.h"
#include "../include/fsprivate.h"
#include "../include/libc/errno.h"

#if UTF_ENABLED
/***********************************************************************/
/* Variable Declarations                                               */
/***********************************************************************/
/*-----------------------------------------------------------------------*/
/* Number of bytes in a UTF8 encoded code, indexed by the upper four     */
/* bits of the code's first byte. A value of 0 marks a byte that cannot  */
/* start a code. The table is only ever read, so it is declared const.   */
/*-----------------------------------------------------------------------*/
static const ui8 LengthLookup[16] =
{
  1, 1, 1, 1, 1, 1, 1, 1, /* 0xxx: 1-byte code                           */
  0, 0, 0, 0,             /* 10xx: continuation byte, not a valid start  */
  2, 2,                   /* 110x: 2-byte code                           */
  3,                      /* 1110: 3-byte code                           */
  4                       /* 1111: 4-byte code (callers range check it)  */
};

/***********************************************************************/
/* Global Function Definitions                                         */
/***********************************************************************/

/***********************************************************************/
/*   ValidUTF8: Check if a string is properly encoded in UTF8          */
/*                                                                     */
/*      Inputs: utf8 = string to check                                 */
/*              utf8_len = length of UTF8 encoded string               */
/*                                                                     */
/*     Returns: TRUE if string UTF8 encoded, FALSE otherwise           */
/*                                                                     */
/***********************************************************************/
int ValidUTF8(const ui8 *utf8, ui32 utf8_len)
{
  /*-------------------------------------------------------------------*/
  /* Rejects: NULL input, stray continuation bytes, truncated codes,   */
  /* overlong encodings, surrogates (0xD800 - 0xDFFF) and values above */
  /* 0x10FFFF. utf8_len is a byte count; no terminator is required.    */
  /*-------------------------------------------------------------------*/
  ui32 code_len;
  ui8 byte1, byte2, byte3, byte4;

  /*-------------------------------------------------------------------*/
  /* Check the string is valid.                                        */
  /*-------------------------------------------------------------------*/
  if (utf8 == NULL)
    return FALSE;

  /*-------------------------------------------------------------------*/
  /* Look at the UTF8 encoded string a code at a time. The lead byte   */
  /* is read inside the loop, only while input remains, so no byte     */
  /* past the end of the buffer is touched. An empty string (length 0) */
  /* skips the loop and is reported as correctly encoded.              */
  /*-------------------------------------------------------------------*/
  while (utf8_len)
  {
    /*-----------------------------------------------------------------*/
    /* Fetch the lead byte of the next code.                           */
    /*-----------------------------------------------------------------*/
    byte1 = *utf8++;

    /*-----------------------------------------------------------------*/
    /* Determine how many bytes the next UTF8 encoded code has.        */
    /*-----------------------------------------------------------------*/
    code_len = LengthLookup[(byte1 & 0xF0) >> 4];

    /*-----------------------------------------------------------------*/
    /* If the byte does not start with 0xxx or 110x or 1110 or 1111    */
    /* it's not properly formatted.                                    */
    /*-----------------------------------------------------------------*/
    if (code_len == 0)
      return FALSE;

    /*-----------------------------------------------------------------*/
    /* If the code is encoded in more bytes than what is left, error.  */
    /* This check also guarantees the trailing bytes read below lie    */
    /* inside the buffer.                                              */
    /*-----------------------------------------------------------------*/
    if (code_len > utf8_len)
      return FALSE;
    utf8_len -= code_len;

    /*-----------------------------------------------------------------*/
    /* Check code based on the number of bytes it is encoded in.       */
    /*-----------------------------------------------------------------*/
    switch (code_len)
    {
      /*---------------------------------------------------------------*/
      /* 4-byte encoding.                                              */
      /*---------------------------------------------------------------*/
      case 4:
      {
        /*-------------------------------------------------------------*/
        /* Get the next 3 bytes and advance string pointer.            */
        /*-------------------------------------------------------------*/
        byte2 = *utf8++;
        byte3 = *utf8++;
        byte4 = *utf8++;

        /*-------------------------------------------------------------*/
        /* Maximum Unicode value is 0x10FFFF, so the lead byte cannot  */
        /* exceed 0xF4, and after 0xF4 the 2nd byte cannot exceed 0x8F */
        /* (0xF4 0x90 0x80 0x80 would encode 0x110000).                */
        /*-------------------------------------------------------------*/
        if (byte1 > 0xF4 || (byte1 == 0xF4 && byte2 > 0x8F))
          return FALSE;

        /*-------------------------------------------------------------*/
        /* Minimum 4-byte encoding is 0x10000. Check if code conforms. */
        /*-------------------------------------------------------------*/
        if (byte1 == 0xF0 && byte2 < 0x90)
          return FALSE;

        /*-------------------------------------------------------------*/
        /* Check 2nd, 3rd and 4th bytes are in valid range.            */
        /*-------------------------------------------------------------*/
        if (byte2 < 0x80 || byte2 > 0xBF ||
            byte3 < 0x80 || byte3 > 0xBF ||
            byte4 < 0x80 || byte4 > 0xBF)
          return FALSE;

        break;
      }

      /*---------------------------------------------------------------*/
      /* 3-byte encoding.                                              */
      /*---------------------------------------------------------------*/
      case 3:
      {
        /*-------------------------------------------------------------*/
        /* Get the next 2 bytes and advance string pointer.            */
        /*-------------------------------------------------------------*/
        byte2 = *utf8++;
        byte3 = *utf8++;

        /*-------------------------------------------------------------*/
        /* If code represents values between 0xD800 and 0xDFFF it's an */
        /* illegal code. These are surrogate values, not valid stand   */
        /* alone codes.                                                */
        /*-------------------------------------------------------------*/
        if (byte1 == 0xED && byte2 >= 0xA0 && byte2 <= 0xBF)
          return FALSE;

        /*-------------------------------------------------------------*/
        /* Minimum 3-byte encoding is 0x800. Check if code conforms.   */
        /*-------------------------------------------------------------*/
        if (byte1 == 0xE0 && byte2 < 0xA0)
          return FALSE;

        /*-------------------------------------------------------------*/
        /* Check 2nd and 3rd bytes are in valid range.                 */
        /*-------------------------------------------------------------*/
        if (byte2 < 0x80 || byte2 > 0xBF || byte3 < 0x80 || byte3 > 0xBF)
          return FALSE;

        break;
      }

      /*---------------------------------------------------------------*/
      /* 2-byte encoding.                                              */
      /*---------------------------------------------------------------*/
      case 2:
      {
        /*-------------------------------------------------------------*/
        /* Get the next byte and advance string pointer.               */
        /*-------------------------------------------------------------*/
        byte2 = *utf8++;

        /*-------------------------------------------------------------*/
        /* Minimum 2-byte encoding is 0x80. Lead bytes 0xC0 and 0xC1   */
        /* could only encode values below that, so they are rejected.  */
        /*-------------------------------------------------------------*/
        if (byte1 < 0xC2)
          return FALSE;

        /*-------------------------------------------------------------*/
        /* Check 2nd byte is in valid range.                           */
        /*-------------------------------------------------------------*/
        if (byte2 < 0x80 || byte2 > 0xBF)
          return FALSE;

        break;
      }

      /*---------------------------------------------------------------*/
      /* 1-byte encoding. Nothing to do.                               */
      /*---------------------------------------------------------------*/
      case 1:
        break;

      /*---------------------------------------------------------------*/
      /* Code cannot have length other than in 1 .. 4.                 */
      /*---------------------------------------------------------------*/
      default:
        PfAssert(FALSE); /*lint !e506, !e774*/
        return FALSE;
    }
  }
  return TRUE;
}

/***********************************************************************/
/*  UTF8_UTF16: Convert a UTF8 encoded string to a UTF16 encoded one.  */
/*              The UTF8 string must be properly encoded.              */
/*                                                                     */
/*      Inputs: utf8 = original UTF8 encoded string                    */
/*              utf8_len = length of UTF8 encoded string               */
/*              utf16 = place to store the conversion                  */
/*              max_utf16_len = length (in bytes) of utf16 string      */
/*                                                                     */
/*     Returns: Length of UTF16 string, -1 on error                    */
/*                                                                     */
/***********************************************************************/
ui32 UTF8_UTF16(const ui8 *utf8, ui32 utf8_len, ui16 *utf16,
                ui32 max_utf16_len)
{
  ui32 utf16_i = 0, code_len, x2;
  ui8 byte1, byte2, byte3, byte4, u, x1;

  /*-------------------------------------------------------------------*/
  /* Check the arguments are valid.                                    */
  /*-------------------------------------------------------------------*/
  if (utf8 == NULL || utf16 == NULL || max_utf16_len == 0)
  {
    set_errno(EINVAL);
    return (ui32)-1;
  }

  /*-------------------------------------------------------------------*/
  /* An empty UTF8 string converts to an empty UTF16 string.           */
  /*-------------------------------------------------------------------*/
  if (utf8_len == 0)
    return 0;

  /*-------------------------------------------------------------------*/
  /* Go through UTF8 string code by code, converting each one.         */
  /*-------------------------------------------------------------------*/
  for (byte1 = *utf8++; utf8_len; byte1 = *utf8++, ++utf16_i, ++utf16)
  {
    /*-----------------------------------------------------------------*/
    /* If we've reached the maximum UTF16 length, out of space.        */
    /*-----------------------------------------------------------------*/
    if (utf16_i >= max_utf16_len)
    {
      set_errno(ENOSPC);
      return (ui32)-1;
    }

    /*-----------------------------------------------------------------*/
    /* Lookup the number of bytes for current code encoding.           */
    /*-----------------------------------------------------------------*/
    code_len = LengthLookup[(byte1 & 0xF0) >> 4];
    if (code_len == 0)
    {
      set_errno(EINVAL);
      return (ui32)-1;
    }
    if (code_len > utf8_len)
    {
      set_errno(EINVAL);
12 下一页
⌨️ 快捷键说明

复制代码 Ctrl + C
搜索代码 Ctrl + F
全屏模式 F11
切换主题 Ctrl + Shift + D
显示快捷键 ?
增大字号 Ctrl + =
减小字号 Ctrl + -