utf82u.c

来自「utf-8和unicode的互转的c代码」· C语言代码 · 共 145 行

145 行

#include <stdio.h>

/*
 * Upper bound (exclusive) on the index scanned while reading the digits of
 * a numeric character reference.  With the digits starting at index 3 (hex)
 * or 2 (decimal) this allows at most 5 hex digits or 6 decimal digits.
 */
enum { UTF82U_ENTITY_SCAN_LIMIT = 8 };

/*
 * Return non-zero when `b` has the bit pattern 10xxxxxx, i.e. it is a
 * UTF-8 trail (continuation) byte.
 */
static int
utf82u_is_trail(unsigned char b)
{
  return (b & 0xC0) == 0x80;
}

/*
 * Try to decode an HTML numeric character reference at `s`, in decimal
 * form (e.g. "&#197;") or hexadecimal form (e.g. "&#x6C34;").
 *
 * The caller guarantees s[0] == '&'.  Bytes are only read up to and
 * including the first one that does not fit the expected form, so a
 * NUL-terminated string is never over-read.
 *
 * Returns the number of bytes consumed (including the trailing ';') and
 * stores the value in *chPtr on success.  Returns 0 and leaves *chPtr
 * untouched when the text is not a complete, non-empty reference.
 */
static int
utf82u_entity(const unsigned char *s, int *chPtr)
{
  int is_hex;
  int first;
  int value = 0;
  int i;

  if (s[1] != '#')
    return 0;

  is_hex = (s[2] == 'x' || s[2] == 'X');
  first = is_hex ? 3 : 2;

  for (i = first; i < UTF82U_ENTITY_SCAN_LIMIT; i++)
    {
      int c = s[i];
      int digit;

      if (c >= '0' && c <= '9')
        digit = c - '0';
      else if (is_hex && c >= 'A' && c <= 'F')
        digit = c - 'A' + 10;
      else if (is_hex && c >= 'a' && c <= 'f')
        digit = c - 'a' + 10;
      else
        break;

      value = value * (is_hex ? 16 : 10) + digit;
    }

  /*
   * Reject "&#;" / "&#x;" (no digits), references whose digits run up to
   * the scan limit, and anything not terminated by ';'.  The limit check
   * comes first so s[i] is only read when the loop stopped on that byte.
   */
  if (i == first || i == UTF82U_ENTITY_SCAN_LIMIT || s[i] != ';')
    return 0;

  *chPtr = value;
  return i + 1;
}

/*
 * Decode the next character of a NUL-terminated string into a code point.
 *
 *   str   - pointer to the next character to decode
 *   chPtr - receives the decoded code point
 *
 * Returns the number of bytes consumed from `str` (always at least 1).
 *
 * Recognised input, in order:
 *   - an HTML numeric character reference ("&#197;" or "&#x6C34;");
 *   - a 1- to 4-byte UTF-8 sequence.
 *
 * Anything else is passed through one byte at a time: a '&' that does not
 * start a valid numeric reference, \0, a naked trail byte (0x80-0xBF), or
 * a lead byte not followed by the required trail bytes is stored in *chPtr
 * as its own byte value with a return of 1.  Overlong encodings and
 * surrogate values are not rejected.
 */
int
utf82u(char *str, int *chPtr)
{
  /* Work on unsigned bytes so values >= 0x80 compare as 0x80..0xFF. */
  const unsigned char *s = (const unsigned char *) str;
  int lead = s[0];

  if (lead == '&')
    {
      int consumed = utf82u_entity(s, chPtr);

      if (consumed > 0)
        return consumed;
      /* Not a numeric reference: the '&' represents itself below. */
    }

  if (lead < 0xC0)
    {
      /* ASCII, \0, or a naked trail byte: represents itself. */
      *chPtr = lead;
      return 1;
    }

  /*
   * The trail-byte checks short-circuit, so no byte after a terminating
   * NUL is ever read (NUL is not a trail byte).
   */
  if (lead < 0xE0)
    {
      if (utf82u_is_trail(s[1]))
        {
          *chPtr = ((lead & 0x1F) << 6) | (s[1] & 0x3F);
          return 2;
        }
    }
  else if (lead < 0xF0)
    {
      if (utf82u_is_trail(s[1]) && utf82u_is_trail(s[2]))
        {
          *chPtr = ((lead & 0x0F) << 12)
                   | ((s[1] & 0x3F) << 6)
                   | (s[2] & 0x3F);
          return 3;
        }
    }
  else if (lead < 0xF8)
    {
      if (utf82u_is_trail(s[1]) && utf82u_is_trail(s[2])
          && utf82u_is_trail(s[3]))
        {
          *chPtr = ((lead & 0x07) << 18)
                   | ((s[1] & 0x3F) << 12)
                   | ((s[2] & 0x3F) << 6)
                   | (s[3] & 0x3F);
          return 4;
        }
    }

  /* Lead byte without its trail bytes (or 0xF8-0xFF): represents itself. */
  *chPtr = lead;
  return 1;
}



/*
 * Demo driver: walks a fixed byte string (two three-byte UTF-8 sequences
 * followed by a NUL terminator) and, for each call to utf82u(), prints the
 * number of bytes consumed and the decoded value in hexadecimal.
 */
int main()
{
	char str[] = {0xe8, 0x87, 0xba, 0xe7, 0x81, 0xa3, 0x00};
	char *cursor = str;

	/* Stop at the NUL terminator; utf82u() reports how far to advance. */
	while (*cursor != '\0') {
		int uni;
		int len = utf82u(cursor, &uni);

		cursor += len;
		printf("%d --0x%x\n", len, uni);
	}

	return 0;
}

utf82u.c - 源码说明

本页面展示了「utf-8和unicode的互转的c代码」中的 utf82u.c 源码文件，该文件采用 C 语言编写，共 145 行代码。您可以在线阅读完整代码内容，也可以返回资源详情页下载完整源码包，用于本地学习和开发。

虫虫下载站收录了大量与unicode相关的技术资源，包括源代码、技术文档、电路图等，是电子工程师和嵌入式开发者的专业学习平台。

⌨️ 快捷键说明

复制代码：Ctrl + C

搜索代码：Ctrl + F

全屏模式：F11

增大字号：Ctrl + =

减小字号：Ctrl + -

显示快捷键：?