📄 utf82u.c
字号:
#include <stdio.h>
intutf82u(char *str, int * chPtr)/* str is the UTF8 next character pointer *//* chPtr is the int for the result */{ int byte; char *p; /* HTML4.0 entities in decimal form, e.g. Å */ /* or in hexadecimal form, e.g. 水 */ byte = *((unsigned char *) str); if (byte == '&') { int i, n = 0; byte = *((unsigned char *) (str + 1)); if (byte == '#') { byte = *((unsigned char *) (str + 2)); if (byte == 'x' || byte == 'X') { for (i = 3; i < 8; i++) { byte = *((unsigned char *) (str + i)); if (byte >= 'A' && byte <= 'F') byte = byte - 'A' + 10; else if (byte >= 'a' && byte <= 'f') byte = byte - 'a' + 10; else if (byte >= '0' && byte <= '9') byte = byte - '0'; else break; n = (n * 16) + byte; } } else { for (i = 2; i < 8; i++) { byte = *((unsigned char *) (str + i)); if (byte >= '0' && byte <= '9') n = (n * 10) + (byte - '0'); else break; } } if (byte == ';') { *chPtr = (int) n; return ++i; } } else { /*fix me*/
*chPtr = 0;
return 1; } } /* * Unroll 1 to 3 byte UTF-8 sequences, use loop to handle longer ones. */ byte = *((unsigned char *) str); if (byte < 0xC0) { /* * Handles properly formed UTF-8 characters between * 0x01 and 0x7F. Also treats \0 and naked trail * bytes 0x80 to 0xBF as valid characters representing * themselves. */ *chPtr = (int) byte; return 1; } else if (byte < 0xE0) { if ((str[1] & 0xC0) == 0x80) { /* * Two-byte-character lead-byte followed * by a trail-byte. */ *chPtr = (int) (((byte & 0x1F) << 6) | (str[1] & 0x3F)); return 2; } /* * A two-byte-character lead-byte not followed by trail-byte * represents itself. */ *chPtr = (int) byte; return 1; } else if (byte < 0xF0) { if (((str[1] & 0xC0) == 0x80) && ((str[2] & 0xC0) == 0x80)) { /* * Three-byte-character lead byte followed by * two trail bytes. */ *chPtr = (int) (((byte & 0x0F) << 12) | ((str[1] & 0x3F) << 6) | (str[2] & 0x3F)); return 3; } /* * A three-byte-character lead-byte not followed by * two trail-bytes represents itself. */ *chPtr = (int) byte; return 1; } *chPtr = (int) byte; return 1;}
int main()
{
char str[] = {0xe8, 0x87, 0xba, 0xe7, 0x81, 0xa3, 0x00}; char *next = str;
int uni;
int len; int i;
for (i=0; *next; i++) { len = utf82u(next, &uni); next += len;
printf("%d --0x%x\n", len, uni);
}}
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -