📄 utf8.c
字号:
default: if (((ku = (c & BITS7) - p1->base_ku) >= p1->max_ku) || ((ten = (c1 & BITS7) - p1->base_ten) >= p1->max_ten)) return U8G_NOTUTF8; ret = ((unsigned short *) p1->tab)[(ku*p1->max_ten) + ten]; /* special hack for JIS X 0212: merge rows less than 10 */ if ((ret == UBOGON) && ku && (ku < 10) && p3->tab && p3->base_ten) ret = ((unsigned short *) p3->tab) [((ku - (p3->base_ku - p1->base_ku))*p3->max_ten) + ten]; break; } } else ret = c; /* ASCII character */ break; case CT_DBYTE: /* 2 byte ASCII + utf8_eucparam */ if (c & BIT8) { /* double-byte character? */ p1 = (struct utf8_eucparam *) cs->tab; if (j--) c1 = *t++; /* get second octet */ else return U8G_ENDSTRI; if (((ku = c - p1->base_ku) < p1->max_ku) && ((ten = c1 - p1->base_ten) < p1->max_ten)) ret = ((unsigned short *) p1->tab)[(ku*p1->max_ten) + ten]; else return U8G_NOTUTF8; } else ret = c; /* ASCII character */ break; case CT_DBYTE2: /* 2 byte ASCII + utf8_eucparam plane1/2 */ if (c & BIT8) { /* double-byte character? */ p1 = (struct utf8_eucparam *) cs->tab; p2 = p1 + 1; if (j--) c1 = *t++; /* get second octet */ else return U8G_ENDSTRI; if (c1 & BIT8) { /* high vs. low plane */ if ((ku = c - p2->base_ku) < p2->max_ku && ((ten = c1 - p2->base_ten) < p2->max_ten)) ret = ((unsigned short *) p1->tab) [(ku*(p1->max_ten + p2->max_ten)) + p1->max_ten + ten]; else return U8G_NOTUTF8; } else if ((ku = c - p1->base_ku) < p1->max_ku && ((ten = c1 - p1->base_ten) < p1->max_ten)) ret = ((unsigned short *) p1->tab) [(ku*(p1->max_ten + p2->max_ten)) + ten]; else return U8G_NOTUTF8; } else ret = c; /* ASCII character */ break; case CT_SJIS: /* 2 byte Shift-JIS encoded JIS no table */ /* compromise - do yen sign but not overline */ if (!(c & BIT8)) ret = (c == JISROMAN_YEN) ? UCS2_YEN : c; /* half-width katakana? */ else if ((c >= MIN_KANA_8) && (c < MAX_KANA_8)) ret = c + KANA_8; else { /* Shift-JIS */ if (j--) c1 = *t++; /* get second octet */ else return U8G_ENDSTRI; SJISTOJIS (c,c1); c = JISTOUNICODE (c,c1,ku,ten); } break; case CT_UCS2: /* 2 byte 16-bit Unicode no table */ ret = c << 8; if (j--) c = *t++; /* get second octet */ else return U8G_ENDSTRI; /* empty string */ ret |= c; break; case CT_UCS4: /* 4 byte 32-bit Unicode no table */ if (c & 0x80) return U8G_NOTUTF8; if (j < 3) return U8G_ENDSTRI; j -= 3; /* count three octets */ ret = c << 24; ret |= (*t++) << 16; ret |= (*t++) << 8; ret |= (*t++); break; case CT_UTF16: /* variable UTF-16 encoded Unicode no table */ ret = c << 8; if (j--) c = *t++; /* get second octet */ else return U8G_ENDSTRI; /* empty string */ ret |= c; /* surrogate? */ if ((ret >= UTF16_SURR) && (ret <= UTF16_MAXSURR)) { /* invalid first surrogate */ if ((ret > UTF16_SURRHEND) || (j < 2)) return U8G_NOTUTF8; j -= 2; /* count two octets */ d = (*t++) << 8; /* first octet of second surrogate */ d |= *t++; /* second octet of second surrogate */ if ((d < UTF16_SURRL) || (d > UTF16_SURRLEND)) return U8G_NOTUTF8; ret = UTF16_BASE + ((ret & UTF16_MASK) << UTF16_SHIFT) + (d & UTF16_MASK); } break; default: /* unknown/unsupported character set type */ return U8G_NOTUTF8; } *s = t; /* update pointer and counter */ *i = j; return ret;}/* Produce charset validity map for BMP * Accepts: list of charsets to map * Returns: validity map, indexed by BMP codepoint * * Bit 0x1 is the "not-CJK" character bit */unsigned long *utf8_csvalidmap (char *charsets[]){ unsigned short u,*tab; unsigned int m,ku,ten; unsigned long i,csi,csb; struct utf8_eucparam *param,*p2; char *s; const CHARSET *cs; unsigned long *ret = (unsigned long *) fs_get (i = 0x10000 * sizeof (unsigned long)); memset (ret,0,i); /* zero the entire vector */ /* mark all the non-CJK codepoints */ /* U+0000 - U+2E7F non-CJK */ for (i = 0; i < 0x2E7F; ++i) ret[i] = 0x1; /* U+2E80 - U+2EFF CJK Radicals Supplement * U+2F00 - U+2FDF Kangxi Radicals * U+2FE0 - U+2FEF unassigned * U+2FF0 - U+2FFF Ideographic Description Characters * U+3000 - U+303F CJK Symbols and Punctuation * U+3040 - U+309F Hiragana * U+30A0 - U+30FF Katakana * U+3100 - U+312F BoPoMoFo * U+3130 - U+318F Hangul Compatibility Jamo * U+3190 - U+319F Kanbun * U+31A0 - U+31BF BoPoMoFo Extended * U+31C0 - U+31EF CJK Strokes * U+31F0 - U+31FF Katakana Phonetic Extensions * U+3200 - U+32FF Enclosed CJK Letters and Months * U+3300 - U+33FF CJK Compatibility * U+3400 - U+4DBF CJK Unified Ideographs Extension A * U+4DC0 - U+4DFF Yijing Hexagram Symbols * U+4E00 - U+9FFF CJK Unified Ideographs * U+A000 - U+A48F Yi Syllables * U+A490 - U+A4CF Yi Radicals * U+A700 - U+A71F Modifier Tone Letters */ for (i = 0xa720; i < 0xabff; ++i) ret[i] = 0x1; /* U+AC00 - U+D7FF Hangul Syllables */ for (i = 0xd800; i < 0xf8ff; ++i) ret[i] = 0x1; /* U+F900 - U+FAFF CJK Compatibility Ideographs */ for (i = 0xfb00; i < 0xfe2f; ++i) ret[i] = 0x1; /* U+FE30 - U+FE4F CJK Compatibility Forms * U+FE50 - U+FE6F Small Form Variants (for CNS 11643) */ for (i = 0xfe70; i < 0xfeff; ++i) ret[i] = 0x1; /* U+FF00 - U+FFEF CJK Compatibility Ideographs */ for (i = 0xfff0; i < 0x10000; ++i) ret[i] = 0x1; /* for each supplied charset */ for (csi = 1; ret && charsets && (s = charsets[csi - 1]); ++csi) { /* substitute EUC-JP for ISO-2022-JP */ if (!compare_cstring (s,"ISO-2022-JP")) s = "EUC-JP"; /* look up charset */ if (cs = utf8_charset (s)) { csb = 1 << csi; /* charset bit */ switch (cs->type) { case CT_ASCII: /* 7-bit ASCII no table */ case CT_1BYTE0: /* 1 byte no table */ case CT_1BYTE: /* 1 byte ASCII + table 0x80-0xff */ case CT_1BYTE8: /* 1 byte table 0x00 - 0xff */ case CT_EUC: /* 2 byte ASCII + utf8_eucparam base/CS2/CS3 */ case CT_DBYTE: /* 2 byte ASCII + utf8_eucparam */ case CT_DBYTE2: /* 2 byte ASCII + utf8_eucparam plane1/2 */ case CT_SJIS: /* 2 byte Shift-JIS */ /* supported charset type, all ASCII is OK */ for (i = 0; i < 128; ++i) ret[i] |= csb; break; default: /* unsupported charset type */ fs_give ((void **) &ret); break; } /* now do additional operations */ if (ret) switch (cs->type) { case CT_1BYTE0: /* 1 byte no table */ for (i = 128; i < 256; i++) ret[i] |= csb; break; case CT_1BYTE: /* 1 byte ASCII + table 0x80-0xff */ for (tab = (unsigned short *) cs->tab,i = 128; i < 256; i++) if (tab[i & BITS7] != UBOGON) ret[tab[i & BITS7]] |= csb; break; case CT_1BYTE8: /* 1 byte table 0x00 - 0xff */ for (tab = (unsigned short *) cs->tab,i = 0; i < 256; i++) if (tab[i] != UBOGON) ret[tab[i]] |= csb; break; case CT_EUC: /* 2 byte ASCII + utf8_eucparam base/CS2/CS3 */ for (param = (struct utf8_eucparam *) cs->tab, tab = (unsigned short *) param->tab, ku = 0; ku < param->max_ku; ku++) for (ten = 0; ten < param->max_ten; ten++) if ((u = tab[(ku * param->max_ten) + ten]) != UBOGON) ret[u] |= csb; break; case CT_DBYTE: /* 2 byte ASCII + utf8_eucparam */ for (param = (struct utf8_eucparam *) cs->tab, tab = (unsigned short *) param->tab, ku = 0; ku < param->max_ku; ku++) for (ten = 0; ten < param->max_ten; ten++) if ((u = tab[(ku * param->max_ten) + ten]) != UBOGON) ret[u] |= csb; break; case CT_DBYTE2: /* 2 byte ASCII + utf8_eucparam plane1/2 */ param = (struct utf8_eucparam *) cs->tab; p2 = param + 1; /* plane 2 parameters */ /* only ten parameters should differ */ if ((param->base_ku != p2->base_ku) || (param->max_ku != p2->max_ku)) fatal ("ku definition error for CT_DBYTE2 charset"); /* total codepoints in each ku */ m = param->max_ten + p2->max_ten; tab = (unsigned short *) param->tab; for (ku = 0; ku < param->max_ku; ku++) { for (ten = 0; ten < param->max_ten; ten++) if ((u = tab[(ku * m) + ten]) != UBOGON) ret[u] |= csb; for (ten = 0; ten < p2->max_ten; ten++) if ((u = tab[(ku * m) + param->max_ten + ten]) != UBOGON) ret[u] |= csb; } break; case CT_SJIS: /* 2 byte Shift-JIS */ for (ku = 0; ku < MAX_JIS0208_KU; ku++) for (ten = 0; ten < MAX_JIS0208_TEN; ten++) if ((u = jis0208tab[ku][ten]) != UBOGON) ret[u] |= csb; /* JIS hankaku katakana */ for (u = 0; u < (MAX_KANA_8 - MIN_KANA_8); u++) ret[UCS2_KATAKANA + u] |= csb; break; } } /* invalid charset, punt */ else fs_give ((void **) &ret); } return ret;}/* Infer charset from unlabelled sized text * Accepts: sized text * Returns: charset if one inferred, or NIL if unknown */const CHARSET *utf8_infercharset (SIZEDTEXT *src){ long iso2022jp = NIL; long eightbit = NIL; unsigned long i; /* look for ISO 2022 */ if (src) for (i = 0; i < src->size; i++) { /* ESC sequence? */ if ((src->data[i] == I2C_ESC) && (++i < src->size)) switch (src->data[i]) { case I2C_MULTI: /* yes, multibyte? */ if (++i < src->size) switch (src->data[i]) { case I2CS_94x94_JIS_OLD: /* JIS X 0208-1978 */ case I2CS_94x94_JIS_NEW: /* JIS X 0208-1983 */ case I2CS_94x94_JIS_EXT: /* JIS X 0212-1990 (kludge...) */ iso2022jp = T; /* found an ISO-2022-JP sequence */ break; default: /* other multibyte */ return NIL; /* definitely invalid */ } break; case I2C_G0_94: /* single byte */ if (++i < src->size) switch (src->data[i]) { case I2CS_94_JIS_BUGROM: /* in case old buggy software */ case I2CS_94_JIS_ROMAN: /* JIS X 0201-1976 left half */ case I2CS_94_ASCII: /* ASCII */ case I2CS_94_BRITISH: /* good enough for gov't work */ break; default: /* other 94 single byte */ return NIL; /* definitely invalid */ } } /* if possible UTF-8 and not ISO-2022-JP */ else if (!iso2022jp && (eightbit >= 0) && (src->data[i] & BIT8) && (eightbit = utf8_validate (src->data + i,src->size - i)) > 0) i += eightbit - 1; /* skip past all but last of UTF-8 char */ } /* ISO-2022-JP overrides other guesses */ if (iso2022jp) return utf8_charset ("ISO-2022-JP"); if (eightbit > 0) return utf8_charset ("UTF-8"); return eightbit ? NIL : utf8_charset ("US-ASCII");}/* Validate that character at this position is UTF-8 * Accepts: string pointer * size of remaining string * Returns: size of UTF-8 character in octets or -1 if not UTF-8 */long utf8_validate (unsigned char *s,unsigned long i){ unsigned long j = i; return (utf8_get (&s,&i) & U8G_ERROR) ? -1 : j - i;}/* Convert ISO 8859-1 to UTF-8 * Accepts: source sized text * pointer to return sized text * canonicalization function */void utf8_text_1byte0 (SIZEDTEXT *text,SIZEDTEXT *ret,ucs4cn_t cv,ucs4de_t de){ unsigned long i; unsigned char *s; unsigned int c; for (ret->size = i = 0; i < text->size;) { c = text->data[i++]; UTF8_COUNT_BMP (ret->size,c,cv,de) } (s = ret->data = (unsigned char *) fs_get (ret->size + 1))[ret->size] =NIL; for (i = 0; i < text->size;) { c = text->data[i++]; UTF8_WRITE_BMP (s,c,cv,de) /* convert UCS-2 to UTF-8 */ }}/* Convert single byte ASCII+8bit character set sized text to UTF-8 * Accepts: source sized text * pointer to return sized text * conversion table * canonicalization function */void utf8_text_1byte (SIZEDTEXT *text,SIZEDTEXT *ret,void *tab,ucs4cn_t cv, ucs4de_t de){ unsigned long i; unsigned char *s; unsigned int c; unsigned short *tbl = (unsigned short *) tab; for (ret->size = i = 0; i < text->size;) { if ((c = text->data[i++]) & BIT8) c = tbl[c & BITS7]; UTF8_COUNT_BMP (ret->size,c,cv,de) } (s = ret->data = (unsigned char *) fs_get (ret->size + 1))[ret->size] =NIL; for (i = 0; i < text->size;) { if ((c = text->data[i++]) & BIT8) c = tbl[c & BITS7]; UTF8_WRITE_BMP (s,c,cv,de) /* convert UCS-2 to UTF-8 */ }}/* Convert single byte 8bit character set sized text to UTF-8 * Accepts: source sized text * pointer to return sized text * conversion table * canonicalization function */void utf8_text_1byte8 (SIZEDTEXT *text,SIZEDTEXT *ret,void *tab,ucs4cn_t cv, ucs4de_t de){ unsigned long i; unsigned char *s; unsigned int c; unsigned short *tbl = (unsigned short *) tab; for (ret->size = i = 0; i < text->size;) { c = tbl[text->data[i++]]; UTF8_COUNT_BMP (ret->size,c,cv,de) } (s = ret->data = (unsigned char *) fs_get (ret->size + 1))[ret->size] =NIL; for (i = 0; i < text->size;) { c = tbl[text->data[i++]]; UTF8_WRITE_BMP (s,c,cv,de) /* convert UCS-2 to UTF-8 */ }}/* Convert EUC sized text to UTF-8
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -