📄 utf8.c
字号:
{ unsigned long i; if (!script) return (SCRIPT *) &utf8_scvalid[0]; else if (*script && (strlen (script) < 128)) for (i = 0; utf8_scvalid[i].name; i++) if (!compare_cstring (script,utf8_scvalid[i].name)) return (SCRIPT *) &utf8_scvalid[i]; return NIL; /* failed */}/* Look up charset name or return entire table * Accepts: charset name or NIL * Returns: charset table entry or NIL if unknown */const CHARSET *utf8_charset (char *charset){ unsigned long i; if (!charset) return (CHARSET *) &utf8_csvalid[0]; else if (*charset && (strlen (charset) < 128)) for (i = 0; utf8_csvalid[i].name; i++) if (!compare_cstring (charset,utf8_csvalid[i].name)) return (CHARSET *) &utf8_csvalid[i]; return NIL; /* failed */}/* Validate charset and generate error message if invalid * Accepts: bad character set * Returns: NIL if good charset, else error message string */#define BADCSS "[BADCHARSET ("#define BADCSE ")] Unknown charset: "char *utf8_badcharset (char *charset){ char *msg = NIL; if (!utf8_charset (charset)) { char *s,*t; unsigned long i,j; /* calculate size of header, trailer, and bad * charset plus charset names */ for (i = 0, j = sizeof (BADCSS) + sizeof (BADCSE) + strlen (charset) - 2; utf8_csvalid[i].name; i++) j += strlen (utf8_csvalid[i].name) + 1; /* not built right */ if (!i) fatal ("No valid charsets!"); /* header */ for (s = msg = (char *) fs_get (j), t = BADCSS; *t; *s++ = *t++); /* each charset */ for (i = 0; utf8_csvalid[i].name; *s++ = ' ', i++) for (t = utf8_csvalid[i].name; *t; *s++ = *t++); /* back over last space, trailer */ for (t = BADCSE, --s; *t; *s++ = *t++); /* finally bogus charset */ for (t = charset; *t; *s++ = *t++); *s++ = '\0'; /* finally tie off string */ if (s != (msg + j)) fatal ("charset msg botch"); } return msg;}/* Convert charset labelled sized text to UTF-8 * Accepts: source sized text * charset * pointer to returned sized text if non-NIL * flags * Returns: T if successful, NIL if failure */long utf8_text (SIZEDTEXT *text,char *charset,SIZEDTEXT *ret,long flags){ ucs4cn_t cv = (flags & U8T_CASECANON) ? ucs4_titlecase : NIL; ucs4de_t de = (flags & U8T_DECOMPOSE) ? ucs4_decompose_recursive : NIL; const CHARSET *cs = (charset && *charset) ? utf8_charset (charset) : utf8_infercharset (text); if (cs) return (text && ret) ? utf8_text_cs (text,cs,ret,cv,de) : LONGT; if (ret) { /* no conversion possible */ ret->data = text->data; /* so return source */ ret->size = text->size; } return NIL; /* failure */}/* Operations used in converting data */#define UTF8_COUNT_BMP(count,c,cv,de) { \ void *more = NIL; \ if (cv) c = (*cv) (c); \ if (de) c = (*de) (c,&more); \ do count += UTF8_SIZE_BMP(c); \ while (more && (c = (*de) (U8G_ERROR,&more)));\}#define UTF8_WRITE_BMP(b,c,cv,de) { \ void *more = NIL; \ if (cv) c = (*cv) (c); \ if (de) c = (*de) (c,&more); \ do UTF8_PUT_BMP (b,c) \ while (more && (c = (*de) (U8G_ERROR,&more)));\}#define UTF8_COUNT(count,c,cv,de) { \ void *more = NIL; \ if (cv) c = (*cv) (c); \ if (de) c = (*de) (c,&more); \ do count += utf8_size (c); \ while (more && (c = (*de) (U8G_ERROR,&more)));\}#define UTF8_WRITE(b,c,cv,de) { \ void *more = NIL; \ if (cv) c = (*cv) (c); \ if (de) c = (*de) (c,&more); \ do b = utf8_put (b,c); \ while (more && (c = (*de) (U8G_ERROR,&more)));\}/* Convert sized text to UTF-8 given CHARSET block * Accepts: source sized text * CHARSET block * pointer to returned sized text * canonicalization function * decomposition function * Returns: T if successful, NIL if failure */long utf8_text_cs (SIZEDTEXT *text,const CHARSET *cs,SIZEDTEXT *ret, ucs4cn_t cv,ucs4de_t de){ ret->data = text->data; /* default to source */ ret->size = text->size; switch (cs->type) { /* convert if type known */ case CT_ASCII: /* 7-bit ASCII no table */ case CT_UTF8: /* variable UTF-8 encoded Unicode no table */ if (cv || de) utf8_text_utf8 (text,ret,cv,de); break; case CT_1BYTE0: /* 1 byte no table */ utf8_text_1byte0 (text,ret,cv,de); break; case CT_1BYTE: /* 1 byte ASCII + table 0x80-0xff */ utf8_text_1byte (text,ret,cs->tab,cv,de); break; case CT_1BYTE8: /* 1 byte table 0x00 - 0xff */ utf8_text_1byte8 (text,ret,cs->tab,cv,de); break; case CT_EUC: /* 2 byte ASCII + utf8_eucparam base/CS2/CS3 */ utf8_text_euc (text,ret,cs->tab,cv,de); break; case CT_DBYTE: /* 2 byte ASCII + utf8_eucparam */ utf8_text_dbyte (text,ret,cs->tab,cv,de); break; case CT_DBYTE2: /* 2 byte ASCII + utf8_eucparam plane1/2 */ utf8_text_dbyte2 (text,ret,cs->tab,cv,de); break; case CT_UTF7: /* variable UTF-7 encoded Unicode no table */ utf8_text_utf7 (text,ret,cv,de); break; case CT_UCS2: /* 2 byte 16-bit Unicode no table */ utf8_text_ucs2 (text,ret,cv,de); break; case CT_UCS4: /* 4 byte 32-bit Unicode no table */ utf8_text_ucs4 (text,ret,cv,de); break; case CT_UTF16: /* variable UTF-16 encoded Unicode no table */ utf8_text_utf16 (text,ret,cv,de); break; case CT_2022: /* variable ISO-2022 encoded no table*/ utf8_text_2022 (text,ret,cv,de); break; case CT_SJIS: /* 2 byte Shift-JIS encoded JIS no table */ utf8_text_sjis (text,ret,cv,de); break; default: /* unknown character set type */ return NIL; } return LONGT; /* return success */}/* Reverse mapping routines * * These routines only support character sets, not all possible charsets. In * particular, they do not support any Unicode encodings or ISO 2022. * * As a special dispensation, utf8_cstext() and utf8_cstocstext() support * support ISO-2022-JP if EUC-JP can be reverse mapped; and utf8_rmaptext() * will generated ISO-2022-JP using an EUC-JP rmap if flagged to do so. * * No attempt is made to map "equivalent" Unicode characters or Unicode * characters that have the same glyph; nor is there any attempt to handle * combining characters or otherwise do any stringprep. Maybe later. *//* Convert UTF-8 sized text to charset * Accepts: source sized text * destination charset * pointer to returned sized text * substitute character if not in cs, else NIL to return failure * Returns: T if successful, NIL if failure */long utf8_cstext (SIZEDTEXT *text,char *charset,SIZEDTEXT *ret, unsigned long errch){ short iso2022jp = !compare_cstring (charset,"ISO-2022-JP"); unsigned short *rmap = utf8_rmap (iso2022jp ? "EUC-JP" : charset); return rmap ? utf8_rmaptext (text,rmap,ret,errch,iso2022jp) : NIL;}/* Convert charset labelled sized text to another charset * Accepts: source sized text * source charset * pointer to returned sized text * destination charset * substitute character if not in dest cs, else NIL to return failure * Returns: T if successful, NIL if failure * * This routine has the same restricts as utf8_cstext(). */long utf8_cstocstext (SIZEDTEXT *src,char *sc,SIZEDTEXT *dst,char *dc, unsigned long errch){ SIZEDTEXT utf8; const CHARSET *scs,*dcs; unsigned short *rmap; long ret = NIL; long iso2022jp; /* lookup charsets and reverse map */ if ((dc && (dcs = utf8_charset (dc))) && (rmap = (iso2022jp = ((dcs->type == CT_2022) && !compare_cstring (dcs->name,"ISO-2022-JP"))) ? utf8_rmap ("EUC-JP") : utf8_rmap_cs (dcs)) && (scs = (sc && *sc) ? utf8_charset (sc) : utf8_infercharset (src))) { /* init temporary buffer */ memset (&utf8,NIL,sizeof (SIZEDTEXT)); /* source cs equivalent to dest cs? */ if ((scs->type == dcs->type) && (scs->tab == dcs->tab)) { dst->data = src->data; /* yes, just copy pointers */ dst->size = src->size; ret = LONGT; } /* otherwise do the conversion */ else ret = (utf8_text_cs (src,scs,&utf8,NIL,NIL) && utf8_rmaptext (&utf8,rmap,dst,errch,iso2022jp)); /* flush temporary buffer */ if (utf8.data && (utf8.data != src->data) && (utf8.data != dst->data)) fs_give ((void **) &utf8.data); } return ret;}/* Cached rmap */static const CHARSET *currmapcs = NIL;static unsigned short *currmap = NIL;/* Cache and return map for UTF-8 -> character set * Accepts: character set name * Returns: cached map if character set found, else NIL */unsigned short *utf8_rmap (char *charset){ return (currmapcs && !compare_cstring (charset,currmapcs->name)) ? currmap : utf8_rmap_cs (utf8_charset (charset));}/* Cache and return map for UTF-8 -> character set given CHARSET block * Accepts: CHARSET block * Returns: cached map if character set found, else NIL */unsigned short *utf8_rmap_cs (const CHARSET *cs){ unsigned short *ret = NIL; if (!cs); /* have charset? */ else if (cs == currmapcs) ret = currmap; else if (ret = utf8_rmap_gen (cs,currmap)) { currmapcs = cs; currmap = ret; } return ret;}/* Return map for UTF-8 -> character set given CHARSET block * Accepts: CHARSET block * old map to recycle * Returns: map if character set found, else NIL */unsigned short *utf8_rmap_gen (const CHARSET *cs,unsigned short *oldmap){ unsigned short u,*tab,*rmap; unsigned int i,m,ku,ten; struct utf8_eucparam *param,*p2; switch (cs->type) { /* is a character set? */ case CT_ASCII: /* 7-bit ASCII no table */ case CT_1BYTE0: /* 1 byte no table */ case CT_1BYTE: /* 1 byte ASCII + table 0x80-0xff */ case CT_1BYTE8: /* 1 byte table 0x00 - 0xff */ case CT_EUC: /* 2 byte ASCII + utf8_eucparam base/CS2/CS3 */ case CT_DBYTE: /* 2 byte ASCII + utf8_eucparam */ case CT_DBYTE2: /* 2 byte ASCII + utf8_eucparam plane1/2 */ case CT_SJIS: /* 2 byte Shift-JIS */ rmap = oldmap ? oldmap : /* recycle old map if supplied else make new */ (unsigned short *) fs_get (65536 * sizeof (unsigned short)); /* initialize table for ASCII */ for (i = 0; i < 128; i++) rmap[i] = (unsigned short) i; /* populate remainder of table with NOCHAR */#define NOCHARBYTE (NOCHAR & 0xff)#if NOCHAR - ((NOCHARBYTE << 8) | NOCHARBYTE) while (i < 65536) rmap[i++] = NOCHAR;#else memset (rmap + 128,NOCHARBYTE,(65536 - 128) * sizeof (unsigned short));#endif break; default: /* unsupported charset type */ rmap = NIL; /* no map possible */ } if (rmap) { /* have a map? */ switch (cs->type) { /* additional reverse map actions */ case CT_1BYTE0: /* 1 byte no table */ for (i = 128; i < 256; i++) rmap[i] = (unsigned short) i; break; case CT_1BYTE: /* 1 byte ASCII + table 0x80-0xff */ for (tab = (unsigned short *) cs->tab,i = 128; i < 256; i++) if (tab[i & BITS7] != UBOGON) rmap[tab[i & BITS7]] = (unsigned short)i; break; case CT_1BYTE8: /* 1 byte table 0x00 - 0xff */ for (tab = (unsigned short *) cs->tab,i = 0; i < 256; i++) if (tab[i] != UBOGON) rmap[tab[i]] = (unsigned short) i; break; case CT_EUC: /* 2 byte ASCII + utf8_eucparam base/CS2/CS3 */ for (param = (struct utf8_eucparam *) cs->tab, tab = (unsigned short *) param->tab, ku = 0; ku < param->max_ku; ku++) for (ten = 0; ten < param->max_ten; ten++) if ((u = tab[(ku * param->max_ten) + ten]) != UBOGON) rmap[u] = ((ku + param->base_ku) << 8) + (ten + param->base_ten) + 0x8080; break; case CT_DBYTE: /* 2 byte ASCII + utf8_eucparam */ for (param = (struct utf8_eucparam *) cs->tab, tab = (unsigned short *) param->tab, ku = 0; ku < param->max_ku; ku++) for (ten = 0; ten < param->max_ten; ten++) if ((u = tab[(ku * param->max_ten) + ten]) != UBOGON) rmap[u] = ((ku + param->base_ku) << 8) + (ten + param->base_ten); break; case CT_DBYTE2: /* 2 byte ASCII + utf8_eucparam plane1/2 */ param = (struct utf8_eucparam *) cs->tab; p2 = param + 1; /* plane 2 parameters */ /* only ten parameters should differ */ if ((param->base_ku != p2->base_ku) || (param->max_ku != p2->max_ku)) fatal ("ku definition error for CT_DBYTE2 charset"); /* total codepoints in each ku */ m = param->max_ten + p2->max_ten; tab = (unsigned short *) param->tab; for (ku = 0; ku < param->max_ku; ku++) { for (ten = 0; ten < param->max_ten; ten++) if ((u = tab[(ku * m) + ten]) != UBOGON) rmap[u] = ((ku + param->base_ku) << 8) + (ten + param->base_ten); for (ten = 0; ten < p2->max_ten; ten++) if ((u = tab[(ku * m) + param->max_ten + ten]) != UBOGON) rmap[u] = ((ku + param->base_ku) << 8) + (ten + p2->base_ten); } break; case CT_SJIS: /* 2 byte Shift-JIS */ for (ku = 0; ku < MAX_JIS0208_KU; ku++)
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -