📄 utf8.c
字号:
for (ten = 0; ten < MAX_JIS0208_TEN; ten++) if ((u = jis0208tab[ku][ten]) != UBOGON) { int sku = ku + BASE_JIS0208_KU; int sten = ten + BASE_JIS0208_TEN; rmap[u] = ((((sku + 1) >> 1) + ((sku < 95) ? 112 : 176)) << 8) + sten + ((sku % 2) ? ((sten > 95) ? 32 : 31) : 126); } /* JIS Roman */ rmap[UCS2_YEN] = JISROMAN_YEN; rmap[UCS2_OVERLINE] = JISROMAN_OVERLINE; /* JIS hankaku katakana */ for (u = 0; u < (MAX_KANA_8 - MIN_KANA_8); u++) rmap[UCS2_KATAKANA + u] = MIN_KANA_8 + u; break; } /* hack: map NBSP to SP if otherwise no map */ if (rmap[0x00a0] == NOCHAR) rmap[0x00a0] = rmap[0x0020]; } return rmap; /* return map */}/* Convert UTF-8 sized text to charset using rmap * Accepts: source sized text * conversion rmap * pointer to returned sized text * substitute character if not in rmap, else NIL to return failure * ISO-2022-JP conversion flag * Returns T if successful, NIL if failure * * This routine doesn't try to convert to all possible charsets; in particular * it doesn't support other Unicode encodings or any ISO 2022 other than * ISO-2022-JP. 
*/

long utf8_rmaptext (SIZEDTEXT *text,unsigned short *rmap,SIZEDTEXT *ret,
                    unsigned long errch,long iso2022jp)
{
  unsigned char *src,*dst;
  unsigned long left,u,c;
  /* sizing pass; zero means the text cannot be converted */
  unsigned long need = utf8_rmapsize (text,rmap,errch,iso2022jp);
  if (!need) {                  /* failure: hand back an empty result */
    ret->data = NIL;
    ret->size = 0;
    return NIL;
  }
  src = text->data;
  dst = ret->data = (unsigned char *) fs_get (need);
  ret->size = need - 1;         /* octets in destination, less the final NUL */
  /* any non-zero flag means ISO-2022-JP, starting in Roman state (1) */
  if (iso2022jp) iso2022jp = 1;
  left = text->size;
  while (left) {
    u = utf8_get (&src,&left);
    if (u == UCS2_BOM) continue;        /* BOM is not copied to the output */
    /* substitute error character when not in BMP or not in rmap */
    if (u & U8GM_NONBMP) c = errch;
    else if ((c = rmap[u]) == NOCHAR) c = errch;
    if (!iso2022jp) {                   /* ISO 2022 not in effect */
      /* high octet first when the mapped value needs two octets */
      if (c > 0xff) *dst++ = (unsigned char) (c >> 8);
      *dst++ = (unsigned char) (c & 0xff);
    }
    else if (iso2022jp == 1) {          /* ISO 2022 Roman */
      if (c < 0x80) *dst++ = (unsigned char) c;
      else {                            /* JIS character: ESC $ B <hi> <lo> */
        *dst++ = I2C_ESC;
        *dst++ = I2C_MULTI;
        *dst++ = I2CS_94x94_JIS_NEW;
        *dst++ = (unsigned char) (c >> 8) & 0x7f;
        *dst++ = (unsigned char) c & 0x7f;
        iso2022jp = 2;                  /* now in ISO 2022 JIS state */
      }
    }
    else if (iso2022jp == 2) {          /* ISO 2022 JIS */
      if (c > 0x7f) {                   /* still JIS: <hi> <lo> */
        *dst++ = (unsigned char) (c >> 8) & 0x7f;
        *dst++ = (unsigned char) c & 0x7f;
      }
      else {                            /* ASCII character: ESC ( J <ch> */
        *dst++ = I2C_ESC;
        *dst++ = I2C_G0_94;
        *dst++ = I2CS_94_JIS_ROMAN;
        *dst++ = (unsigned char) c;
        iso2022jp = 1;                  /* back to ISO 2022 Roman state */
      }
    }
  }
  if (iso2022jp == 2) {         /* ISO-2022-JP string must end in Roman */
    *dst++ = I2C_ESC;           /* ESC ( J */
    *dst++ = I2C_G0_94;
    *dst++ = I2CS_94_JIS_ROMAN;
  }
  *dst++ = NIL;                 /* tie off returned data */
  return LONGT;                 /* return success */
}

/* Calculate size of conversion of UTF-8 sized text to charset using rmap
 * Accepts: source sized text
 *          conversion rmap
 *          substitute character if not
in rmap, else NIL to return failure * ISO-2022-JP conversion flag * Returns size+1 if successful, NIL if failure * * This routine doesn't try to handle to all possible charsets; in particular * it doesn't support other Unicode encodings or any ISO 2022 other than * ISO-2022-JP. */unsigned long utf8_rmapsize (SIZEDTEXT *text,unsigned short *rmap, unsigned long errch,long iso2022jp){ unsigned long i,u,c; unsigned long ret = 1; /* terminating NUL */ unsigned char *s = text->data; if (iso2022jp) iso2022jp = 1; /* start non-zero ISO-2022-JP state at 1 */ for (i = text->size; i;) if ((u = utf8_get (&s,&i)) != UCS2_BOM) { if ((u & U8GM_NONBMP) || (((c = rmap[u]) == NOCHAR) && !(c = errch))) return NIL; /* not in BMP, or NOCHAR and no err char */ switch (iso2022jp) { /* depends upon ISO 2022 mode */ case 0: /* ISO 2022 not in effect */ ret += (c > 0xff) ? 2 : 1; break; case 1: /* ISO 2022 Roman */ if (c < 0x80) ret += 1; /* <ch> */ else { /* JIS character */ ret += 5; /* ESC $ B <hi> <lo> */ iso2022jp = 2; /* shift to ISO 2022 JIS */ } break; case 2: /* ISO 2022 JIS */ if (c > 0x7f) ret += 2; /* <hi> <lo> */ else { /* ASCII character */ ret += 4; /* ESC ( J <ch> */ iso2022jp = 1; /* shift to ISO 2022 Roman */ } break; } } if (iso2022jp == 2) { /* ISO-2022-JP string must end in Roman */ ret += 3; /* ESC ( J */ iso2022jp = 1; /* reset state to Roman */ } return ret;}/* Convert UCS-4 to charset using rmap * Accepts: source UCS-4 character(s) * numver of UCS-4 characters * conversion rmap * pointer to returned sized text * substitute character if not in rmap, else NIL to return failure * Returns T if successful, NIL if failure * * Currently only supports BMP characters, and does not support ISO-2022 */long ucs4_rmaptext (unsigned long *ucs4,unsigned long len,unsigned short *rmap, SIZEDTEXT *ret,unsigned long errch){ long size = ucs4_rmaplen (ucs4,len,rmap,errch); return (size >= 0) ? 
/* build in newly-created buffer */ ucs4_rmapbuf (ret->data = (unsigned char *) fs_get ((ret->size = size) +1), ucs4,len,rmap,errch) : NIL;}/* Return size of UCS-4 string converted to other CS via rmap * Accepts: source UCS-4 character(s) * numver of UCS-4 characters * conversion rmap * substitute character if not in rmap, else NIL to return failure * Returns: length if success, negative if failure (no-convert) */long ucs4_rmaplen (unsigned long *ucs4,unsigned long len,unsigned short *rmap, unsigned long errch){ long ret; unsigned long i,u,c; /* count non-BOM characters */ for (ret = 0,i = 0; i < len; ++i) if ((u = ucs4[i]) != UCS2_BOM) { if ((u & U8GM_NONBMP) || (((c = rmap[u]) == NOCHAR) && !(c = errch))) return -1; /* not in BMP, or NOCHAR and no err char? */ ret += (c > 0xff) ? 2 : 1; } return ret;}/* Stuff buffer with UCS-4 string converted to other CS via rmap * Accepts: destination buffer * source UCS-4 character(s) * number of UCS-4 characters * conversion rmap * substitute character if not in rmap, else NIL to return failure * Returns: T, always */long ucs4_rmapbuf (unsigned char *t,unsigned long *ucs4,unsigned long len, unsigned short *rmap,unsigned long errch){ unsigned long i,u,c; /* convert non-BOM characters */ for (i = 0; i < len; ++i) if ((u = ucs4[i]) != UCS2_BOM) { /* substitute error character for NOCHAR */ if ((u & U8GM_NONBMP) || ((c = rmap[u]) == NOCHAR)) c = errch; /* two-byte character? 
*/ if (c > 0xff) *t++ = (unsigned char) (c >> 8); /* single-byte or low-byte of two-byte */ *t++ = (unsigned char) (c & 0xff); } *t++ = NIL; /* tie off returned data */ return LONGT;}/* Return UCS-4 Unicode character from UTF-8 string * Accepts: pointer to string * remaining octets in string * Returns: UCS-4 character with pointer and count updated * or error code with pointer and count unchanged */unsigned long utf8_get (unsigned char **s,unsigned long *i){ unsigned char *t = *s; unsigned long j = *i; /* decode raw UTF-8 string */ unsigned long ret = utf8_get_raw (&t,&j); if (ret & U8G_ERROR); /* invalid raw UTF-8 decoding? */ /* no, is it surrogate? */ else if ((ret >= UTF16_SURR) && (ret <= UTF16_MAXSURR)) ret = U8G_SURROGA; /* or in non-Unicode ISO 10646 space? */ else if (ret > UCS4_MAXUNICODE) ret = U8G_NOTUNIC; else { *s = t; /* all is well, update pointer */ *i = j; /* and counter */ } return ret; /* return value */}/* Return raw (including non-Unicode) UCS-4 character from UTF-8 string * Accepts: pointer to string * remaining octets in string * Returns: UCS-4 character with pointer and count updated * or error code with pointer and count unchanged */unsigned long utf8_get_raw (unsigned char **s,unsigned long *i){ unsigned char c,c1; unsigned char *t = *s; unsigned long j = *i; unsigned long ret = U8G_NOTUTF8; int more = 0; do { /* make sure have source octets available */ if (!j--) return more ? U8G_ENDSTRI : U8G_ENDSTRG; /* UTF-8 continuation? */ else if (((c = *t++) > 0x7f) && (c < 0xc0)) { /* continuation when not in progress */ if (!more) return U8G_BADCONT; --more; /* found a continuation octet */ ret <<= 6; /* shift current value by 6 bits */ ret |= c & 0x3f; /* merge continuation octet */ } /* incomplete UTF-8 character */ else if (more) return U8G_INCMPLT; else { /* start of sequence */ c1 = j ? 
*t : 0xbf; /* assume valid continuation if incomplete */ if (c < 0x80) ret = c; /* U+0000 - U+007f */ else if (c < 0xc2); /* c0 and c1 never valid */ else if (c < 0xe0) { /* U+0080 - U+07ff */ if (c &= 0x1f) more = 1; } else if (c < 0xf0) { /* U+0800 - U+ffff */ if ((c &= 0x0f) || (c1 >= 0xa0)) more = 2; } else if (c < 0xf8) { /* U+10000 - U+10ffff (and 110000 - 1fffff) */ if ((c &= 0x07) || (c1 >= 0x90)) more = 3; } else if (c < 0xfc) { /* ISO 10646 200000 - 3ffffff */ if ((c &= 0x03) || (c1 >= 0x88)) more = 4; } else if (c < 0xfe) { /* ISO 10646 4000000 - 7fffffff */ if ((c &= 0x01) || (c1 >= 0x84)) more = 5; } /* fe and ff never valid */ if (more) { /* multi-octet, make sure more to come */ if (!j) return U8G_ENDSTRI; ret = c; /* continuation needed, save start bits */ } } } while (more); if (!(ret & U8G_ERROR)) { /* success return? */ *s = t; /* yes, update pointer */ *i = j; /* and counter */ } return ret; /* return value */}/* Return UCS-4 character from named charset string * Accepts: charset * pointer to string * remaining octets in string * Returns: UCS-4 character with pointer and count updated, negative if error * * Error codes are the same as utf8_get(). */unsigned long ucs4_cs_get (CHARSET *cs,unsigned char **s,unsigned long *i){ unsigned char c,c1,ku,ten; unsigned long ret,d; unsigned char *t = *s; unsigned long j = *i; struct utf8_eucparam *p1,*p2,*p3; if (j--) c = *t++; /* get first octet */ else return U8G_ENDSTRG; /* empty string */ switch (cs->type) { /* convert if type known */ case CT_UTF8: /* variable UTF-8 encoded Unicode no table */ return utf8_get (s,i); case CT_ASCII: /* 7-bit ASCII no table */ if (c >= 0x80) return U8G_NOTUTF8; case CT_1BYTE0: /* 1 byte no table */ ret = c; /* identity */ break; case CT_1BYTE: /* 1 byte ASCII + table 0x80-0xff */ ret = (c > 0x80) ? 
((unsigned short *) cs->tab)[c & BITS7] : c; break; case CT_1BYTE8: /* 1 byte table 0x00 - 0xff */ ret = ((unsigned short *) cs->tab)[c]; break; case CT_EUC: /* 2 byte ASCII + utf8_eucparam base/CS2/CS3 */ if (c & BIT8) { p1 = (struct utf8_eucparam *) cs->tab; p2 = p1 + 1; p3 = p1 + 2; if (j--) c1 = *t++; /* get second octet */ else return U8G_ENDSTRI; if (!(c1 & BIT8)) return U8G_NOTUTF8; switch (c) { /* check 8bit code set */ case EUC_CS2: /* CS2 */ if (p2->base_ku) { /* CS2 set up? */ if (p2->base_ten) { /* yes, multibyte? */ if (j--) c = *t++; /* get second octet */ else return U8G_ENDSTRI; if ((c & BIT8) && ((ku = (c1 & BITS7) - p2->base_ku) < p2->max_ku) && ((ten = (c & BITS7) - p2->base_ten) < p2->max_ten)) { ret = ((unsigned short *) p2->tab)[(ku*p2->max_ten) + ten]; break; } } else if ((c1 >= p2->base_ku) && (c1 < p2->max_ku)) { ret = c1 + ((unsigned long) p2->tab); break; } } return U8G_NOTUTF8; /* CS2 not set up or bogus */ case EUC_CS3: /* CS3 */ if (p3->base_ku) { /* CS3 set up? */ if (p3->base_ten) { /* yes, multibyte? */ if (j--) c = *t++; /* get second octet */ else return U8G_ENDSTRI; if ((c & BIT8) && ((ku = (c1 & BITS7) - p3->base_ku) < p3->max_ku) && ((ten = (c & BITS7) - p3->base_ten) < p3->max_ten)) { ret = ((unsigned short *) p3->tab)[(ku*p3->max_ten) + ten]; break; } } else if ((c1 >= p3->base_ku) && (c1 < p3->max_ku)) { ret = c1 + ((unsigned long) p3->tab); break; } } return U8G_NOTUTF8; /* CS3 not set up or bogus */
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -