📄 utf8.c
字号:
* Accepts: source sized text * pointer to return sized text * EUC parameter table * canonicalization function */void utf8_text_euc (SIZEDTEXT *text,SIZEDTEXT *ret,void *tab,ucs4cn_t cv, ucs4de_t de){ unsigned long i; unsigned char *s; unsigned int pass,c,c1,ku,ten; struct utf8_eucparam *p1 = (struct utf8_eucparam *) tab; struct utf8_eucparam *p2 = p1 + 1; struct utf8_eucparam *p3 = p1 + 2; unsigned short *t1 = (unsigned short *) p1->tab; unsigned short *t2 = (unsigned short *) p2->tab; unsigned short *t3 = (unsigned short *) p3->tab; for (pass = 0,s = NIL,ret->size = 0; pass <= 1; pass++) { for (i = 0; i < text->size;) { /* not CS0? */ if ((c = text->data[i++]) & BIT8) { /* yes, must have another high byte */ if ((i >= text->size) || !((c1 = text->data[i++]) & BIT8)) c = UBOGON; /* out of space or bogon */ else switch (c) { /* check 8bit code set */ case EUC_CS2: /* CS2 */ if (p2->base_ku) { /* CS2 set up? */ if (p2->base_ten) /* yes, multibyte? */ c = ((i < text->size) && ((c = text->data[i++]) & BIT8) && ((ku = (c1 & BITS7) - p2->base_ku) < p2->max_ku) && ((ten = (c & BITS7) - p2->base_ten) < p2->max_ten)) ? t2[(ku*p2->max_ten) + ten] : UBOGON; else c = ((c1 >= p2->base_ku) && (c1 < p2->max_ku)) ? c1 + ((unsigned long) p2->tab) : UBOGON; } else { /* CS2 not set up */ c = UBOGON; /* swallow byte, say bogon */ if (i < text->size) i++; } break; case EUC_CS3: /* CS3 */ if (p3->base_ku) { /* CS3 set up? */ if (p3->base_ten) /* yes, multibyte? */ c = ((i < text->size) && ((c = text->data[i++]) & BIT8) && ((ku = (c1 & BITS7) - p3->base_ku) < p3->max_ku) && ((ten = (c & BITS7) - p3->base_ten) < p3->max_ten)) ? t3[(ku*p3->max_ten) + ten] : UBOGON; else c = ((c1 >= p3->base_ku) && (c1 < p3->max_ku)) ? c1 + ((unsigned long) p3->tab) : UBOGON; } else { /* CS3 not set up */ c = UBOGON; /* swallow byte, say bogon */ if (i < text->size) i++; } break; default: if (((ku = (c & BITS7) - p1->base_ku) >= p1->max_ku) || ((ten = (c1 & BITS7) - p1->base_ten) >= p1->max_ten)) c = UBOGON; else if (((c = t1[(ku*p1->max_ten) + ten]) == UBOGON) && /* special hack for JIS X 0212: merge rows less than 10 */ ku && (ku < 10) && t3 && p3->base_ten) c = t3[((ku - (p3->base_ku - p1->base_ku))*p3->max_ten) + ten]; } } /* convert if second pass */ if (pass) UTF8_WRITE_BMP (s,c,cv,de) else UTF8_COUNT_BMP (ret->size,c,cv,de); } if (!pass) (s = ret->data = (unsigned char *) fs_get (ret->size + 1))[ret->size] =NIL; }}/* Convert ASCII + double-byte sized text to UTF-8 * Accepts: source sized text * pointer to return sized text * conversion table * canonicalization function */void utf8_text_dbyte (SIZEDTEXT *text,SIZEDTEXT *ret,void *tab,ucs4cn_t cv, ucs4de_t de){ unsigned long i; unsigned char *s; unsigned int c,c1,ku,ten; struct utf8_eucparam *p1 = (struct utf8_eucparam *) tab; unsigned short *t1 = (unsigned short *) p1->tab; for (ret->size = i = 0; i < text->size;) { if ((c = text->data[i++]) & BIT8) { /* special hack for GBK: 0x80 is Euro */ if ((c == 0x80) && (t1 == (unsigned short *) gb2312tab)) c = UCS2_EURO; else c = ((i < text->size) && (c1 = text->data[i++]) && ((ku = c - p1->base_ku) < p1->max_ku) && ((ten = c1 - p1->base_ten) < p1->max_ten)) ? t1[(ku*p1->max_ten) + ten] : UBOGON; } UTF8_COUNT_BMP (ret->size,c,cv,de) } (s = ret->data = (unsigned char *) fs_get (ret->size + 1))[ret->size] = NIL; for (i = 0; i < text->size;) { if ((c = text->data[i++]) & BIT8) { /* special hack for GBK: 0x80 is Euro */ if ((c == 0x80) && (t1 == (unsigned short *) gb2312tab)) c = UCS2_EURO; else c = ((i < text->size) && (c1 = text->data[i++]) && ((ku = c - p1->base_ku) < p1->max_ku) && ((ten = c1 - p1->base_ten) < p1->max_ten)) ? t1[(ku*p1->max_ten) + ten] : UBOGON; } UTF8_WRITE_BMP (s,c,cv,de) /* convert UCS-2 to UTF-8 */ }}/* Convert ASCII + double byte 2 plane sized text to UTF-8 * Accepts: source sized text * pointer to return sized text * conversion table * canonicalization function */void utf8_text_dbyte2 (SIZEDTEXT *text,SIZEDTEXT *ret,void *tab,ucs4cn_t cv, ucs4de_t de){ unsigned long i; unsigned char *s; unsigned int c,c1,ku,ten; struct utf8_eucparam *p1 = (struct utf8_eucparam *) tab; struct utf8_eucparam *p2 = p1 + 1; unsigned short *t = (unsigned short *) p1->tab; for (ret->size = i = 0; i < text->size;) { if ((c = text->data[i++]) & BIT8) { if ((i >= text->size) || !(c1 = text->data[i++])) c = UBOGON; /* out of space or bogon */ else if (c1 & BIT8) /* high vs. low plane */ c = ((ku = c - p2->base_ku) < p2->max_ku && ((ten = c1 - p2->base_ten) < p2->max_ten)) ? t[(ku*(p1->max_ten + p2->max_ten)) + p1->max_ten + ten] :UBOGON; else c = ((ku = c - p1->base_ku) < p1->max_ku && ((ten = c1 - p1->base_ten) < p1->max_ten)) ? t[(ku*(p1->max_ten + p2->max_ten)) + ten] : UBOGON; } UTF8_COUNT_BMP (ret->size,c,cv,de) } (s = ret->data = (unsigned char *) fs_get (ret->size + 1))[ret->size] = NIL; for (i = 0; i < text->size;) { if ((c = text->data[i++]) & BIT8) { if ((i >= text->size) || !(c1 = text->data[i++])) c = UBOGON; /* out of space or bogon */ else if (c1 & BIT8) /* high vs. low plane */ c = ((ku = c - p2->base_ku) < p2->max_ku && ((ten = c1 - p2->base_ten) < p2->max_ten)) ? t[(ku*(p1->max_ten + p2->max_ten)) + p1->max_ten + ten] :UBOGON; else c = ((ku = c - p1->base_ku) < p1->max_ku && ((ten = c1 - p1->base_ten) < p1->max_ten)) ? t[(ku*(p1->max_ten + p2->max_ten)) + ten] : UBOGON; } UTF8_WRITE_BMP (s,c,cv,de) /* convert UCS-2 to UTF-8 */ }}#ifdef JISTOUNICODE /* Japanese *//* Convert Shift JIS sized text to UTF-8 * Accepts: source sized text * pointer to return sized text * canonicalization function */void utf8_text_sjis (SIZEDTEXT *text,SIZEDTEXT *ret,ucs4cn_t cv, ucs4de_t de){ unsigned long i; unsigned char *s; unsigned int c,c1,ku,ten; for (ret->size = i = 0; i < text->size;) { if ((c = text->data[i++]) & BIT8) { /* half-width katakana */ if ((c >= MIN_KANA_8) && (c < MAX_KANA_8)) c += KANA_8; else if (i >= text->size) c = UBOGON; else { /* Shift-JIS */ c1 = text->data[i++]; SJISTOJIS (c,c1); c = JISTOUNICODE (c,c1,ku,ten); } } /* compromise - do yen sign but not overline */ else if (c == JISROMAN_YEN) c = UCS2_YEN; UTF8_COUNT_BMP (ret->size,c,cv,de) } (s = ret->data = (unsigned char *) fs_get (ret->size + 1))[ret->size] = NIL; for (i = 0; i < text->size;) { if ((c = text->data[i++]) & BIT8) { /* half-width katakana */ if ((c >= MIN_KANA_8) && (c < MAX_KANA_8)) c += KANA_8; else { /* Shift-JIS */ c1 = text->data[i++]; SJISTOJIS (c,c1); c = JISTOUNICODE (c,c1,ku,ten); } } /* compromise - do yen sign but not overline */ else if (c == JISROMAN_YEN) c = UCS2_YEN; UTF8_WRITE_BMP (s,c,cv,de) /* convert UCS-2 to UTF-8 */ }}#endif/* Convert ISO-2022 sized text to UTF-8 * Accepts: source sized text * pointer to returned sized text * canonicalization function */void utf8_text_2022 (SIZEDTEXT *text,SIZEDTEXT *ret,ucs4cn_t cv,ucs4de_t de){ unsigned long i; unsigned char *s; unsigned int pass,state,c,co,gi,gl,gr,g[4],ku,ten; for (pass = 0,s = NIL,ret->size = 0; pass <= 1; pass++) { gi = 0; /* quell compiler warnings */ state = I2S_CHAR; /* initialize engine */ g[0]= g[2] = I2CS_ASCII; /* G0 and G2 are ASCII */ g[1]= g[3] = I2CS_ISO8859_1;/* G1 and G3 are ISO-8850-1 */ gl = I2C_G0; gr = I2C_G1; /* left is G0, right is G1 */ for (i = 0; i < text->size;) { c = text->data[i++]; switch (state) { /* dispatch based upon engine state */ case I2S_ESC: /* ESC seen */ switch (c) { /* process intermediate character */ case I2C_MULTI: /* multibyte character? */ state = I2S_MUL; /* mark multibyte flag seen */ break; case I2C_SS2: /* single shift GL to G2 */ case I2C_SS2_ALT: /* Taiwan SeedNet */ gl |= I2C_SG2; break; case I2C_SS3: /* single shift GL to G3 */ case I2C_SS3_ALT: /* Taiwan SeedNet */ gl |= I2C_SG3; break; case I2C_LS2: /* shift GL to G2 */ gl = I2C_G2; break; case I2C_LS3: /* shift GL to G3 */ gl = I2C_G3; break; case I2C_LS1R: /* shift GR to G1 */ gr = I2C_G1; break; case I2C_LS2R: /* shift GR to G2 */ gr = I2C_G2; break; case I2C_LS3R: /* shift GR to G3 */ gr = I2C_G3; break; case I2C_G0_94: case I2C_G1_94: case I2C_G2_94: case I2C_G3_94: g[gi = c - I2C_G0_94] = (state == I2S_MUL) ? I2CS_94x94 : I2CS_94; state = I2S_INT; /* ready for character set */ break; case I2C_G0_96: case I2C_G1_96: case I2C_G2_96: case I2C_G3_96: g[gi = c - I2C_G0_96] = (state == I2S_MUL) ? I2CS_96x96 : I2CS_96; state = I2S_INT; /* ready for character set */ break; default: /* bogon */ if (pass) *s++ = I2C_ESC,*s++ = c; else ret->size += 2; state = I2S_CHAR; /* return to previous state */ } break; case I2S_MUL: /* ESC $ */ switch (c) { /* process multibyte intermediate character */ case I2C_G0_94: case I2C_G1_94: case I2C_G2_94: case I2C_G3_94: g[gi = c - I2C_G0_94] = I2CS_94x94; state = I2S_INT; /* ready for character set */ break; case I2C_G0_96: case I2C_G1_96: case I2C_G2_96: case I2C_G3_96: g[gi = c - I2C_G0_96] = I2CS_96x96; state = I2S_INT; /* ready for character set */ break; default: /* probably omitted I2CS_94x94 */ g[gi = I2C_G0] = I2CS_94x94 | c; state = I2S_CHAR; /* return to character state */ } break; case I2S_INT: state = I2S_CHAR; /* return to character state */ g[gi] |= c; /* set character set */ break; case I2S_CHAR: /* character data */ switch (c) { case I2C_ESC: /* ESC character */ state = I2S_ESC; /* see if ISO-2022 prefix */ break; case I2C_SI: /* shift GL to G0 */ gl = I2C_G0; break; case I2C_SO: /* shift GL to G1 */ gl = I2C_G1; break; case I2C_SS2_ALT: /* single shift GL to G2 */ case I2C_SS2_ALT_7: gl |= I2C_SG2; break; case I2C_SS3_ALT: /* single shift GL to G3 */ case I2C_SS3_ALT_7: gl |= I2C_SG3; break; default: /* ordinary character */ co = c; /* note original character */ if (gl & (3 << 2)) { /* single shifted? */ gi = g[gl >> 2]; /* get shifted character set */ gl &= 0x3; /* cancel shift */ } /* select left or right half */ else gi = (c & BIT8) ? g[gr] : g[gl]; c &= BITS7; /* make 7-bit */ switch (gi) { /* interpret in character set */ case I2CS_ASCII: /* ASCII */ break; /* easy! */ case I2CS_BRITISH: /* British ASCII */ /* Pound sterling sign */ if (c == BRITISH_POUNDSTERLING) c = UCS2_POUNDSTERLING; break; case I2CS_JIS_ROMAN: /* JIS Roman */ case I2CS_JIS_BUGROM: /* old bugs */ switch (c) { /* two exceptions to ASCII */ case JISROMAN_YEN: /* Yen sign */ c = UCS2_YEN; break; /* overline */ case JISROMAN_OVERLINE: c = UCS2_OVERLINE; break; } break; case I2CS_JIS_KANA: /* JIS hankaku katakana */ if ((c >= MIN_KANA_7) && (c < MAX_KANA_7)) c += KANA_7; break; case I2CS_ISO8859_1: /* Latin-1 (West European) */ c |= BIT8; /* just turn on high bit */ break; case I2CS_ISO8859_2: /* Latin-2 (Czech, Slovak) */ c = iso8859_2tab[c]; break; case I2CS_ISO8859_3: /* Latin-3 (Dutch, Turkish) */ c = iso8859_3tab[c]; break; case I2CS_ISO8859_4: /* Latin-4 (Scandinavian) */ c = iso8859_4tab[c]; break; case I2CS_ISO8859_5: /* Cyrillic */ c = iso8859_5tab[c]; break; case I2CS_ISO8859_6: /* Arabic */ c = iso8859_6tab[c]; break; case I2CS_ISO8859_7: /* Greek */ c = iso8859_7tab[c]; break; case I2CS_ISO8859_8: /* Hebrew */ c = iso8859_8tab[c]; break; case I2CS_ISO8859_9: /* Latin-5 (Finnish, Portuguese) */ c = iso8859_9tab[c]; break; case I2CS_TIS620: /* Thai */ c = tis620tab[c]; break; case I2CS_ISO8859_10: /* Latin-6 (Northern Europe) */ c = iso8859_10ta
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -