📄 utf8.c
字号:
/* * Program: UTF-8 routines * * Author: Mark Crispin * Networks and Distributed Computing * Computing & Communications * University of Washington * Administration Building, AG-44 * Seattle, WA 98195 * Internet: MRC@CAC.Washington.EDU * * Date: 11 June 1997 * Last Edited: 16 October 2000 * * Copyright 2000 by the University of Washington * * Permission to use, copy, modify, and distribute this software and its * documentation for any purpose and without fee is hereby granted, provided * that the above copyright notices appear in all copies and that both the * above copyright notices and this permission notice appear in supporting * documentation, and that the name of the University of Washington not be * used in advertising or publicity pertaining to distribution of the software * without specific, written prior permission. This software is made * available "as is", and * THE UNIVERSITY OF WASHINGTON DISCLAIMS ALL WARRANTIES, EXPRESS OR IMPLIED, * WITH REGARD TO THIS SOFTWARE, INCLUDING WITHOUT LIMITATION ALL IMPLIED * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE, AND IN * NO EVENT SHALL THE UNIVERSITY OF WASHINGTON BE LIABLE FOR ANY SPECIAL, * INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES WHATSOEVER RESULTING * FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION OF CONTRACT, TORT * (INCLUDING NEGLIGENCE) OR STRICT LIABILITY, ARISING OUT OF OR IN * CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. * */#include <stdio.h>#include <ctype.h>#include "mail.h"#include "osdep.h"#include "misc.h"#include "rfc822.h"#include "utf8.h"/* *** IMPORTANT *** * * There is a very important difference between "character set" and "charset", * and the comments in this file reflect these differences. A "character set" * (also known as "coded character set") is a mapping between codepoints and * characters. A "charset" is as defined in MIME, and incorporates one or more * coded character sets in a character encoding scheme. See RFC 2130 for more * details. *//* Character set conversion tables */#include "iso_8859.c" /* 8-bit single-byte coded graphic */#include "koi8_r.c" /* Cyrillic - Russia */#include "koi8_u.c" /* Cyrillic - Ukraine */#include "tis_620.c" /* Thai */#include "viscii.c" /* Vietnamese */#include "windows.c" /* Windows */#include "gb_2312.c" /* Chinese (PRC) - simplified */#include "gb_12345.c" /* Chinese (PRC) - traditional */#include "jis_0208.c" /* Japanese - basic */#include "jis_0212.c" /* Japanese - supplementary */#include "ksc_5601.c" /* Korean */#include "big5.c" /* Taiwanese (ROC) - industrial standard */#include "cns11643.c" /* Taiwanese (ROC) - national standard *//* EUC parameters */#ifdef GBTOUNICODE /* PRC simplified Chinese */static const struct utf8_eucparam gb_param[] = { {BASE_GB2312_KU,BASE_GB2312_TEN,MAX_GB2312_KU,MAX_GB2312_TEN, (void *) gb2312tab}, {0,0,0,0,NIL}, {0,0,0,0,NIL},};#endif#ifdef GB12345TOUNICODE /* PRC traditional Chinese */static const struct utf8_eucparam gbt_param[] = { {BASE_GB12345_KU,BASE_GB12345_TEN,MAX_GB12345_KU,MAX_GB12345_TEN, (void *) gb12345tab}, {0,0,0,0,NIL}, {0,0,0,0,NIL}};#endif#ifdef BIG5TOUNICODE /* ROC traditional Chinese */static const struct utf8_eucparam big5_param[] = { {BASE_BIG5_KU,BASE_BIG5_TEN_0,MAX_BIG5_KU,MAX_BIG5_TEN_0,(void *) big5tab}, {BASE_BIG5_KU,BASE_BIG5_TEN_1,MAX_BIG5_KU,MAX_BIG5_TEN_1,NIL}};#endif#ifdef JISTOUNICODE /* Japanese */static const struct utf8_eucparam jis_param[] = { {BASE_JIS0208_KU,BASE_JIS0208_TEN,MAX_JIS0208_KU,MAX_JIS0208_TEN, (void *) jis0208tab}, {MIN_KANA_8,0,MAX_KANA_8,0,(void *) KANA_8},#ifdef JIS0212TOUNICODE /* Japanese extended */ {BASE_JIS0212_KU,BASE_JIS0212_TEN,MAX_JIS0212_KU,MAX_JIS0212_TEN, (void *) jis0212tab}#else {0,0,0,0,NIL}#endif};#endif#ifdef KSCTOUNICODE /* Korean */static const struct utf8_eucparam ksc_param = { BASE_KSC5601_KU,BASE_KSC5601_TEN,MAX_KSC5601_KU,MAX_KSC5601_TEN,(void *) ksc5601tab};#endif/* List of supported charsets (note: all names must be uppercase!) */static const struct utf8_csent utf8_csvalid[] = { {"US-ASCII",NIL,NIL,NIL,NIL}, {"UTF-8",NIL,NIL,SC_UNICODE,NIL}, {"UTF-7",utf8_text_utf7,NIL,SC_UNICODE,"UTF-8"}, {"ISO-8859-1",utf8_text_8859_1,NIL,SC_LATIN_1,NIL}, {"ISO-8859-2",utf8_text_1byte,(void *) iso8859_2tab,SC_LATIN_2,NIL}, {"ISO-8859-3",utf8_text_1byte,(void *) iso8859_3tab,SC_LATIN_3,NIL}, {"ISO-8859-4",utf8_text_1byte,(void *) iso8859_4tab,SC_LATIN_4,NIL}, {"ISO-8859-5",utf8_text_1byte,(void *) iso8859_5tab,SC_CYRILLIC,"KOI-8"}, {"ISO-8859-6",utf8_text_1byte,(void *) iso8859_6tab,SC_ARABIC,NIL}, {"ISO-8859-7",utf8_text_1byte,(void *) iso8859_7tab,SC_GREEK,NIL}, {"ISO-8859-8",utf8_text_1byte,(void *) iso8859_8tab,SC_HEBREW,NIL}, {"ISO-8859-9",utf8_text_1byte,(void *) iso8859_9tab,SC_LATIN_5,NIL}, {"ISO-8859-10",utf8_text_1byte,(void *) iso8859_10tab,SC_LATIN_6,NIL}, {"ISO-8859-11",utf8_text_1byte,(void *) iso8859_11tab,SC_THAI,NIL},#if 0 /* ISO 8859-12 reserved for ISCII(?) */ {"ISO-8859-12",utf8_text_1byte,(void *) iso8859_12tab,NIL,NIL},#endif {"ISO-8859-13",utf8_text_1byte,(void *) iso8859_13tab,SC_LATIN_7,NIL}, {"ISO-8859-14",utf8_text_1byte,(void *) iso8859_14tab,SC_LATIN_8,NIL}, {"ISO-8859-15",utf8_text_1byte,(void *) iso8859_15tab,SC_LATIN_9,NIL}, {"KOI8-R",utf8_text_1byte,(void *) koi8rtab,SC_CYRILLIC,NIL}, {"KOI8-U",utf8_text_1byte,(void *) koi8utab,SC_CYRILLIC | SC_UKRANIAN,NIL}, {"KOI8-RU",utf8_text_1byte,(void *) koi8utab,SC_CYRILLIC | SC_UKRANIAN, "KOI8-U"}, {"TIS-620",utf8_text_1byte,(void *) tis620tab,SC_THAI,NIL}, {"VISCII",utf8_text_1byte8,(void *) visciitab,SC_VIETNAMESE,NIL},#ifdef GBTOUNICODE {"GB2312",utf8_text_euc,(void *) gb_param,SC_CHINESE_SIMPLIFIED,NIL}, {"CN-GB",utf8_text_euc,(void *) gb_param,SC_CHINESE_SIMPLIFIED,"GB2312"},#ifdef CNS1TOUNICODE {"ISO-2022-CN",utf8_text_2022,NIL, SC_CHINESE_SIMPLIFIED | SC_CHINESE_TRADITIONAL,NIL},#endif#endif#ifdef GB12345TOUNICODE {"CN-GB-12345",utf8_text_euc,(void *) gbt_param,SC_CHINESE_TRADITIONAL,NIL},#endif#ifdef BIG5TOUNICODE {"BIG5",utf8_text_dbyte2,(void *) big5_param,SC_CHINESE_TRADITIONAL,NIL}, {"CN-BIG5",utf8_text_dbyte2,(void *) big5_param,SC_CHINESE_TRADITIONAL, "BIG5"},#endif#ifdef JISTOUNICODE {"ISO-2022-JP",utf8_text_2022,NIL,SC_JAPANESE,NIL}, {"EUC-JP",utf8_text_euc,(void *) jis_param,SC_JAPANESE,"ISO-2022-JP"}, {"SHIFT_JIS",utf8_text_sjis,NIL,SC_JAPANESE,"ISO-2022-JP"}, {"SHIFT-JIS",utf8_text_sjis,NIL,SC_JAPANESE,"ISO-2022-JP"},#ifdef JIS0212TOUNICODE {"ISO-2022-JP-1",utf8_text_2022,NIL,SC_JAPANESE,"ISO-2022-JP"},#ifdef GBTOUNICODE#ifdef KSCTOUNICODE {"ISO-2022-JP-2",utf8_text_2022,NIL, SC_LATIN_1 | SC_LATIN_2 | SC_LATIN_3 | SC_LATIN_4 | SC_LATIN_5 | SC_LATIN_6 | SC_LATIN_7 | SC_LATIN_8 | SC_LATIN_9 | SC_ARABIC | SC_CYRILLIC | SC_GREEK | SC_HEBREW | SC_THAI | SC_VIETNAMESE | SC_CHINESE_TRADITIONAL | SC_JAPANESE | SC_KOREAN#ifdef CNS1TOUNICODE | SC_CHINESE_TRADITIONAL#endif ,"UTF-8"},#endif#endif#endif#endif#ifdef KSCTOUNICODE {"ISO-2022-KR",utf8_text_2022,NIL,SC_KOREAN,NIL}, {"EUC-KR",utf8_text_dbyte,(void *) &ksc_param,SC_KOREAN,NIL}, {"KS_C_5601-1987",utf8_text_dbyte,(void *) &ksc_param,SC_KOREAN,NIL}, {"KS_C_5601-1992",utf8_text_dbyte,(void *) &ksc_param,SC_KOREAN,NIL},#endif /* deep sigh */ {"WINDOWS-874",utf8_text_1byte,(void *) windows_874tab,SC_THAI,NIL}, {"WINDOWS-1250",utf8_text_1byte,(void *) windows_1250tab,SC_LATIN_2,NIL}, {"WINDOWS-1251",utf8_text_1byte,(void *) windows_1251tab,SC_CYRILLIC,NIL}, {"WINDOWS-1252",utf8_text_1byte,(void *) windows_1252tab,SC_LATIN_1,NIL}, {"WINDOWS-1253",utf8_text_1byte,(void *) windows_1253tab,SC_GREEK,NIL}, {"WINDOWS-1254",utf8_text_1byte,(void *) windows_1254tab,SC_LATIN_5,NIL}, {"WINDOWS-1255",utf8_text_1byte,(void *) windows_1255tab,SC_HEBREW,NIL}, {"WINDOWS-1256",utf8_text_1byte,(void *) windows_1256tab,SC_ARABIC,NIL}, {"WINDOWS-1257",utf8_text_1byte,(void *) windows_1257tab,SC_LATIN_7,NIL}, {"WINDOWS-1258",utf8_text_1byte,(void *) windows_1258tab,SC_VIETNAMESE,NIL}, NIL};/* Convert charset labelled sized text to UTF-8 * Accepts: source sized text * charset * pointer to returned sized text if non-NIL * flags (currently non-zero if want error for unknown charset) * Returns: T if successful, NIL if failure */long utf8_text (SIZEDTEXT *text,char *charset,SIZEDTEXT *ret,long flags){ unsigned long i; char *t,tmp[MAILTMPLEN]; if (ret) { /* default is to just return identity */ ret->data = text->data; ret->size = text->size; } if (!charset || !*charset) { /* missing charset? */ if (ret && (text->size > 2)) for (i = 0; i < text->size - 1; i++) { /* special hack for untagged ISO-2022 */ if ((text->data[i] == '\033') && (text->data[i+1] == '$')) { utf8_text_2022 (text,ret,NIL); break; } /* special hack for "just send 8" cretins */ else if (text->data[i] & BIT8) { utf8_text_8859_1 (text,ret,NIL); break; } } return LONGT; } if (strlen (charset) < 128) /* otherwise look for charset */ for (i = 0, ucase (strcpy (tmp,charset)); utf8_csvalid[i].name; i++) if (!strcmp (tmp,utf8_csvalid[i].name)) { if (ret && utf8_csvalid[i].dsp) (*utf8_csvalid[i].dsp) (text,ret,utf8_csvalid[i].tab); return LONGT; /* success */ } if (flags) { /* charset not found */ strcpy (tmp,"[BADCHARSET ("); for (i = 0, t = tmp + strlen (tmp); utf8_csvalid[i].name; i++,t += strlen (t)) sprintf (t,"%s ",utf8_csvalid[i].name); sprintf (t + strlen (t) - 1,")] Unknown charset: %.80s",charset); mm_log (tmp,ERROR); } return NIL; /* failed */}/* Convert ISO-8859-1 sized text to UTF-8 * Accepts: source sized text * pointer to returned sized text * conversion table */void utf8_text_8859_1 (SIZEDTEXT *text,SIZEDTEXT *ret,void *tab){ unsigned long i; unsigned char *s; unsigned int c; for (ret->size = i = 0; i < text->size; ret->size += (text->data[i++] & BIT8) ? 2 : 1); s = ret->data = (unsigned char *) fs_get (ret->size + 1); for (i = 0; i < text->size;) { if ((c = text->data[i++]) & BIT8) { *s++ = 0xc0 | ((c >> 6) & 0x3f); *s++ = BIT8 | (c & 0x3f); } else *s++ = c; /* ASCII character */ }}/* Convert single byte ASCII+8bit character set sized text to UTF-8 * Accepts: source sized text * pointer to return sized text * conversion table */void utf8_text_1byte (SIZEDTEXT *text,SIZEDTEXT *ret,void *tab){ unsigned long i; unsigned char *s; unsigned int c; unsigned short *tbl = (unsigned short *) tab; for (ret->size = i = 0; i < text->size; ret->size += UTF8_SIZE (c)) if ((c = text->data[i++]) & BIT8) c = tbl[c & BITS7]; s = ret->data = (unsigned char *) fs_get (ret->size + 1); for (i = 0; i < text->size;) { if ((c = text->data[i++]) & BIT8) c = tbl[c & BITS7]; UTF8_PUT (s,c) /* convert Unicode to UTF-8 */ }}/* Convert single byte 8bit character set sized text to UTF-8 * Accepts: source sized text * pointer to return sized text * conversion table */void utf8_text_1byte8 (SIZEDTEXT *text,SIZEDTEXT *ret,void *tab){ unsigned long i; unsigned char *s; unsigned int c; unsigned short *tbl = (unsigned short *) tab; for (ret->size = i = 0; i < text->size; ret->size += UTF8_SIZE (c)) c = tbl[text->data[i++]]; s = ret->data = (unsigned char *) fs_get (ret->size + 1); for (i = 0; i < text->size;) { c = tbl[text->data[i++]]; UTF8_PUT (s,c) /* convert Unicode to UTF-8 */ }}/* Convert EUC sized text to UTF-8 * Accepts: source sized text * pointer to return sized text * EUC parameter table */void utf8_text_euc (SIZEDTEXT *text,SIZEDTEXT *ret,void *tab){ unsigned long i; unsigned char *s; unsigned int pass,c,c1,ku,ten; struct utf8_eucparam *p1 = (struct utf8_eucparam *) tab; struct utf8_eucparam *p2 = p1 + 1; struct utf8_eucparam *p3 = p1 + 2; unsigned short *t1 = (unsigned short *) p1->tab; unsigned short *t2 = (unsigned short *) p2->tab; unsigned short *t3 = (unsigned short *) p3->tab; for (pass = 0,s = NIL,ret->size = 0; pass <= 1; pass++) { for (i = 0; i < text->size;) { /* not CS0? */ if ((c = text->data[i++]) & BIT8) { /* yes, must have another high byte */ if ((i >= text->size) || !((c1 = text->data[i++]) & BIT8)) c = BOGON; /* out of space or bogon */ else switch (c) { /* check 8bit code set */ case EUC_CS2: /* CS2 */ if (p2->base_ku) { /* CS2 set up? */ if (p2->base_ten) /* yes, multibyte? */ c = ((i < text->size) && ((c = text->data[i++]) & BIT8) && ((ku = (c1 & BITS7) - p2->base_ku) < p2->max_ku) && ((ten = (c & BITS7) - p2->base_ten) < p2->max_ten)) ? t2[(ku*p2->max_ten) + ten] : BOGON; else c = ((c1 >= p2->base_ku) && (c1 <= p2->max_ku)) ? c1 + ((unsigned int) p2->tab) : BOGON; }
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -