📄 utf8.c
字号:
/* ======================================================================== * Copyright 1988-2008 University of Washington * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. * You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * * ======================================================================== *//* * Program: UTF-8 routines * * Author: Mark Crispin * Networks and Distributed Computing * Computing & Communications * University of Washington * Administration Building, AG-44 * Seattle, WA 98195 * Internet: MRC@CAC.Washington.EDU * * Date: 11 June 1997 * Last Edited: 17 January 2008 */#include <stdio.h>#include <ctype.h>#include "c-client.h"/* *** IMPORTANT *** * * There is a very important difference between "character set" and "charset", * and the comments in this file reflect these differences. A "character set" * (also known as "coded character set") is a mapping between codepoints and * characters. A "charset" is as defined in MIME, and incorporates one or more * coded character sets in a character encoding scheme. See RFC 2130 for more * details. *//* Character set conversion tables */#include "iso_8859.c" /* 8-bit single-byte coded graphic */#include "koi8_r.c" /* Cyrillic - Russia */#include "koi8_u.c" /* Cyrillic - Ukraine */#include "tis_620.c" /* Thai */#include "viscii.c" /* Vietnamese */#include "windows.c" /* Windows */#include "ibm.c" /* IBM */#include "gb_2312.c" /* Chinese (PRC) - simplified */#include "gb_12345.c" /* Chinese (PRC) - traditional */#include "jis_0208.c" /* Japanese - basic */#include "jis_0212.c" /* Japanese - supplementary */#include "ksc_5601.c" /* Korean */#include "big5.c" /* Taiwanese (ROC) - industrial standard */#include "cns11643.c" /* Taiwanese (ROC) - national standard */#include "widths.c" /* Unicode character widths */#include "tmap.c" /* Unicode titlecase mapping */#include "decomtab.c" /* Unicode decomposions *//* EUC parameters */#ifdef GBTOUNICODE /* PRC simplified Chinese */static const struct utf8_eucparam gb_param = { BASE_GB2312_KU,BASE_GB2312_TEN,MAX_GB2312_KU,MAX_GB2312_TEN, (void *) gb2312tab};#endif#ifdef GB12345TOUNICODE /* PRC traditional Chinese */static const struct utf8_eucparam gbt_param = { BASE_GB12345_KU,BASE_GB12345_TEN,MAX_GB12345_KU,MAX_GB12345_TEN, (void *) gb12345tab};#endif#ifdef BIG5TOUNICODE /* ROC traditional Chinese */static const struct utf8_eucparam big5_param[] = { {BASE_BIG5_KU,BASE_BIG5_TEN_0,MAX_BIG5_KU,MAX_BIG5_TEN_0,(void *) big5tab}, {BASE_BIG5_KU,BASE_BIG5_TEN_1,MAX_BIG5_KU,MAX_BIG5_TEN_1,NIL}};#endif#ifdef JISTOUNICODE /* Japanese */static const struct utf8_eucparam jis_param[] = { {BASE_JIS0208_KU,BASE_JIS0208_TEN,MAX_JIS0208_KU,MAX_JIS0208_TEN, (void *) jis0208tab}, {MIN_KANA_8,0,MAX_KANA_8,0,(void *) KANA_8},#ifdef JIS0212TOUNICODE /* Japanese extended */ {BASE_JIS0212_KU,BASE_JIS0212_TEN,MAX_JIS0212_KU,MAX_JIS0212_TEN, (void *) jis0212tab}#else {0,0,0,0,NIL}#endif};#endif#ifdef KSCTOUNICODE /* Korean */static const struct utf8_eucparam ksc_param = { BASE_KSC5601_KU,BASE_KSC5601_TEN,MAX_KSC5601_KU,MAX_KSC5601_TEN, (void *) ksc5601tab};#endif/* List of supported charsets */static const CHARSET utf8_csvalid[] = { {"US-ASCII",CT_ASCII,CF_PRIMARY | CF_DISPLAY | CF_POSTING, NIL,NIL,NIL}, {"UTF-8",CT_UTF8,CF_PRIMARY | CF_DISPLAY | CF_POSTING, NIL,SC_UNICODE,NIL}, {"UTF-7",CT_UTF7,CF_PRIMARY | CF_POSTING | CF_UNSUPRT, NIL,SC_UNICODE,"UTF-8"}, {"ISO-8859-1",CT_1BYTE0,CF_PRIMARY | CF_DISPLAY | CF_POSTING, NIL,SC_LATIN_1,NIL}, {"ISO-8859-2",CT_1BYTE,CF_PRIMARY | CF_DISPLAY | CF_POSTING, (void *) iso8859_2tab,SC_LATIN_2,NIL}, {"ISO-8859-3",CT_1BYTE,CF_PRIMARY | CF_DISPLAY | CF_POSTING, (void *) iso8859_3tab,SC_LATIN_3,NIL}, {"ISO-8859-4",CT_1BYTE,CF_PRIMARY | CF_DISPLAY | CF_POSTING, (void *) iso8859_4tab,SC_LATIN_4,NIL}, {"ISO-8859-5",CT_1BYTE,CF_PRIMARY | CF_DISPLAY | CF_POSTING, (void *) iso8859_5tab,SC_CYRILLIC,"KOI8-R"}, {"ISO-8859-6",CT_1BYTE,CF_PRIMARY | CF_DISPLAY | CF_POSTING, (void *) iso8859_6tab,SC_ARABIC,NIL}, {"ISO-8859-7",CT_1BYTE,CF_PRIMARY | CF_DISPLAY | CF_POSTING, (void *) iso8859_7tab,SC_GREEK,NIL}, {"ISO-8859-8",CT_1BYTE,CF_PRIMARY | CF_DISPLAY | CF_POSTING, (void *) iso8859_8tab,SC_HEBREW,NIL}, {"ISO-8859-9",CT_1BYTE,CF_PRIMARY | CF_DISPLAY | CF_POSTING, (void *) iso8859_9tab,SC_LATIN_5,NIL}, {"ISO-8859-10",CT_1BYTE,CF_PRIMARY | CF_DISPLAY | CF_POSTING, (void *) iso8859_10tab,SC_LATIN_6,NIL}, {"ISO-8859-11",CT_1BYTE,CF_PRIMARY | CF_DISPLAY | CF_POSTING, (void *) iso8859_11tab,SC_THAI,NIL},#if 0 /* ISO 8859-12 reserved for ISCII(?) */ {"ISO-8859-12",CT_1BYTE,CF_PRIMARY | CF_DISPLAY | CF_POSTING, (void *) iso8859_12tab,NIL,NIL},#endif {"ISO-8859-13",CT_1BYTE,CF_PRIMARY | CF_DISPLAY | CF_POSTING, (void *) iso8859_13tab,SC_LATIN_7,NIL}, {"ISO-8859-14",CT_1BYTE,CF_PRIMARY | CF_DISPLAY | CF_POSTING, (void *) iso8859_14tab,SC_LATIN_8,NIL}, {"ISO-8859-15",CT_1BYTE,CF_PRIMARY | CF_DISPLAY | CF_POSTING, (void *) iso8859_15tab,SC_LATIN_9,NIL}, {"ISO-8859-16",CT_1BYTE,CF_PRIMARY | CF_DISPLAY | CF_POSTING, (void *) iso8859_16tab,SC_LATIN_10,NIL}, {"KOI8-R",CT_1BYTE,CF_PRIMARY | CF_DISPLAY | CF_POSTING, (void *) koi8rtab,SC_CYRILLIC,NIL}, {"KOI8-U",CT_1BYTE,CF_PRIMARY | CF_DISPLAY | CF_POSTING, (void *) koi8utab,SC_CYRILLIC | SC_UKRANIAN,NIL}, {"KOI8-RU",CT_1BYTE,CF_DISPLAY, (void *) koi8utab,SC_CYRILLIC | SC_UKRANIAN,"KOI8-U"}, {"TIS-620",CT_1BYTE,CF_PRIMARY | CF_DISPLAY | CF_POSTING, (void *) tis620tab,SC_THAI,"ISO-8859-11"}, {"VISCII",CT_1BYTE8,CF_PRIMARY | CF_DISPLAY | CF_POSTING, (void *) visciitab,SC_VIETNAMESE,NIL},#ifdef GBTOUNICODE {"GBK",CT_DBYTE,CF_PRIMARY | CF_DISPLAY | CF_POSTING, (void *) &gb_param,SC_CHINESE_SIMPLIFIED,NIL}, {"GB2312",CT_DBYTE,CF_PRIMARY | CF_DISPLAY | CF_POSTING, (void *) &gb_param,SC_CHINESE_SIMPLIFIED,"GBK"}, {"CN-GB",CT_DBYTE,CF_DISPLAY, (void *) &gb_param,SC_CHINESE_SIMPLIFIED,"GBK"},#ifdef CNS1TOUNICODE {"ISO-2022-CN",CT_2022,CF_PRIMARY | CF_UNSUPRT, NIL,SC_CHINESE_SIMPLIFIED | SC_CHINESE_TRADITIONAL, NIL},#endif#endif#ifdef GB12345TOUNICODE {"CN-GB-12345",CT_DBYTE,CF_PRIMARY | CF_DISPLAY, (void *) &gbt_param,SC_CHINESE_TRADITIONAL,"BIG5"},#endif#ifdef BIG5TOUNICODE {"BIG5",CT_DBYTE2,CF_PRIMARY | CF_DISPLAY | CF_POSTING, (void *) big5_param,SC_CHINESE_TRADITIONAL,NIL}, {"CN-BIG5",CT_DBYTE2,CF_DISPLAY, (void *) big5_param,SC_CHINESE_TRADITIONAL,"BIG5"}, {"BIG-5",CT_DBYTE2,CF_DISPLAY, (void *) big5_param,SC_CHINESE_TRADITIONAL,"BIG5"},#endif#ifdef JISTOUNICODE {"ISO-2022-JP",CT_2022,CF_PRIMARY | CF_DISPLAY | CF_POSTING, NIL,SC_JAPANESE,NIL}, {"EUC-JP",CT_EUC,CF_PRIMARY | CF_DISPLAY, (void *) jis_param,SC_JAPANESE,"ISO-2022-JP"}, {"SHIFT_JIS",CT_SJIS,CF_PRIMARY | CF_DISPLAY, NIL,SC_JAPANESE,"ISO-2022-JP"}, {"SHIFT-JIS",CT_SJIS,CF_PRIMARY | CF_DISPLAY, NIL,SC_JAPANESE,"ISO-2022-JP"},#ifdef JIS0212TOUNICODE {"ISO-2022-JP-1",CT_2022,CF_UNSUPRT, NIL,SC_JAPANESE,"ISO-2022-JP"},#ifdef GBTOUNICODE#ifdef KSCTOUNICODE {"ISO-2022-JP-2",CT_2022,CF_UNSUPRT, NIL, SC_LATIN_1 | SC_LATIN_2 | SC_LATIN_3 | SC_LATIN_4 | SC_LATIN_5 | SC_LATIN_6 | SC_LATIN_7 | SC_LATIN_8 | SC_LATIN_9 | SC_LATIN_10 | SC_ARABIC | SC_CYRILLIC | SC_GREEK | SC_HEBREW | SC_THAI | SC_VIETNAMESE | SC_CHINESE_SIMPLIFIED | SC_JAPANESE | SC_KOREAN#ifdef CNS1TOUNICODE | SC_CHINESE_TRADITIONAL#endif ,"UTF-8"},#endif#endif#endif#endif#ifdef KSCTOUNICODE {"ISO-2022-KR",CT_2022,CF_PRIMARY | CF_DISPLAY | CF_UNSUPRT, NIL,SC_KOREAN,"EUC-KR"}, {"EUC-KR",CT_DBYTE,CF_PRIMARY | CF_DISPLAY | CF_POSTING, (void *) &ksc_param,SC_KOREAN,NIL}, {"KSC5601",CT_DBYTE,CF_PRIMARY | CF_DISPLAY, (void *) &ksc_param,SC_KOREAN,"EUC-KR"}, {"KSC_5601",CT_DBYTE,CF_PRIMARY | CF_DISPLAY, (void *) &ksc_param,SC_KOREAN,"EUC-KR"}, {"KS_C_5601-1987",CT_DBYTE,CF_DISPLAY, (void *) &ksc_param,SC_KOREAN,"EUC-KR"}, {"KS_C_5601-1989",CT_DBYTE,CF_DISPLAY, (void *) &ksc_param,SC_KOREAN,"EUC-KR"}, {"KS_C_5601-1992",CT_DBYTE,CF_DISPLAY, (void *) &ksc_param,SC_KOREAN,"EUC-KR"}, {"KS_C_5601-1997",CT_DBYTE,CF_DISPLAY, (void *) &ksc_param,SC_KOREAN,"EUC-KR"},#endif /* deep sigh */ {"WINDOWS-874",CT_1BYTE,CF_PRIMARY | CF_DISPLAY, (void *) windows_874tab,SC_THAI,"ISO-8859-11"}, {"CP874",CT_1BYTE,CF_DISPLAY, (void *) windows_874tab,SC_THAI,"ISO-8859-11"},#ifdef GBTOUNICODE {"WINDOWS-936",CT_DBYTE,CF_PRIMARY | CF_DISPLAY, (void *) &gb_param,SC_CHINESE_SIMPLIFIED,"GBK"}, {"CP936",CT_DBYTE,CF_DISPLAY, (void *) &gb_param,SC_CHINESE_SIMPLIFIED,"GBK"},#endif#ifdef KSCTOUNICODE {"WINDOWS-949",CT_DBYTE,CF_PRIMARY | CF_DISPLAY, (void *) &ksc_param,SC_KOREAN,"EUC-KR"}, {"CP949",CT_DBYTE,CF_DISPLAY, (void *) &ksc_param,SC_KOREAN,"EUC-KR"}, {"X-WINDOWS-949",CT_DBYTE,CF_PRIMARY | CF_DISPLAY, (void *) &ksc_param,SC_KOREAN,"EUC-KR"},#endif {"WINDOWS-1250",CT_1BYTE,CF_PRIMARY | CF_DISPLAY, (void *) windows_1250tab,SC_LATIN_2,"ISO-8859-2"}, {"CP1250",CT_1BYTE,CF_DISPLAY, (void *) windows_1250tab,SC_LATIN_2,"ISO-8859-2"}, {"WINDOWS-1251",CT_1BYTE,CF_PRIMARY | CF_DISPLAY | CF_POSTING, (void *) windows_1251tab,SC_CYRILLIC,"KOI8-R"}, {"CP1251",CT_1BYTE,CF_DISPLAY, (void *) windows_1251tab,SC_CYRILLIC,"KOI8-R"}, {"WINDOWS-1252",CT_1BYTE,CF_PRIMARY | CF_DISPLAY, (void *) windows_1252tab,SC_LATIN_1,"ISO-8859-1"}, {"CP1252",CT_1BYTE,CF_DISPLAY, (void *) windows_1252tab,SC_LATIN_1,"ISO-8859-1"}, {"WINDOWS-1253",CT_1BYTE,CF_PRIMARY | CF_DISPLAY, (void *) windows_1253tab,SC_GREEK,"ISO-8859-7"}, {"CP1253",CT_1BYTE,CF_DISPLAY, (void *) windows_1253tab,SC_GREEK,"ISO-8859-7"}, {"WINDOWS-1254",CT_1BYTE,CF_PRIMARY | CF_DISPLAY, (void *) windows_1254tab,SC_LATIN_5,"ISO-8859-9"}, {"CP1254",CT_1BYTE,CF_DISPLAY, (void *) windows_1254tab,SC_LATIN_5,"ISO-8859-9"}, {"WINDOWS-1255",CT_1BYTE,CF_PRIMARY | CF_DISPLAY, (void *) windows_1255tab,SC_HEBREW,"ISO-8859-8"}, {"CP1255",CT_1BYTE,CF_DISPLAY, (void *) windows_1255tab,SC_HEBREW,"ISO-8859-8"}, {"WINDOWS-1256",CT_1BYTE,CF_PRIMARY | CF_DISPLAY, (void *) windows_1256tab,SC_ARABIC,"ISO-8859-6"}, {"CP1256",CT_1BYTE,CF_DISPLAY, (void *) windows_1256tab,SC_ARABIC,"ISO-8859-6"}, {"WINDOWS-1257",CT_1BYTE,CF_PRIMARY | CF_DISPLAY, (void *) windows_1257tab,SC_LATIN_7,"ISO-8859-13"}, {"CP1257",CT_1BYTE,CF_DISPLAY, (void *) windows_1257tab,SC_LATIN_7,"ISO-8859-13"}, {"WINDOWS-1258",CT_1BYTE,CF_PRIMARY | CF_DISPLAY, (void *) windows_1258tab,SC_VIETNAMESE,"VISCII"}, {"CP1258",CT_1BYTE,CF_DISPLAY, (void *) windows_1258tab,SC_VIETNAMESE,"VISCII"}, /* deeper sigh */ {"IBM367",CT_ASCII,CF_PRIMARY | CF_DISPLAY, NIL,NIL,"US-ASCII"}, {"IBM437",CT_1BYTE,CF_PRIMARY | CF_DISPLAY, (void *) ibm_437tab,SC_LATIN_1,"ISO-8859-1"}, {"IBM737",CT_1BYTE,CF_PRIMARY | CF_DISPLAY, (void *) ibm_737tab,SC_GREEK,"ISO-8859-7"}, {"IBM775",CT_1BYTE,CF_PRIMARY | CF_DISPLAY, (void *) ibm_775tab,SC_LATIN_7,"ISO-8859-13"}, {"IBM850",CT_1BYTE,CF_PRIMARY | CF_DISPLAY, (void *) ibm_850tab,SC_LATIN_1,"ISO-8859-1"}, {"IBM852",CT_1BYTE,CF_PRIMARY | CF_DISPLAY, (void *) ibm_852tab,SC_LATIN_2,"ISO-8859-2"}, {"IBM855",CT_1BYTE,CF_PRIMARY | CF_DISPLAY, (void *) ibm_855tab,SC_CYRILLIC,"ISO-8859-5"}, {"IBM857",CT_1BYTE,CF_PRIMARY | CF_DISPLAY, (void *) ibm_857tab,SC_LATIN_5,"ISO-8859-9"}, {"IBM860",CT_1BYTE,CF_PRIMARY | CF_DISPLAY, (void *) ibm_860tab,SC_LATIN_1,"ISO-8859-1"}, {"IBM861",CT_1BYTE,CF_PRIMARY | CF_DISPLAY, (void *) ibm_861tab,SC_LATIN_6,"ISO-8859-10"}, {"IBM862",CT_1BYTE,CF_PRIMARY | CF_DISPLAY, (void *) ibm_862tab,SC_HEBREW,"ISO-8859-8"}, {"IBM863",CT_1BYTE,CF_PRIMARY | CF_DISPLAY, (void *) ibm_863tab,SC_LATIN_1,"ISO-8859-1"}, {"IBM864",CT_1BYTE,CF_PRIMARY | CF_DISPLAY, (void *) ibm_864tab,SC_ARABIC,"ISO-8859-6"}, {"IBM865",CT_1BYTE,CF_PRIMARY | CF_DISPLAY, (void *) ibm_865tab,SC_LATIN_6,"ISO-8859-10"}, {"IBM866",CT_1BYTE,CF_PRIMARY | CF_DISPLAY, (void *) ibm_866tab,SC_CYRILLIC,"KOI8-R"}, {"IBM869",CT_1BYTE,CF_PRIMARY | CF_DISPLAY, (void *) ibm_869tab,SC_GREEK,"ISO-8859-7"}, {"IBM874",CT_1BYTE,CF_PRIMARY | CF_DISPLAY, (void *) ibm_874tab,SC_THAI,"ISO-8859-11"}, /* deepest sigh */ {"ANSI_X3.4-1968",CT_ASCII,CF_DISPLAY, NIL,NIL,"US-ASCII"}, {"UNICODE-1-1-UTF-7",CT_UTF7,CF_UNSUPRT, NIL,SC_UNICODE,"UTF-8"}, /* these should never appear in email */ {"UCS-2",CT_UCS2,CF_PRIMARY | CF_DISPLAY | CF_NOEMAIL, NIL,SC_UNICODE,"UTF-8"}, {"UCS-4",CT_UCS4,CF_PRIMARY | CF_DISPLAY | CF_NOEMAIL, NIL,SC_UNICODE,"UTF-8"}, {"UTF-16",CT_UTF16,CF_PRIMARY | CF_DISPLAY | CF_NOEMAIL, NIL,SC_UNICODE,"UTF-8"}, NIL};/* Non-Unicode Script table */static const SCRIPT utf8_scvalid[] = { {"Arabic",NIL,SC_ARABIC}, {"Chinese Simplified","China, Singapore",SC_CHINESE_SIMPLIFIED}, {"Chinese Traditional","Taiwan, Hong Kong, Macao",SC_CHINESE_TRADITIONAL}, {"Cyrillic",NIL,SC_CYRILLIC}, {"Cyrillic Ukranian",NIL,SC_UKRANIAN}, {"Greek",NIL,SC_GREEK}, {"Hebrew",NIL,SC_HEBREW}, {"Japanese",NIL,SC_JAPANESE}, {"Korean",NIL,SC_KOREAN}, {"Latin-1","Western Europe",SC_LATIN_1}, {"Latin-2","Eastern Europe",SC_LATIN_2}, {"Latin-3","Southern Europe",SC_LATIN_3}, {"Latin-4","Northern Europe",SC_LATIN_4}, {"Latin-5","Turkish",SC_LATIN_5}, {"Latin-6","Nordic",SC_LATIN_6}, {"Latin-7","Baltic",SC_LATIN_7}, {"Latin-8","Celtic",SC_LATIN_8}, {"Latin-9","Euro",SC_LATIN_9}, {"Latin-10","Balkan",SC_LATIN_10}, {"Thai",NIL,SC_THAI}, {"Vietnamese",NIL,SC_VIETNAMESE}, NIL};/* Look up script name or return entire table * Accepts: script name or NIL * Returns: pointer to script table entry or NIL if unknown */SCRIPT *utf8_script (char *script)
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -