⭐ 欢迎来到虫虫下载站! | 📦 资源下载 📁 资源专辑 ℹ️ 关于我们
⭐ 虫虫下载站

📄 utf8.c

📁 广泛使用的邮件服务器!同时
💻 C
📖 第 1 页 / 共 5 页
字号:
{  unsigned long i;  if (!script) return (SCRIPT *) &utf8_scvalid[0];  else if (*script && (strlen (script) < 128))    for (i = 0; utf8_scvalid[i].name; i++)      if (!compare_cstring (script,utf8_scvalid[i].name))	return (SCRIPT *) &utf8_scvalid[i];  return NIL;			/* failed */}/* Look up charset name or return entire table * Accepts: charset name or NIL * Returns: charset table entry or NIL if unknown */const CHARSET *utf8_charset (char *charset){  unsigned long i;  if (!charset) return (CHARSET *) &utf8_csvalid[0];  else if (*charset && (strlen (charset) < 128))    for (i = 0; utf8_csvalid[i].name; i++)      if (!compare_cstring (charset,utf8_csvalid[i].name))	return (CHARSET *) &utf8_csvalid[i];  return NIL;			/* failed */}/* Validate charset and generate error message if invalid * Accepts: bad character set * Returns: NIL if good charset, else error message string */#define BADCSS "[BADCHARSET ("#define BADCSE ")] Unknown charset: "char *utf8_badcharset (char *charset){  char *msg = NIL;  if (!utf8_charset (charset)) {    char *s,*t;    unsigned long i,j;				/* calculate size of header, trailer, and bad				 * charset plus charset names */    for (i = 0, j = sizeof (BADCSS) + sizeof (BADCSE) + strlen (charset) - 2;	 utf8_csvalid[i].name; i++)      j += strlen (utf8_csvalid[i].name) + 1;				/* not built right */    if (!i) fatal ("No valid charsets!");				/* header */    for (s = msg = (char *) fs_get (j), t = BADCSS; *t; *s++ = *t++);				/* each charset */    for (i = 0; utf8_csvalid[i].name; *s++ = ' ', i++)      for (t = utf8_csvalid[i].name; *t; *s++ = *t++);				/* back over last space, trailer */    for (t = BADCSE, --s; *t; *s++ = *t++);				/* finally bogus charset */    for (t = charset; *t; *s++ = *t++);    *s++ = '\0';		/* finally tie off string */    if (s != (msg + j)) fatal ("charset msg botch");  }  return msg;}/* Convert charset labelled sized text to UTF-8 * Accepts: source sized text *	    charset *	    pointer to returned sized text if non-NIL *	    flags * Returns: T if successful, NIL if failure */long utf8_text (SIZEDTEXT *text,char *charset,SIZEDTEXT *ret,long flags){  ucs4cn_t cv = (flags & U8T_CASECANON) ? ucs4_titlecase : NIL;  ucs4de_t de = (flags & U8T_DECOMPOSE) ? ucs4_decompose_recursive : NIL;  const CHARSET *cs = (charset && *charset) ?    utf8_charset (charset) : utf8_infercharset (text);  if (cs) return (text && ret) ? utf8_text_cs (text,cs,ret,cv,de) : LONGT;  if (ret) {			/* no conversion possible */    ret->data = text->data;	/* so return source */    ret->size = text->size;  }  return NIL;			/* failure */}/* Operations used in converting data */#define UTF8_COUNT_BMP(count,c,cv,de) {		\  void *more = NIL;				\  if (cv) c = (*cv) (c);			\  if (de) c = (*de) (c,&more);			\  do count += UTF8_SIZE_BMP(c);			\  while (more && (c = (*de) (U8G_ERROR,&more)));\}#define UTF8_WRITE_BMP(b,c,cv,de) {		\  void *more = NIL;				\  if (cv) c = (*cv) (c);			\  if (de) c = (*de) (c,&more);			\  do UTF8_PUT_BMP (b,c)				\  while (more && (c = (*de) (U8G_ERROR,&more)));\}#define UTF8_COUNT(count,c,cv,de) {		\  void *more = NIL;				\  if (cv) c = (*cv) (c);			\  if (de) c = (*de) (c,&more);			\  do count += utf8_size (c);			\  while (more && (c = (*de) (U8G_ERROR,&more)));\}#define UTF8_WRITE(b,c,cv,de) {			\  void *more = NIL;				\  if (cv) c = (*cv) (c);			\  if (de) c = (*de) (c,&more);			\  do b = utf8_put (b,c);			\  while (more && (c = (*de) (U8G_ERROR,&more)));\}/* Convert sized text to UTF-8 given CHARSET block * Accepts: source sized text *	    CHARSET block *	    pointer to returned sized text  *	    canonicalization function *	    decomposition function * Returns: T if successful, NIL if failure */long utf8_text_cs (SIZEDTEXT *text,const CHARSET *cs,SIZEDTEXT *ret,		   ucs4cn_t cv,ucs4de_t de){  ret->data = text->data;	/* default to source */  ret->size = text->size;  switch (cs->type) {		/* convert if type known */  case CT_ASCII:		/* 7-bit ASCII no table */  case CT_UTF8:			/* variable UTF-8 encoded Unicode no table */    if (cv || de) utf8_text_utf8 (text,ret,cv,de);    break;  case CT_1BYTE0:		/* 1 byte no table */    utf8_text_1byte0 (text,ret,cv,de);    break;  case CT_1BYTE:		/* 1 byte ASCII + table 0x80-0xff */    utf8_text_1byte (text,ret,cs->tab,cv,de);    break;  case CT_1BYTE8:		/* 1 byte table 0x00 - 0xff */    utf8_text_1byte8 (text,ret,cs->tab,cv,de);    break;  case CT_EUC:			/* 2 byte ASCII + utf8_eucparam base/CS2/CS3 */    utf8_text_euc (text,ret,cs->tab,cv,de);    break;  case CT_DBYTE:		/* 2 byte ASCII + utf8_eucparam */    utf8_text_dbyte (text,ret,cs->tab,cv,de);    break;  case CT_DBYTE2:		/* 2 byte ASCII + utf8_eucparam plane1/2 */    utf8_text_dbyte2 (text,ret,cs->tab,cv,de);    break;  case CT_UTF7:			/* variable UTF-7 encoded Unicode no table */    utf8_text_utf7 (text,ret,cv,de);    break;  case CT_UCS2:			/* 2 byte 16-bit Unicode no table */    utf8_text_ucs2 (text,ret,cv,de);    break;  case CT_UCS4:			/* 4 byte 32-bit Unicode no table */    utf8_text_ucs4 (text,ret,cv,de);    break;  case CT_UTF16:		/* variable UTF-16 encoded Unicode no table */    utf8_text_utf16 (text,ret,cv,de);    break;  case CT_2022:			/* variable ISO-2022 encoded no table*/    utf8_text_2022 (text,ret,cv,de);    break;  case CT_SJIS:			/* 2 byte Shift-JIS encoded JIS no table */    utf8_text_sjis (text,ret,cv,de);    break;  default:			/* unknown character set type */    return NIL;  }  return LONGT;			/* return success */}/* Reverse mapping routines * * These routines only support character sets, not all possible charsets.  In * particular, they do not support any Unicode encodings or ISO 2022. * * As a special dispensation, utf8_cstext() and utf8_cstocstext() support * support ISO-2022-JP if EUC-JP can be reverse mapped; and utf8_rmaptext() * will generated ISO-2022-JP using an EUC-JP rmap if flagged to do so. * * No attempt is made to map "equivalent" Unicode characters or Unicode * characters that have the same glyph; nor is there any attempt to handle * combining characters or otherwise do any stringprep.  Maybe later. *//* Convert UTF-8 sized text to charset * Accepts: source sized text *	    destination charset *	    pointer to returned sized text *	    substitute character if not in cs, else NIL to return failure * Returns: T if successful, NIL if failure */long utf8_cstext (SIZEDTEXT *text,char *charset,SIZEDTEXT *ret,		  unsigned long errch){  short iso2022jp = !compare_cstring (charset,"ISO-2022-JP");  unsigned short *rmap = utf8_rmap (iso2022jp ? "EUC-JP" : charset);  return rmap ? utf8_rmaptext (text,rmap,ret,errch,iso2022jp) : NIL;}/* Convert charset labelled sized text to another charset * Accepts: source sized text *	    source charset *	    pointer to returned sized text *	    destination charset *	    substitute character if not in dest cs, else NIL to return failure * Returns: T if successful, NIL if failure * * This routine has the same restricts as utf8_cstext(). */long utf8_cstocstext (SIZEDTEXT *src,char *sc,SIZEDTEXT *dst,char *dc,		      unsigned long errch){  SIZEDTEXT utf8;  const CHARSET *scs,*dcs;  unsigned short *rmap;  long ret = NIL;  long iso2022jp;				/* lookup charsets and reverse map */  if ((dc && (dcs = utf8_charset (dc))) &&      (rmap = (iso2022jp = ((dcs->type == CT_2022) &&			    !compare_cstring (dcs->name,"ISO-2022-JP"))) ?       utf8_rmap ("EUC-JP") : utf8_rmap_cs (dcs)) &&      (scs = (sc && *sc) ? utf8_charset (sc) : utf8_infercharset (src))) {				/* init temporary buffer */    memset (&utf8,NIL,sizeof (SIZEDTEXT));				/* source cs equivalent to dest cs? */    if ((scs->type == dcs->type) && (scs->tab == dcs->tab)) {      dst->data = src->data;	/* yes, just copy pointers */      dst->size = src->size;      ret = LONGT;    }				/* otherwise do the conversion */    else ret = (utf8_text_cs (src,scs,&utf8,NIL,NIL) &&		utf8_rmaptext (&utf8,rmap,dst,errch,iso2022jp));				/* flush temporary buffer */    if (utf8.data && (utf8.data != src->data) && (utf8.data != dst->data))      fs_give ((void **) &utf8.data);  }  return ret;}/* Cached rmap */static const CHARSET *currmapcs = NIL;static unsigned short *currmap = NIL;/* Cache and return map for UTF-8 -> character set * Accepts: character set name * Returns: cached map if character set found, else NIL */unsigned short *utf8_rmap (char *charset){  return (currmapcs && !compare_cstring (charset,currmapcs->name)) ? currmap :    utf8_rmap_cs (utf8_charset (charset));}/* Cache and return map for UTF-8 -> character set given CHARSET block * Accepts: CHARSET block * Returns: cached map if character set found, else NIL */unsigned short *utf8_rmap_cs (const CHARSET *cs){  unsigned short *ret = NIL;  if (!cs);			/* have charset? */  else if (cs == currmapcs) ret = currmap;  else if (ret = utf8_rmap_gen (cs,currmap)) {    currmapcs = cs;    currmap = ret;  }  return ret;}/* Return map for UTF-8 -> character set given CHARSET block * Accepts: CHARSET block *	    old map to recycle * Returns: map if character set found, else NIL */unsigned short *utf8_rmap_gen (const CHARSET *cs,unsigned short *oldmap){  unsigned short u,*tab,*rmap;  unsigned int i,m,ku,ten;  struct utf8_eucparam *param,*p2;  switch (cs->type) {		/* is a character set? */  case CT_ASCII:		/* 7-bit ASCII no table */  case CT_1BYTE0:		/* 1 byte no table */  case CT_1BYTE:		/* 1 byte ASCII + table 0x80-0xff */  case CT_1BYTE8:		/* 1 byte table 0x00 - 0xff */  case CT_EUC:			/* 2 byte ASCII + utf8_eucparam base/CS2/CS3 */  case CT_DBYTE:		/* 2 byte ASCII + utf8_eucparam */  case CT_DBYTE2:		/* 2 byte ASCII + utf8_eucparam plane1/2 */  case CT_SJIS:			/* 2 byte Shift-JIS */    rmap = oldmap ? oldmap :	/* recycle old map if supplied else make new */      (unsigned short *) fs_get (65536 * sizeof (unsigned short));				/* initialize table for ASCII */    for (i = 0; i < 128; i++) rmap[i] = (unsigned short) i;				/* populate remainder of table with NOCHAR */#define NOCHARBYTE (NOCHAR & 0xff)#if NOCHAR - ((NOCHARBYTE << 8) | NOCHARBYTE)    while (i < 65536) rmap[i++] = NOCHAR;#else    memset (rmap + 128,NOCHARBYTE,(65536 - 128) * sizeof (unsigned short));#endif    break;  default:			/* unsupported charset type */    rmap = NIL;			/* no map possible */  }  if (rmap) {			/* have a map? */    switch (cs->type) {		/* additional reverse map actions */    case CT_1BYTE0:		/* 1 byte no table */      for (i = 128; i < 256; i++) rmap[i] = (unsigned short) i;      break;    case CT_1BYTE:		/* 1 byte ASCII + table 0x80-0xff */      for (tab = (unsigned short *) cs->tab,i = 128; i < 256; i++)	if (tab[i & BITS7] != UBOGON) rmap[tab[i & BITS7]] = (unsigned short)i;      break;    case CT_1BYTE8:		/* 1 byte table 0x00 - 0xff */      for (tab = (unsigned short *) cs->tab,i = 0; i < 256; i++)	if (tab[i] != UBOGON) rmap[tab[i]] = (unsigned short) i;      break;    case CT_EUC:		/* 2 byte ASCII + utf8_eucparam base/CS2/CS3 */      for (param = (struct utf8_eucparam *) cs->tab,	     tab = (unsigned short *) param->tab, ku = 0;	   ku < param->max_ku; ku++)	for (ten = 0; ten < param->max_ten; ten++)	  if ((u = tab[(ku * param->max_ten) + ten]) != UBOGON)	    rmap[u] = ((ku + param->base_ku) << 8) +	      (ten + param->base_ten) + 0x8080;      break;    case CT_DBYTE:		/* 2 byte ASCII + utf8_eucparam */      for (param = (struct utf8_eucparam *) cs->tab,	     tab = (unsigned short *) param->tab, ku = 0;	   ku < param->max_ku; ku++)	for (ten = 0; ten < param->max_ten; ten++)	  if ((u = tab[(ku * param->max_ten) + ten]) != UBOGON)	    rmap[u] = ((ku + param->base_ku) << 8) + (ten + param->base_ten);      break;    case CT_DBYTE2:		/* 2 byte ASCII + utf8_eucparam plane1/2 */      param = (struct utf8_eucparam *) cs->tab;      p2 = param + 1;		/* plane 2 parameters */				/* only ten parameters should differ */      if ((param->base_ku != p2->base_ku) || (param->max_ku != p2->max_ku))	fatal ("ku definition error for CT_DBYTE2 charset");				/* total codepoints in each ku */      m = param->max_ten + p2->max_ten;      tab = (unsigned short *) param->tab;      for (ku = 0; ku < param->max_ku; ku++) {	for (ten = 0; ten < param->max_ten; ten++)	  if ((u = tab[(ku * m) + ten]) != UBOGON)	    rmap[u] = ((ku + param->base_ku) << 8) + (ten + param->base_ten);	for (ten = 0; ten < p2->max_ten; ten++)	  if ((u = tab[(ku * m) + param->max_ten + ten]) != UBOGON)	    rmap[u] = ((ku + param->base_ku) << 8) + (ten + p2->base_ten);      }      break;    case CT_SJIS:		/* 2 byte Shift-JIS */      for (ku = 0; ku < MAX_JIS0208_KU; ku++)

⌨️ 快捷键说明

复制代码 Ctrl + C
搜索代码 Ctrl + F
全屏模式 F11
切换主题 Ctrl + Shift + D
显示快捷键 ?
增大字号 Ctrl + =
减小字号 Ctrl + -