📄 utf8.c

📁 广泛使用的邮件服务器！同时
💻 C
📖 第 1 页 / 共 5 页
字号:
	for (ten = 0; ten < MAX_JIS0208_TEN; ten++)	  if ((u = jis0208tab[ku][ten]) != UBOGON) {	    int sku = ku + BASE_JIS0208_KU;	    int sten = ten + BASE_JIS0208_TEN;	    rmap[u] = ((((sku + 1) >> 1) + ((sku < 95) ? 112 : 176)) << 8) +	      sten + ((sku % 2) ? ((sten > 95) ? 32 : 31) : 126);	  }				/* JIS Roman */      rmap[UCS2_YEN] = JISROMAN_YEN;      rmap[UCS2_OVERLINE] = JISROMAN_OVERLINE;				/* JIS hankaku katakana */      for (u = 0; u < (MAX_KANA_8 - MIN_KANA_8); u++)	rmap[UCS2_KATAKANA + u] = MIN_KANA_8 + u;      break;    }				/* hack: map NBSP to SP if otherwise no map */    if (rmap[0x00a0] == NOCHAR) rmap[0x00a0] = rmap[0x0020];  }  return rmap;			/* return map */}/* Convert UTF-8 sized text to charset using rmap * Accepts: source sized text *	    conversion rmap *	    pointer to returned sized text *	    substitute character if not in rmap, else NIL to return failure *	    ISO-2022-JP conversion flag * Returns T if successful, NIL if failure * * This routine doesn't try to convert to all possible charsets; in particular * it doesn't support other Unicode encodings or any ISO 2022 other than * ISO-2022-JP. 
*/

long utf8_rmaptext (SIZEDTEXT *text,unsigned short *rmap,SIZEDTEXT *ret,
		    unsigned long errch,long iso2022jp)
{
  unsigned char *src,*dst;
  unsigned long left,ucs,out;
				/* octets needed, including trailing NUL */
  unsigned long need = utf8_rmapsize (text,rmap,errch,iso2022jp);
  if (!need) {			/* sizing pass rejected the text */
    ret->data = NIL;
    ret->size = 0;
    return NIL;			/* failure */
  }
  src = text->data;
  dst = ret->data = (unsigned char *) fs_get (need);
  ret->size = need - 1;		/* reported size excludes the trailing NUL */
				/* any non-zero flag means "start in Roman" */
  if (iso2022jp) iso2022jp = 1;
  left = text->size;
  while (left) {		/* walk the source, dropping any BOM */
    ucs = utf8_get (&src,&left);
    if (ucs == UCS2_BOM) continue;
				/* non-BMP or unmapped: use substitute */
    if (ucs & U8GM_NONBMP) out = errch;
    else {
      out = rmap[ucs];
      if (out == NOCHAR) out = errch;
    }
    switch (iso2022jp) {	/* 0 = plain, 1 = Roman, 2 = JIS */
    case 0:			/* no ISO 2022 shifting at all */
				/* high octet first when value needs two */
      if (out > 0xff) *dst++ = (unsigned char) (out >> 8);
      *dst++ = (unsigned char) (out & 0xff);
      break;
    case 1:			/* currently in ISO 2022 Roman */
      if (out >= 0x80) {	/* need JIS: ESC $ B <hi> <lo> */
	*dst++ = I2C_ESC;
	*dst++ = I2C_MULTI;
	*dst++ = I2CS_94x94_JIS_NEW;
	*dst++ = (unsigned char) ((out >> 8) & 0x7f);
	*dst++ = (unsigned char) (out & 0x7f);
	iso2022jp = 2;		/* now in JIS */
      }
      else *dst++ = (unsigned char) out;
      break;
    case 2:			/* currently in ISO 2022 JIS */
      if (out <= 0x7f) {	/* need Roman: ESC ( J <ch> */
	*dst++ = I2C_ESC;
	*dst++ = I2C_G0_94;
	*dst++ = I2CS_94_JIS_ROMAN;
	*dst++ = (unsigned char) out;
	iso2022jp = 1;		/* now in Roman */
      }
      else {			/* stay in JIS: <hi> <lo> */
	*dst++ = (unsigned char) ((out >> 8) & 0x7f);
	*dst++ = (unsigned char) (out & 0x7f);
      }
      break;
    }
  }
  if (iso2022jp == 2) {		/* text may not end in JIS: emit ESC ( J */
    *dst++ = I2C_ESC;
    *dst++ = I2C_G0_94;
    *dst++ = I2CS_94_JIS_ROMAN;
  }
  *dst = NIL;			/* terminate the returned data */
  return LONGT;			/* success */
}

/* Calculate size of conversion
of UTF-8 sized text to charset using rmap * Accepts: source sized text *	    conversion rmap *	    pointer to returned sized text *	    substitute character if not in rmap, else NIL to return failure *	    ISO-2022-JP conversion flag * Returns size+1 if successful, NIL if failure * * This routine doesn't try to handle to all possible charsets; in particular * it doesn't support other Unicode encodings or any ISO 2022 other than * ISO-2022-JP. */unsigned long utf8_rmapsize (SIZEDTEXT *text,unsigned short *rmap,			     unsigned long errch,long iso2022jp){  unsigned long i,u,c;  unsigned long ret = 1;	/* terminating NUL */  unsigned char *s = text->data;  if (iso2022jp) iso2022jp = 1;	/* start non-zero ISO-2022-JP state at 1 */  for (i = text->size; i;) if ((u = utf8_get (&s,&i)) != UCS2_BOM) {    if ((u & U8GM_NONBMP) || (((c = rmap[u]) == NOCHAR) && !(c = errch)))      return NIL;		/* not in BMP, or NOCHAR and no err char */    switch (iso2022jp) {	/* depends upon ISO 2022 mode */    case 0:			/* ISO 2022 not in effect */      ret += (c > 0xff) ? 
2 : 1;      break;    case 1:			/* ISO 2022 Roman */      if (c < 0x80) ret += 1;	/* <ch> */      else {			/* JIS character */	ret += 5;		/* ESC $ B <hi> <lo> */	iso2022jp = 2;		/* shift to ISO 2022 JIS */      }      break;    case 2:			/* ISO 2022 JIS */      if (c > 0x7f) ret += 2;	/* <hi> <lo> */      else {			/* ASCII character */	ret += 4;		/* ESC ( J <ch> */	iso2022jp = 1;		/* shift to ISO 2022 Roman */      }      break;    }  }  if (iso2022jp == 2) {		/* ISO-2022-JP string must end in Roman */    ret += 3;			/* ESC ( J */    iso2022jp = 1;		/* reset state to Roman */  }  return ret;}/* Convert UCS-4 to charset using rmap * Accepts: source UCS-4 character(s) *	    numver of UCS-4 characters *	    conversion rmap *	    pointer to returned sized text *	    substitute character if not in rmap, else NIL to return failure * Returns T if successful, NIL if failure * * Currently only supports BMP characters, and does not support ISO-2022 */long ucs4_rmaptext (unsigned long *ucs4,unsigned long len,unsigned short *rmap,		    SIZEDTEXT *ret,unsigned long errch){  long size = ucs4_rmaplen (ucs4,len,rmap,errch);  return (size >= 0) ?		/* build in newly-created buffer */    ucs4_rmapbuf (ret->data = (unsigned char *) fs_get ((ret->size = size) +1),		  ucs4,len,rmap,errch) : NIL;}/* Return size of UCS-4 string converted to other CS via rmap * Accepts: source UCS-4 character(s) *	    numver of UCS-4 characters *	    conversion rmap *	    substitute character if not in rmap, else NIL to return failure * Returns: length if success, negative if failure (no-convert) */long ucs4_rmaplen (unsigned long *ucs4,unsigned long len,unsigned short *rmap,		   unsigned long errch){  long ret;  unsigned long i,u,c;				/* count non-BOM characters */  for (ret = 0,i = 0; i < len; ++i) if ((u = ucs4[i]) != UCS2_BOM) {    if ((u & U8GM_NONBMP) || (((c = rmap[u]) == NOCHAR) && !(c = errch)))      return -1;		/* not in BMP, or NOCHAR and no err char? */    ret += (c > 0xff) ? 
2 : 1;  }  return ret;}/* Stuff buffer with UCS-4 string converted to other CS via rmap * Accepts: destination buffer *	    source UCS-4 character(s) *	    number of UCS-4 characters *	    conversion rmap *	    substitute character if not in rmap, else NIL to return failure * Returns: T, always */long ucs4_rmapbuf (unsigned char *t,unsigned long *ucs4,unsigned long len,		   unsigned short *rmap,unsigned long errch){  unsigned long i,u,c;				/* convert non-BOM characters */  for (i = 0; i < len; ++i) if ((u = ucs4[i]) != UCS2_BOM) {				/* substitute error character for NOCHAR */    if ((u & U8GM_NONBMP) || ((c = rmap[u]) == NOCHAR)) c = errch;				/* two-byte character? */    if (c > 0xff) *t++ = (unsigned char) (c >> 8);				/* single-byte or low-byte of two-byte */    *t++ = (unsigned char) (c & 0xff);  }  *t++ = NIL;			/* tie off returned data */  return LONGT;}/* Return UCS-4 Unicode character from UTF-8 string * Accepts: pointer to string *	    remaining octets in string * Returns: UCS-4 character with pointer and count updated *	    or error code with pointer and count unchanged */unsigned long utf8_get (unsigned char **s,unsigned long *i){  unsigned char *t = *s;  unsigned long j = *i;				/* decode raw UTF-8 string */  unsigned long ret = utf8_get_raw (&t,&j);  if (ret & U8G_ERROR);		/* invalid raw UTF-8 decoding? */				/* no, is it surrogate? */  else if ((ret >= UTF16_SURR) && (ret <= UTF16_MAXSURR)) ret = U8G_SURROGA;				/* or in non-Unicode ISO 10646 space? 
*/  else if (ret > UCS4_MAXUNICODE) ret = U8G_NOTUNIC;  else {    *s = t;			/* all is well, update pointer */    *i = j;			/* and counter */  }  return ret;			/* return value */}/* Return raw (including non-Unicode) UCS-4 character from UTF-8 string * Accepts: pointer to string *	    remaining octets in string * Returns: UCS-4 character with pointer and count updated *	    or error code with pointer and count unchanged */unsigned long utf8_get_raw (unsigned char **s,unsigned long *i){  unsigned char c,c1;  unsigned char *t = *s;  unsigned long j = *i;  unsigned long ret = U8G_NOTUTF8;  int more = 0;  do {				/* make sure have source octets available */    if (!j--) return more ? U8G_ENDSTRI : U8G_ENDSTRG;				/* UTF-8 continuation? */    else if (((c = *t++) > 0x7f) && (c < 0xc0)) {				/* continuation when not in progress */      if (!more) return U8G_BADCONT;      --more;			/* found a continuation octet */      ret <<= 6;		/* shift current value by 6 bits */      ret |= c & 0x3f;		/* merge continuation octet */    }				/* incomplete UTF-8 character */    else if (more) return U8G_INCMPLT;    else {			/* start of sequence */      c1 = j ? 
*t : 0xbf;	/* assume valid continuation if incomplete */      if (c < 0x80) ret = c;	/* U+0000 - U+007f */      else if (c < 0xc2);	/* c0 and c1 never valid */      else if (c < 0xe0) {	/* U+0080 - U+07ff */	if (c &= 0x1f) more = 1;      }      else if (c < 0xf0) {	/* U+0800 - U+ffff */	if ((c &= 0x0f) || (c1 >= 0xa0)) more = 2;      }      else if (c < 0xf8) {	/* U+10000 - U+10ffff (and 110000 - 1fffff) */	if ((c &= 0x07) || (c1 >= 0x90)) more = 3;      }      else if (c < 0xfc) {	/* ISO 10646 200000 - 3ffffff */	if ((c &= 0x03) || (c1 >= 0x88)) more = 4;      }      else if (c < 0xfe) {	/* ISO 10646 4000000 - 7fffffff */	if ((c &= 0x01) || (c1 >= 0x84)) more = 5;      }				/* fe and ff never valid */      if (more) {		/* multi-octet, make sure more to come */	if (!j) return U8G_ENDSTRI;	ret = c;		/* continuation needed, save start bits */      }    }  } while (more);  if (!(ret & U8G_ERROR)) {	/* success return? */    *s = t;			/* yes, update pointer */    *i = j;			/* and counter */  }  return ret;			/* return value */}/* Return UCS-4 character from named charset string * Accepts: charset *	    pointer to string *	    remaining octets in string * Returns: UCS-4 character with pointer and count updated, negative if error * * Error codes are the same as utf8_get(). */unsigned long ucs4_cs_get (CHARSET *cs,unsigned char **s,unsigned long *i){  unsigned char c,c1,ku,ten;  unsigned long ret,d;  unsigned char *t = *s;  unsigned long j = *i;  struct utf8_eucparam *p1,*p2,*p3;  if (j--) c = *t++;		/* get first octet */  else return U8G_ENDSTRG;	/* empty string */  switch (cs->type) {		/* convert if type known */  case CT_UTF8:			/* variable UTF-8 encoded Unicode no table */    return utf8_get (s,i);  case CT_ASCII:		/* 7-bit ASCII no table */    if (c >= 0x80) return U8G_NOTUTF8;  case CT_1BYTE0:		/* 1 byte no table */    ret = c;			/* identity */    break;  case CT_1BYTE:		/* 1 byte ASCII + table 0x80-0xff */    ret = (c > 0x80) ? 
((unsigned short *) cs->tab)[c & BITS7] : c;    break;  case CT_1BYTE8:		/* 1 byte table 0x00 - 0xff */    ret = ((unsigned short *) cs->tab)[c];    break;  case CT_EUC:			/* 2 byte ASCII + utf8_eucparam base/CS2/CS3 */    if (c & BIT8) {      p1 = (struct utf8_eucparam *) cs->tab;      p2 = p1 + 1;      p3 = p1 + 2;      if (j--) c1 = *t++;	/* get second octet */      else return U8G_ENDSTRI;      if (!(c1 & BIT8)) return U8G_NOTUTF8;      switch (c) {		/* check 8bit code set */      case EUC_CS2:		/* CS2 */	if (p2->base_ku) {	/* CS2 set up? */	  if (p2->base_ten) {	/* yes, multibyte? */	    if (j--) c = *t++;	/* get second octet */	    else return U8G_ENDSTRI;	    if ((c & BIT8) &&		((ku = (c1 & BITS7) - p2->base_ku) < p2->max_ku) &&		((ten = (c & BITS7) - p2->base_ten) < p2->max_ten)) {	      ret = ((unsigned short *) p2->tab)[(ku*p2->max_ten) + ten];	      break;	    }	  }	  else if ((c1 >= p2->base_ku) && (c1 < p2->max_ku)) {	    ret = c1 + ((unsigned long) p2->tab);	    break;	  }	}	return U8G_NOTUTF8;	/* CS2 not set up or bogus */      case EUC_CS3:		/* CS3 */	if (p3->base_ku) {	/* CS3 set up? */	  if (p3->base_ten) {	/* yes, multibyte? */	    if (j--) c = *t++;	/* get second octet */	    else return U8G_ENDSTRI;	    if ((c & BIT8) &&		((ku = (c1 & BITS7) - p3->base_ku) < p3->max_ku) &&		((ten = (c & BITS7) - p3->base_ten) < p3->max_ten)) {	      ret = ((unsigned short *) p3->tab)[(ku*p3->max_ten) + ten];	      break;	    }	  }	  else if ((c1 >= p3->base_ku) && (c1 < p3->max_ku)) {	    ret = c1 + ((unsigned long) p3->tab);	    break;	  }	}	return U8G_NOTUTF8;	/* CS3 not set up or bogus */
💿 文件大小 1945 K
👤 上传用户 anniesprite
📂 所属分类 Linux/Unix编程
🏷️ 相关标签

#服务器
⌨️ 快捷键说明

复制代码 Ctrl + C
搜索代码 Ctrl + F
全屏模式 F11
切换主题 Ctrl + Shift + D
显示快捷键 ?
增大字号 Ctrl + =
减小字号 Ctrl + -