📄 utf8.c

📁 举世闻名的joe记事本源程序
💻 C
字号:
/* *	UTF-8 Utilities *	Copyright *		(C) 2004 Joseph H. Allen * *	This file is part of JOE (Joe's Own Editor) */#include "types.h"/* Under AmigaOS we have setlocale() but don't have langinfo.h and associated stuff, * so we have to disable the whole piece of code */#ifdef __amigaos#undef HAVE_SETLOCALE#endif/* Cygwin has CODESET, but it's crummy */#ifdef __CYGWIN__#undef HAVE_SETLOCALE#endif/* If it looks old, forget it */#ifndef CODESET#undef HAVE_SETLOCALE#endif#if defined(HAVE_LOCALE_H) && defined(HAVE_SETLOCALE)#	include <locale.h>#       include <langinfo.h>#endif/* nl_langinfo(CODESET) is broken on many systems.  If HAVE_SETLOCALE is undefined,   JOE uses a limited internal version instead *//* UTF-8 Encoder * * c is unicode character. * buf is 7 byte buffer- utf-8 coded character is written to this followed by a 0 termination. * returns length (not including terminator). */int utf8_encode(unsigned char *buf,int c){	if (c < 0x80) {		buf[0] = c;		buf[1] = 0;		return 1;	} else if(c < 0x800) {		buf[0] = (0xc0|(c>>6));		buf[1] = (0x80|(c&0x3F));		buf[2] = 0;		return 2;	} else if(c < 0x10000) {		buf[0] = (0xe0|(c>>12));		buf[1] = (0x80|((c>>6)&0x3f));		buf[2] = (0x80|((c)&0x3f));		buf[3] = 0;		return 3;	} else if(c < 0x200000) {		buf[0] = (0xf0|(c>>18));		buf[1] = (0x80|((c>>12)&0x3f));		buf[2] = (0x80|((c>>6)&0x3f));		buf[3] = (0x80|((c)&0x3f));		buf[4] = 0;		return 4;	} else if(c < 0x4000000) {		buf[0] = (0xf8|(c>>24));		buf[1] = (0x80|((c>>18)&0x3f));		buf[2] = (0x80|((c>>12)&0x3f));		buf[3] = (0x80|((c>>6)&0x3f));		buf[4] = (0x80|((c)&0x3f));		buf[5] = 0;		return 5;	} else {		buf[0] = (0xfC|(c>>30));		buf[1] = (0x80|((c>>24)&0x3f));		buf[2] = (0x80|((c>>18)&0x3f));		buf[3] = (0x80|((c>>12)&0x3f));		buf[4] = (0x80|((c>>6)&0x3f));		buf[5] = (0x80|((c)&0x3f));		buf[6] = 0;		return 6;	}}/* UTF-8 Decoder * * Returns 0 - 7FFFFFFF: decoded character *                   -1: character accepted, nothing decoded yet. *                   -2: incomplete sequence *                   -3: no sequence started, but character is between 128 - 191, 254 or 255 */int utf8_decode(struct utf8_sm *utf8_sm,unsigned char c){	if (utf8_sm->state) {		if ((c&0xC0)==0x80) {			utf8_sm->buf[utf8_sm->ptr++] = c;			--utf8_sm->state;			utf8_sm->accu = ((utf8_sm->accu<<6)|(c&0x3F));			if(!utf8_sm->state)				return utf8_sm->accu;		} else {			utf8_sm->state = 0;			return -2;		}	} else if ((c&0xE0)==0xC0) {		/* 192 - 223 */		utf8_sm->buf[0] = c;		utf8_sm->ptr = 1;		utf8_sm->state = 1;		utf8_sm->accu = (c&0x1F);	} else if ((c&0xF0)==0xE0) {		/* 224 - 239 */		utf8_sm->buf[0] = c;		utf8_sm->ptr = 1;		utf8_sm->state = 2;		utf8_sm->accu = (c&0x0F);	} else if ((c&0xF8)==0xF0) {		/* 240 - 247 */		utf8_sm->buf[0] = c;		utf8_sm->ptr = 1;		utf8_sm->state = 3;		utf8_sm->accu = (c&0x07);	} else if ((c&0xFC)==0xF8) {		/* 248 - 251 */		utf8_sm->buf[0] = c;		utf8_sm->ptr = 1;		utf8_sm->state = 4;		utf8_sm->accu = (c&0x03);	} else if ((c&0xFE)==0xFC) {		/* 252 - 253 */		utf8_sm->buf[0] = c;		utf8_sm->ptr = 1;		utf8_sm->state = 5;		utf8_sm->accu = (c&0x01);	} else if ((c&0x80)==0x00) {		/* 0 - 127 */		utf8_sm->buf[0] = c;		utf8_sm->ptr = 1;		utf8_sm->state = 0;		return c;	} else {		/* 128 - 191, 254, 255 */		utf8_sm->ptr = 0;		utf8_sm->state = 0;		return -3;	}	return -1;}/* Initialize state machine */void utf8_init(struct utf8_sm *utf8_sm){	utf8_sm->ptr = 0;	utf8_sm->state = 0;}/* Decode an entire string */int utf8_decode_string(unsigned char *s){	struct utf8_sm sm;	int x;	int c = -1;	utf8_init(&sm);	for(x=0;s[x];++x)		c = utf8_decode(&sm,s[x]);	return c;}/* Decode and advance * * Returns: 0 - 7FFFFFFF: decoded character *  -2: incomplete sequence *  -3: bad start of sequence found. * * p/plen are always advanced in such a way that repeated called to utf8_decode_fwrd do not cause * infinite loops. */int utf8_decode_fwrd(unsigned char **p,int *plen){	struct utf8_sm sm;	unsigned char *s = *p;	int len;	int c = -2; /* Return this on no more input. */	if (plen)		len = *plen;	else		len = -1;	utf8_init(&sm);	while (len) {		c = utf8_decode(&sm, *s);		if (c >= 0) {			/* We've got a character */			--len;			++s;			break;		} else if (c == -2) {			/* Bad sequence detected.  Caller should feed rest of string in again. */			break;		} else if (c == -3) {			/* Bad start of UTF-8 sequence.  We need to eat this char to avoid infinite loops. */			--len;			++s;			/* But we should tell the caller that something bad was found. */			break;		} else {			/* If c is -1, utf8_decode accepted the character, so we should get the next one. */			--len;			++s;		}	}	if (plen)		*plen = len;	*p = s;	return c;}/* For systems (BSD) with no nl_langinfo(CODESET) *//* * This is a quick-and-dirty emulator of the nl_langinfo(CODESET) * function defined in the Single Unix Specification for those systems * (FreeBSD, etc.) that don't have one yet. It behaves as if it had * been called after setlocale(LC_CTYPE, ""), that is it looks at * the locale environment variables. * * http://www.opengroup.org/onlinepubs/7908799/xsh/langinfo.h.html * * Please extend it as needed and suggest improvements to the author. * This emulator will hopefully become redundant soon as * nl_langinfo(CODESET) becomes more widely implemented. * * Since the proposed Li18nux encoding name registry is still not mature, * the output follows the MIME registry where possible: * *   http://www.iana.org/assignments/character-sets * * A possible autoconf test for the availability of nl_langinfo(CODESET) * can be found in * *   http://www.cl.cam.ac.uk/~mgk25/unicode.html#activate * * Markus.Kuhn@cl.cam.ac.uk -- 2002-03-11 * Permission to use, copy, modify, and distribute this software * for any purpose and without fee is hereby granted. The author * disclaims all warranties with regard to this software. * * Latest version: * *   http://www.cl.cam.ac.uk/~mgk25/ucs/langinfo.c */unsigned char *joe_getcodeset(unsigned char *l){  static unsigned char buf[16];  unsigned char *p;    if (l || ((l = (unsigned char *)getenv("LC_ALL"))   && *l) ||      ((l = (unsigned char *)getenv("LC_CTYPE")) && *l) ||      ((l = (unsigned char *)getenv("LANG"))     && *l)) {    /* check standardized locales */    if (!zcmp(l, USTR "C") || !zcmp(l, USTR "POSIX"))      return USTR "ascii";    /* check for encoding name fragment */    if (zstr(l, USTR "UTF") || zstr(l, USTR "utf"))      return USTR "UTF-8";    if ((p = zstr(l, USTR "8859-"))) {      memcpy((char *)buf, "ISO-8859-\0\0", 12);      p += 5;      if (*p >= '0' && *p <= '9') {	buf[9] = *p++;	if (*p >= '0' && *p <= '9') buf[10] = *p++;	return buf;      }    }    if (zstr(l, USTR "KOI8-R")) return USTR "KOI8-R";    if (zstr(l, USTR "KOI8-U")) return USTR "KOI8-U";    if (zstr(l, USTR "620")) return USTR "TIS-620";    if (zstr(l, USTR "2312")) return USTR "GB2312";    if (zstr(l, USTR "HKSCS")) return USTR "Big5HKSCS";   /* no MIME charset */    if (zstr(l, USTR "Big5") || zstr(l, USTR "BIG5")) return USTR "Big5";    if (zstr(l, USTR "GBK")) return USTR "GBK";           /* no MIME charset */    if (zstr(l, USTR "18030")) return USTR "GB18030";     /* no MIME charset */    if (zstr(l, USTR "Shift_JIS") || zstr(l, USTR "SJIS")) return USTR "Shift_JIS";    /* check for conclusive modifier */    if (zstr(l, USTR "euro")) return USTR "ISO-8859-15";    /* check for language (and perhaps country) codes */    if (zstr(l, USTR "zh_TW")) return USTR "Big5";    if (zstr(l, USTR "zh_HK")) return USTR "Big5HKSCS";   /* no MIME charset */    if (zstr(l, USTR "zh")) return USTR "GB2312";    if (zstr(l, USTR "ja")) return USTR "EUC-JP";    if (zstr(l, USTR "ko")) return USTR "EUC-KR";    if (zstr(l, USTR "ru")) return USTR "KOI8-R";    if (zstr(l, USTR "uk")) return USTR "KOI8-U";    if (zstr(l, USTR "pl") || zstr(l, USTR "hr") ||	zstr(l, USTR "hu") || zstr(l, USTR "cs") ||	zstr(l, USTR "sk") || zstr(l, USTR "sl")) return USTR "ISO-8859-2";    if (zstr(l, USTR "eo") || zstr(l, USTR "mt")) return USTR "ISO-8859-3";    if (zstr(l, USTR "el")) return USTR "ISO-8859-7";    if (zstr(l, USTR "he")) return USTR "ISO-8859-8";    if (zstr(l, USTR "tr")) return USTR "ISO-8859-9";    if (zstr(l, USTR "th")) return USTR "TIS-620";      /* or ISO-8859-11 */    if (zstr(l, USTR "lt")) return USTR "ISO-8859-13";    if (zstr(l, USTR "cy")) return USTR "ISO-8859-14";    if (zstr(l, USTR "ro")) return USTR "ISO-8859-2";   /* or ISO-8859-16 */    if (zstr(l, USTR "am") || zstr(l, USTR "vi")) return USTR "UTF-8";    /* Send me further rules if you like, but don't forget that we are     * *only* interested in locale naming conventions on platforms     * that do not already provide an nl_langinfo(CODESET) implementation. */    return USTR "ISO-8859-1"; /* should perhaps be "UTF-8" instead */  }  return USTR "ascii";}/* Initialize locale for JOE */unsigned char *codeset;	/* Codeset of terminal */unsigned char *non_utf8_codeset;			/* Codeset of local language non-UTF-8 */unsigned char *locale_lang;			/* Our local language */struct charmap *locale_map;			/* Character map of terminal, default map for new files */struct charmap *locale_map_non_utf8;			/* Old, non-utf8 version of locale */void joe_locale(){	unsigned char *s, *t, *u;	s=(unsigned char *)getenv("LC_ALL");	if (!s) {		s=(unsigned char *)getenv("LC_CTYPE");		if (!s) {			s=(unsigned char *)getenv("LANG");		}	}	if (s)		s=zdup(s);	else		s=USTR "ascii";	u = zdup(s);	if ((t=zrchr(s,'.')))		*t = 0;	locale_lang = s;#ifdef HAVE_SETLOCALE	setlocale(LC_ALL,(char *)s);	non_utf8_codeset = zdup((unsigned char *)nl_langinfo(CODESET));#else	non_utf8_codeset = joe_getcodeset(s);#endif	/* printf("joe_locale\n"); */#ifdef HAVE_SETLOCALE	/* printf("set_locale\n"); */	setlocale(LC_ALL,"");#ifdef ENABLE_NLS	/* Set up gettext() */	bindtextdomain(PACKAGE, LOCALEDIR);	textdomain(PACKAGE);	/* printf("%s %s %s\n",PACKAGE,LOCALEDIR,joe_gettext("New File")); */#endif	codeset = zdup((unsigned char *)nl_langinfo(CODESET));#else	codeset = joe_getcodeset(u);#endif	locale_map = find_charmap(codeset);	if (!locale_map)		locale_map = find_charmap(USTR "ascii");	locale_map_non_utf8 = find_charmap(non_utf8_codeset);	if (!locale_map_non_utf8)		locale_map_non_utf8 = find_charmap(USTR "ascii");	fdefault.charmap = locale_map;	pdefault.charmap = locale_map;/*	printf("Character set is %s\n",locale_map->name);	for(x=0;x!=128;++x)		printf("%x	space=%d blank=%d alpha=%d alnum=%d punct=%d print=%d\n",		       x,joe_isspace(locale_map,x), joe_isblank(locale_map,x), joe_isalpha_(locale_map,x),		       joe_isalnum_(locale_map,x), joe_ispunct(locale_map,x), joe_isprint(locale_map,x));*//* Use iconv() */#ifdef junk	to_utf = iconv_open("UTF-8", (char *)non_utf8_codeset);	from_utf = iconv_open((char *)non_utf8_codeset, "UTF-8");#endif/* For no locale support */#ifdef junk	locale_map = find_charmap("ascii");	fdefault.charmap = locale_map;	pdefault.charmap = locale_map;#endif	init_gettext(locale_lang);}void to_utf8(struct charmap *map,unsigned char *s,int c){	int d = to_uni(map,c);	if (d==-1)		utf8_encode(s,'?');	else		utf8_encode(s,d);#ifdef junk	/* Iconv() way */	unsigned char buf[10];	unsigned char *ibufp = buf;	unsigned char *obufp = s;	int ibuf_sz=1;	int obuf_sz= 10;	buf[0]=c;	buf[1]=0;	if (to_utf==(iconv_t)-1 ||	    iconv(to_utf,(char **)&ibufp,&ibuf_sz,(char **)&obufp,&obuf_sz)==(size_t)-1) {		s[0]='?';		s[1]=0;	} else {		*obufp = 0;	}#endif}int from_utf8(struct charmap *map,unsigned char *s){	int d = utf8_decode_string(s);	int c = from_uni(map,d);	if (c==-1)		return '?';	else		return c;#ifdef junk	/* Iconv() way */	int ibuf_sz=zlen(s);	unsigned char *ibufp=s;	int obuf_sz=10;	unsigned char obuf[10];	unsigned char *obufp = obuf;	if (from_utf==(iconv_t)-1 ||	    iconv(from_utf,(char **)&ibufp,&ibuf_sz,(char **)&obufp,&obuf_sz)==((size_t)-1))		return '?';	else		return obuf[0];#endif}void my_iconv(unsigned char *dest,struct charmap *dest_map,              unsigned char *src,struct charmap *src_map){	if (dest_map == src_map) {		zcpy (dest, src);		return;	}	if (src_map->type) {		/* src is UTF-8 */		if (dest_map->type) {			/* UTF-8 to UTF-8? */			zcpy (dest, src);		} else {			/* UTF-8 to non-UTF-8 */			while (*src) {				int len = -1;				int c = utf8_decode_fwrd(&src, &len);				if (c >= 0) {					int d = from_uni(dest_map, c);					if (d >= 0)						*dest++ = d;					else						*dest++ = '?';				} else					*dest++ = 'X';			}			*dest = 0;		}	} else {		/* src is not UTF-8 */		if (!dest_map->type) {			/* Non UTF-8 to non-UTF-8 */			while (*src) {				int c = to_uni(src_map, *src++);				int d;				if (c >= 0) {					d = from_uni(dest_map, c);					if (d >= 0)						*dest++ = d;					else						*dest++ = '?';				} else					*dest++ = '?';			}			*dest = 0;		} else {			/* Non-UTF-8 to UTF-8 */			while (*src) {				int c = to_uni(src_map, *src++);				if (c >= 0)					dest += utf8_encode(dest, c);				else					*dest++ = '?';			}			*dest = 0;		}	}}/* Guess character set */int guess_non_utf8;int guess_utf8;struct charmap *guess_map(unsigned char *buf, int len){	unsigned char *p;	int plen;	int c;	int flag;	/* No info? Use default */	if (!len || (!guess_non_utf8 && !guess_utf8))		return locale_map;	/* Does it look like UTF-8? */	p = buf;	plen = len;	c = 0;	flag = 0;	while (plen) {		/* Break if we could possibly run out of data in		   the middle of utf-8 sequence */		if (plen < 7)			break;		if (*p >= 128)			flag = 1;		c = utf8_decode_fwrd(&p, &plen);		if (c < 0)			break;	}	if (flag && c >= 0) {		/* There are characters above 128, and there are no utf-8 errors */		if (locale_map->type || !guess_utf8)			return locale_map;		else			return find_charmap(USTR "utf-8");	}	if (!flag || !guess_non_utf8) {		/* No characters above 128 */		return locale_map;	} else {		/* Choose non-utf8 version of locale */		return locale_map_non_utf8;	}}
⌨️ 快捷键说明

复制代码 Ctrl + C
搜索代码 Ctrl + F
全屏模式 F11
切换主题 Ctrl + Shift + D
显示快捷键 ?
增大字号 Ctrl + =
减小字号 Ctrl + -