⭐ 欢迎来到虫虫下载站! | 📦 资源下载 📁 资源专辑 ℹ️ 关于我们
⭐ 虫虫下载站

📄 utf.c

📁 这是一个同样来自贝尔实验室的和UNIX有着渊源的操作系统, 其简洁的设计和实现易于我们学习和理解
💻 C
字号:
#ifdef PLAN9#include	<u.h>#include	<libc.h>#include	<bio.h>#else#include	<sys/types.h>#include	<stdio.h>#include	<stdlib.h>#include	<string.h>#include	<unistd.h>#include	<errno.h>#include	"plan9.h"#endif#include	"hdr.h"/*	the our_* routines are implementations for the corresponding library	routines. for a while, i tried to actually name them wctomb etc	but stopped that after i found a system which made wchar_t an	unsigned char.*/int our_wctomb(char *s, unsigned long wc);int our_mbtowc(unsigned long *p, char *s, unsigned n);int runetoisoutf(char *str, Rune *rune);int fullisorune(char *str, int n);int isochartorune(Rune *rune, char *str);voidutf_in(int fd, long *notused, struct convert *out){	char buf[N];	int i, j, c, n, tot;	ulong l;	USED(notused);	tot = 0;	while((n = read(fd, buf+tot, N-tot)) >= 0){		tot += n;		for(i=j=0; i<=tot-UTFmax || (i<tot && (n==0 || fullrune(buf+i, tot-i))); ){			c = our_mbtowc(&l, buf+i, tot-i);			if(c == -1){				if(squawk)					EPR "%s: bad UTF sequence near byte %ld in input\n", argv0, ninput+i);				if(clean){					i++;					continue;				}				nerrors++;				l = Runeerror;				c = 1;			}			runes[j++] = l;			i += c;		}		OUT(out, runes, j);		tot -= i;		ninput += i;		if(tot)			memmove(buf, buf+i, tot);		if(n == 0)			break;	}	OUT(out, runes, 0);}voidutf_out(Rune *base, int n, long *notused){	char *p;	Rune *r;	USED(notused);	nrunes += n;	for(r = base, p = obuf; n-- > 0; r++){		p += our_wctomb(p, *r);	}	noutput += p-obuf;	write(1, obuf, p-obuf);}voidisoutf_in(int fd, long *notused, struct convert *out){	char buf[N];	int i, j, c, n, tot;	USED(notused);	tot = 0;	while((n = read(fd, buf+tot, N-tot)) >= 0){		tot += n;		for(i=j=0; i<tot; ){			if(!fullisorune(buf+i, tot-i))				break;			c = isochartorune(&runes[j], buf+i);			if(runes[j] == Runeerror && c == 1){				if(squawk)					EPR "%s: bad UTF sequence near byte %ld in input\n", argv0, ninput+i);				if(clean){					i++;					continue;				}				nerrors++;			}			j++;			i += c;		}		OUT(out, runes, j);		tot -= i;		ninput += i;		if(tot)			memmove(buf, buf+i, tot);		if(n == 0)			break;	}	OUT(out, runes, 0);}voidisoutf_out(Rune *base, int n, long *notused){	char *p;	Rune *r;	USED(notused);	nrunes += n;	for(r = base, p = obuf; n-- > 0; r++)		p += runetoisoutf(p, r);	noutput += p-obuf;	write(1, obuf, p-obuf);}enum{	Char1	= Runeself,	Rune1	= Runeself,	Char21	= 0xA1,		Rune21	= 0x0100,	Char22	= 0xF6,		Rune22	= 0x4016,	Char3	= 0xFC,		Rune3	= 0x10000,	/* really 0x38E2E */	Esc	= 0xBE,		Bad	= Runeerror};static	uchar	U[256];static	uchar	T[256];staticvoidmktable(void){	int i, u;	for(i=0; i<256; i++) {		u = i + (0x5E - 0xA0);		if(i < 0xA0)			u = i + (0xDF - 0x7F);		if(i < 0x7F)			u = i + (0x00 - 0x21);		if(i < 0x21)			u = i + (0xBE - 0x00);		U[i] = u;		T[u] = i;	}}intisochartorune(Rune *rune, char *str){	int c, c1, c2;	long l;	if(U[0] == 0)		mktable();	/*	 * one character sequence	 *	00000-0009F => 00-9F	 */	c = *(uchar*)str;	if(c < Char1) {		*rune = c;		return 1;	}	/*	 * two character sequence	 *	000A0-000FF => A0; A0-FF	 */	c1 = *(uchar*)(str+1);	if(c < Char21) {		if(c1 >= Rune1 && c1 < Rune21) {			*rune = c1;			return 2;		}		goto bad;	}	/*	 * two character sequence	 *	00100-04015 => A1-F5; 21-7E/A0-FF	 */	c1 = U[c1];	if(c1 >= Esc)		goto bad;	if(c < Char22) {		*rune =  (c-Char21)*Esc + c1 + Rune21;		return 2;	}	/*	 * three character sequence	 *	04016-38E2D => A6-FB; 21-7E/A0-FF	 */	c2 = U[*(uchar*)(str+2)];	if(c2 >= Esc)		goto bad;	if(c < Char3) {		l = (c-Char22)*Esc*Esc + c1*Esc + c2 + Rune22;		if(l >= Rune3)			goto bad;		*rune = l;		return 3;	}	/*	 * bad decoding	 */bad:	*rune = Bad;	return 1;}intrunetoisoutf(char *str, Rune *rune){	long c;	if(T[0] == 0)		mktable();	/*	 * one character sequence	 *	00000-0009F => 00-9F	 */	c = *rune;	if(c < Rune1) {		str[0] = c;		return 1;	}	/*	 * two character sequence	 *	000A0-000FF => A0; A0-FF	 */	if(c < Rune21) {		str[0] = Char1;		str[1] = c;		return 2;	}	/*	 * two character sequence	 *	00100-04015 => A1-F5; 21-7E/A0-FF	 */	if(c < Rune22) {		c -= Rune21;		str[0] = c/Esc + Char21;		str[1] = T[c%Esc];		return 2;	}	/*	 * three character sequence	 *	04016-38E2D => A6-FB; 21-7E/A0-FF	 */	c -= Rune22;	str[0] = c/(Esc*Esc) + Char22;	str[1] = T[c/Esc%Esc];	str[2] = T[c%Esc];	return 3;}intfullisorune(char *str, int n){	int c;	if(n > 0) {		c = *(uchar*)str;		if(c < Char1)			return 1;		if(n > 1)			if(c < Char22 || n > 2)				return 1;	}	return 0;}#ifdef PLAN9int	errno;#endifenum{	T1	= 0x00,	Tx	= 0x80,	T2	= 0xC0,	T3	= 0xE0,	T4	= 0xF0,	T5	= 0xF8,	T6	= 0xFC,	Bit1	= 7,	Bitx	= 6,	Bit2	= 5,	Bit3	= 4,	Bit4	= 3,	Bit5	= 2,	Bit6	= 2,	Mask1	= (1<<Bit1)-1,	Maskx	= (1<<Bitx)-1,	Mask2	= (1<<Bit2)-1,	Mask3	= (1<<Bit3)-1,	Mask4	= (1<<Bit4)-1,	Mask5	= (1<<Bit5)-1,	Mask6	= (1<<Bit6)-1,	Wchar1	= (1UL<<Bit1)-1,	Wchar2	= (1UL<<(Bit2+Bitx))-1,	Wchar3	= (1UL<<(Bit3+2*Bitx))-1,	Wchar4	= (1UL<<(Bit4+3*Bitx))-1,	Wchar5	= (1UL<<(Bit5+4*Bitx))-1,#ifndef	EILSEQ	EILSEQ	= 123,#endif /* EILSEQ */};intour_wctomb(char *s, unsigned long wc){	if(s == 0)		return 0;		/* no shift states */	if(wc & ~Wchar2) {		if(wc & ~Wchar4) {			if(wc & ~Wchar5) {				/* 6 bytes */				s[0] = T6 | ((wc >> 5*Bitx) & Mask6);				s[1] = Tx | ((wc >> 4*Bitx) & Maskx);				s[2] = Tx | ((wc >> 3*Bitx) & Maskx);				s[3] = Tx | ((wc >> 2*Bitx) & Maskx);				s[4] = Tx | ((wc >> 1*Bitx) & Maskx);				s[5] = Tx |  (wc & Maskx);				return 6;			}			/* 5 bytes */			s[0] = T5 |  (wc >> 4*Bitx);			s[1] = Tx | ((wc >> 3*Bitx) & Maskx);			s[2] = Tx | ((wc >> 2*Bitx) & Maskx);			s[3] = Tx | ((wc >> 1*Bitx) & Maskx);			s[4] = Tx |  (wc & Maskx);			return 5;		}		if(wc & ~Wchar3) {			/* 4 bytes */			s[0] = T4 |  (wc >> 3*Bitx);			s[1] = Tx | ((wc >> 2*Bitx) & Maskx);			s[2] = Tx | ((wc >> 1*Bitx) & Maskx);			s[3] = Tx |  (wc & Maskx);			return 4;		}		/* 3 bytes */		s[0] = T3 |  (wc >> 2*Bitx);		s[1] = Tx | ((wc >> 1*Bitx) & Maskx);		s[2] = Tx |  (wc & Maskx);		return 3;	}	if(wc & ~Wchar1) {		/* 2 bytes */		s[0] = T2 | (wc >> 1*Bitx);		s[1] = Tx | (wc & Maskx);		return 2;	}	/* 1 byte */	s[0] = T1 | wc;	return 1;}intour_mbtowc(unsigned long *p, char *s, unsigned n){	uchar *us;	int c0, c1, c2, c3, c4, c5;	unsigned long wc;	if(s == 0)		return 0;		/* no shift states */	if(n < 1)		goto bad;	us = (uchar*)s;	c0 = us[0];	if(c0 >= T3) {		if(n < 3)			goto bad;		c1 = us[1] ^ Tx;		c2 = us[2] ^ Tx;		if((c1|c2) & T2)			goto bad;		if(c0 >= T5) {			if(n < 5)				goto bad;			c3 = us[3] ^ Tx;			c4 = us[4] ^ Tx;			if((c3|c4) & T2)				goto bad;			if(c0 >= T6) {				/* 6 bytes */				if(n < 6)					goto bad;				c5 = us[5] ^ Tx;				if(c5 & T2)					goto bad;				wc = ((((((((((c0 & Mask6) << Bitx) |					c1) << Bitx) | c2) << Bitx) |					c3) << Bitx) | c4) << Bitx) | c5;				if(wc <= Wchar5)					goto bad;				*p = wc;				return 6;			}			/* 5 bytes */			wc = ((((((((c0 & Mask5) << Bitx) |				c1) << Bitx) | c2) << Bitx) |				c3) << Bitx) | c4;			if(wc <= Wchar4)				goto bad;			*p = wc;			return 5;		}		if(c0 >= T4) {			/* 4 bytes */			if(n < 4)				goto bad;			c3 = us[3] ^ Tx;			if(c3 & T2)				goto bad;			wc = ((((((c0 & Mask4) << Bitx) |				c1) << Bitx) | c2) << Bitx) |				c3;			if(wc <= Wchar3)				goto bad;			*p = wc;			return 4;		}		/* 3 bytes */		wc = ((((c0 & Mask3) << Bitx) |			c1) << Bitx) | c2;		if(wc <= Wchar2)			goto bad;		*p = wc;		return 3;	}	if(c0 >= T2) {		/* 2 bytes */		if(n < 2)			goto bad;		c1 = us[1] ^ Tx;		if(c1 & T2)			goto bad;		wc = ((c0 & Mask2) << Bitx) |			c1;		if(wc <= Wchar1)			goto bad;		*p = wc;		return 2;	}	/* 1 byte */	if(c0 >= Tx)		goto bad;	*p = c0;	return 1;bad:	errno = EILSEQ;	return -1;}

⌨️ 快捷键说明

复制代码 Ctrl + C
搜索代码 Ctrl + F
全屏模式 F11
切换主题 Ctrl + Shift + D
显示快捷键 ?
增大字号 Ctrl + =
减小字号 Ctrl + -