lex.c

来自「这是一个同样来自贝尔实验室的和UNIX有着渊源的操作系统, 其简洁的设计和实现易」· C语言代码 · 共 1,498 行 · 第 1/2 页
1,498 行
			if(c < 0)				break;		}		else if(c < ' ') {			if(isspace(c)) {				if(c == '\r') {					// ignore it unless no following '\n',					// in which case treat it like '\n'					c = getchar(ts);					if(c != '\n') {						if(c >= 0)							ungetchar(ts, c);						c = '\n';					}				}			}			else {				if(warn)					fprint(2, "warning: non-whitespace control character %d ignored\n", c);				c = 0;			}		}		else if(c == '<') {			ungetchar(ts, c);			break;		}		if(c != 0) {			buf[j++] = c;			if(j == BIGBUFSIZE-1) {				s = buftostr(s, buf, j);				j = 0;			}		}		c = getchar(ts);	}	s = buftostr(s, buf, j);	if(s == nil)		return -1;	tok = &a[(*pai)++];	tok->tag = Data;	tok->text = s;	tok->attr = nil;	tok->starti = starti;	return Data;}// The rules for lexing scripts are different (ugh).// Gather up everything until see an "</" tagnames[tok] ">"static intgetscriptdata(TokenSource* ts, int firstc, int starti, Token* a, int* pai, int findtag){	Rune*	s;	int	j;	int	tstarti;	int	savei;	int	c;	int	tag;	int	done;	Token*	tok;	Rune	buf[BIGBUFSIZE];	s = nil;	j = 0;	tstarti = starti;	c = firstc;	done = 0;	while(c >= 0) {		if(c == '<') {			// other browsers ignore stuff to end of line after <!			
savei = ts->i;			c = getchar(ts);			if(c == '!') {//				while(c >= 0 && c != '\n' && c != '\r')//					c = getchar(ts);				if(comment(ts) == -1)					break;				if(c == '\r')					c = getchar(ts);				if(c == '\n')					c = getchar(ts);			}			else if(c >= 0) {				backup(ts, savei);				tag = gettag(ts, tstarti, a, pai);				if(tag == -1)					break;				if(tag != Comment)					(*pai)--;				backup(ts, tstarti);				if(tag == findtag + RBRA) {					done = 1;					break;				}				// here tag was not the one we were looking for, so take as regular data				c = getchar(ts);			}		}		if(c < 0)			break;		if(c != 0) {			buf[j++] = c;			if(j == BIGBUFSIZE-1) {				s = buftostr(s, buf, j);				j = 0;			}		}		tstarti = ts->i;		c = getchar(ts);	}	if(done || ts->i == ts->edata) {		s = buftostr(s, buf, j);		tok = &a[(*pai)++];		tok->tag = Data;		tok->text = s;		tok->attr = nil;		tok->starti = starti;		return Data;	}	backup(ts, starti);	return -1;}// We've just seen a '<'.  Gather up stuff to closing '>' (if buffer// ends before then, return -1).// If it's a tag, look up the name, gather the attributes, and return// the appropriate token.// Else it's either just plain data or some kind of ignorable stuff:// return Data or Comment as appropriate.// If it's not a Comment, put it in a[*pai] and bump *pai.static intgettag(TokenSource* ts, int starti, Token* a, int* pai){	int	rbra;	int	ans;	Attr*	al;	int	nexti;	int	c;	int	ti;	int	afnd;	int	attid;	int	quote;	Rune*	val;	int	nv;	int	i;	int	tag;	Token*	tok;	Rune	buf[BIGBUFSIZE];	rbra = 0;	nexti = ts->i;	tok = &a[*pai];	tok->tag = Notfound;	tok->text = nil;	tok->attr = nil;	tok->starti = starti;	c = getchar(ts);	if(c == '/') {		rbra = RBRA;		c = getchar(ts);	}	if(c < 0)		goto eob_done;	if(c >= 256 || !isalpha(c)) {		// not a tag		if(c == '!') {			ans = comment(ts);			if(ans != -1)				return ans;			goto eob_done;		}		else {			backup(ts, nexti);			tok->tag = Data;			tok->text = _Strdup(L"<");			(*pai)++;			return Data;		}	}	// c starts a tagname	buf[0] = c;	i 
= 1;	while(1) {		c = getchar(ts);		if(c < 0)			goto eob_done;		if(!ISNAMCHAR(c))			break;		// if name is bigger than buf it won't be found anyway...		if(i < BIGBUFSIZE)			buf[i++] = c;	}	if(_lookup(tagtable, Numtags, buf, i, &tag))		tok->tag = tag + rbra;	else		tok->text = _Strndup(buf, i);	// for warning print, in build	// attribute gathering loop	al = nil;	while(1) {		// look for "ws name" or "ws name ws = ws val"  (ws=whitespace)		// skip whitespaceattrloop_continue:		while(c < 256 && isspace(c)) {			c = getchar(ts);			if(c < 0)				goto eob_done;		}		if(c == '>')			goto attrloop_done;		if(c == '<') {			if(warn)				fprint(2, "warning: unclosed tag\n");			ungetchar(ts, c);			goto attrloop_done;		}		if(c >= 256 || !isalpha(c)) {			if(warn)				fprint(2, "warning: expected attribute name\n");			// skipt to next attribute name			while(1) {				c = getchar(ts);				if(c < 0)					goto eob_done;				if(c < 256 && isalpha(c))					goto attrloop_continue;				if(c == '<') {					if(warn)						fprint(2, "warning: unclosed tag\n");					ungetchar(ts, 60);					goto attrloop_done;				}				if(c == '>')					goto attrloop_done;			}		}		// gather attribute name		buf[0] = c;		i = 1;		while(1) {			c = getchar(ts);			if(c < 0)				goto eob_done;			if(!ISNAMCHAR(c))				break;			if(i < BIGBUFSIZE-1)				buf[i++] = c;		}		afnd = _lookup(attrtable, Numattrs, buf, i, &attid);		if(warn && !afnd) {			buf[i] = 0;			fprint(2, "warning: unknown attribute name %S\n", buf);		}		// skip whitespace		while(c < 256 && isspace(c)) {			c = getchar(ts);			if(c < 0)				goto eob_done;		}		if(c != '=') {			if(afnd)				al = newattr(attid, nil, al);			goto attrloop_continue;		}		//# c is '=' here;  skip whitespace		while(1) {			c = getchar(ts);			if(c < 0)				goto eob_done;			if(c >= 256 || !isspace(c))				break;		}		quote = 0;		if(c == '\'' || c == '"') {			quote = c;			c = getchar(ts);			if(c < 0)				goto eob_done;		}		val = nil;		nv = 0;		while(1) {valloop_continue:			if(c < 0)				goto eob_done;			if(c == '>') {		
		if(quote) {					// c might be part of string (though not good style)					// but if line ends before close quote, assume					// there was an unmatched quote					ti = ts->i;					while(1) {						c = getchar(ts);						if(c < 0)							goto eob_done;						if(c == quote) {							backup(ts, ti);							buf[nv++] = '>';							if(nv == BIGBUFSIZE-1) {								val = buftostr(val, buf, nv);								nv = 0;							}							c = getchar(ts);							goto valloop_continue;						}						if(c == '\n') {							if(warn)								fprint(2, "warning: apparent unmatched quote\n");							backup(ts, ti);							c = '>';							goto valloop_done;						}					}				}				else					goto valloop_done;			}			if(quote) {				if(c == quote) {					c = getchar(ts);					if(c < 0)						goto eob_done;					goto valloop_done;				}				if(c == '\r') {					c = getchar(ts);					goto valloop_continue;				}				if(c == '\t' || c == '\n')					c = ' ';			}			else {				if(c < 256 && isspace(c))					goto valloop_done;			}			if(c == '&') {				c = ampersand(ts);				if(c == -1)					goto eob_done;			}			buf[nv++] = c;			if(nv == BIGBUFSIZE-1) {				val = buftostr(val, buf, nv);				nv = 0;			}			c = getchar(ts);		}valloop_done:		if(afnd) {			val = buftostr(val, buf, nv);			al = newattr(attid, val, al);		}	}attrloop_done:	tok->attr = al;	(*pai)++;	return tok->tag;eob_done:	if(warn)		fprint(2, "warning: incomplete tag at end of page\n");	backup(ts, nexti);	tok->tag = Data;	tok->text = _Strdup(L"<");	return Data;}// We've just read a '<!' 
at position starti,// so this may be a comment or other ignored section, or it may// be just a literal string if there is no close before end of file// (other browsers do that).// The accepted practice seems to be (note: contrary to SGML spec!):// If see <!--, look for --> to close, or if none, > to close.// If see <!(not --), look for > to close.// If no close before end of file, leave original characters in as literal data.//// If we see ignorable stuff, return Comment.// Else return nil (caller should back up and try again when more data arrives,// unless at end of file, in which case caller should just make '<' a data token).static intcomment(TokenSource* ts){	int	nexti;	int	havecomment;	int	c;	nexti = ts->i;	havecomment = 0;	c = getchar(ts);	if(c == '-') {		c = getchar(ts);		if(c == '-') {			if(findstr(ts, L"-->"))				havecomment = 1;			else				backup(ts, nexti);		}	}	if(!havecomment) {		if(c == '>')			havecomment = 1;		else if(c >= 0) {			if(findstr(ts, L">"))				havecomment = 1;		}	}	if(havecomment)		return Comment;	return -1;}// Look for string s in token source.// If found, return 1, with buffer at next char after s,// else return 0 (caller should back up).static intfindstr(TokenSource* ts, Rune* s){	int	c0;	int	n;	int	nexti;	int	i;	int	c;	c0 = s[0];	n = runestrlen(s);	while(1) {		c = getchar(ts);		if(c < 0)			break;		if(c == c0) {			if(n == 1)				return 1;			nexti = ts->i;			for(i = 1; i < n; i++) {				c = getchar(ts);				if(c < 0)					goto mainloop_done;				if(c != s[i])					break;			}			if(i == n)				return 1;			backup(ts, nexti);		}	}mainloop_done:	return 0;}// We've just read an '&'; look for an entity reference// name, and if found, return translated char.// if there is a complete entity name but it isn't known,// back up to just past the '&' and return '&'.// If the entity can't be completed in the current buffer, back up// to the '&' and return -1.static intampersand(TokenSource* ts){	int	savei;	int	c;	int	fnd;	int	ans;	int	v;	int	k;	Rune	
buf[SMALLBUFSIZE];	savei = ts->i;	c = getchar(ts);	fnd = 0;	ans = -1;	if(c == '#') {		c = getchar(ts);		v = 0;		if(c == 'X' || c == 'x')			for(c = getchar(ts); c < 256; c = getchar(ts))				if(c >= '0' && c <= '9')					v = v*16+c-'0';				else if(c >= 'A' && c<= 'F')					v = v*16+c-'A'+10;				else if(c >= 'a' && c <= 'f')					v = v*16+c-'a'+10;				else					break;		else			while(c >= 0) {				if(!(c < 256 && isdigit(c)))					break;				v = v*10 + c - 48;				c = getchar(ts);			}		if(c >= 0) {			if(!(c == ';' || c == '\n' || c == '\r'))				ungetchar(ts, c);			c = v;			if(c == 160)				c = 160;			if(c >= Winstart && c <= Winend) {				c = winchars[c - Winstart];			}			ans = c;			fnd = 1;		}	}	else if(c < 256 && isalpha(c)) {		buf[0] = c;		k = 1;		while(1) {			c = getchar(ts);			if(c < 0)				break;			if(c < 256 && (isalpha(c) || isdigit(c))) {				if(k < SMALLBUFSIZE-1)					buf[k++] = c;			}			else {				if(!(c == ';' || c == '\n' || c == '\r'))					ungetchar(ts, c);				break;			}		}		if(c >= 256 || c != '=' && !(isalpha(c) || isdigit(c)))			fnd = _lookup(chartab, NCHARTAB, buf, k, &ans);	}	if(!fnd) {		backup(ts, savei);		ans = '&';	}	return ans;}// Get next char, obeying ts.chset.// Returns -1 if no complete character left before current end of data.static intgetchar(TokenSource* ts){	uchar*	buf;	int	c;	int	n;	int	ok;	Rune	r;	if(ts->i >= ts->edata)		return -1;	buf = ts->data;	c = buf[ts->i];	switch(ts->chset) {	case ISO_8859_1:		if(c >= Winstart && c <= Winend)			c = winchars[c - Winstart];		ts->i++;		break;	case US_Ascii:		if(c > 127) {			if(warn)				fprint(2, "non-ascii char (%x) when US-ASCII specified\n", c);		}		ts->i++;		break;	case UTF_8:		ok = fullrune((char*)(buf+ts->i), ts->edata-ts->i);		n = chartorune(&r, (char*)(buf+ts->i));		if(ok) {			if(warn && c == 0x80)				fprint(2, "warning: invalid utf-8 sequence (starts with %x)\n", ts->data[ts->i]);			ts->i += n;			c = r;		}		else {			// not enough bytes in buf to complete utf-8 char			ts->i = ts->edata;	// mark "all used"	
		c = -1;		}		break;	case Unicode:		if(ts->i < ts->edata - 1) {			//standards say most-significant byte first			c = (c << 8)|(buf[ts->i + 1]);			ts->i += 2;		}		else {			ts->i = ts->edata;	// mark "all used"			c = -1;		}		break;	}	return c;}// Assuming c was the last character returned by getchar, set// things up so that next getchar will get that same character// followed by the current 'next character', etc.static voidungetchar(TokenSource* ts, int c){	int	n;	Rune	r;	char	a[UTFmax];	n = 1;	switch(ts->chset) {	case UTF_8:		if(c >= 128) {			r = c;			n = runetochar(a, &r);		}		break;	case Unicode:		n = 2;		break;	}	ts->i -= n;}// Restore ts so that it is at the state where the index was savei.static voidbackup(TokenSource* ts, int savei){	if(dbglex)		fprint(2, "lex: backup; i=%d, savei=%d\n", ts->i, savei);	ts->i = savei;}// Look for value associated with attribute attid in token t.// If there is one, return 1 and put the value in *pans,// else return 0.// If xfer is true, transfer ownership of the string to the caller// (nil it out here); otherwise, caller must duplicate the answer// if it needs to save it.// OK to have pans==0, in which case this is just looking// to see if token is present.int_tokaval(Token* t, int attid, Rune** pans, int xfer){	Attr*	attr;	attr = t->attr;	while(attr != nil) {		if(attr->attid == attid) {			if(pans != nil)				*pans = attr->value;			if(xfer)				attr->value = nil;			return 1;		}		attr = attr->next;	}	if(pans != nil)		*pans = nil;	return 0;}static intTconv(Fmt *f){	Token*	t;	int	i;	int	tag;	char*	srbra;	Rune*	aname;	Rune*	tname;	Attr*	a;	char	buf[BIGBUFSIZE];	t = va_arg(f->args, Token*);	if(t == nil)		sprint(buf, "<null>");	else {		i = 0;		if(dbglex > 1)			i = snprint(buf, sizeof(buf), "[%d]", t->starti);		tag = t->tag;		if(tag == Data) {			i += snprint(buf+i, sizeof(buf)-i-1, "'%S'", t->text);		}		else {			srbra = "";			if(tag >= RBRA) {				tag -= RBRA;				srbra = "/";			}			tname = tagnames[tag];			if(tag == Notfound)				tname = 
L"?";			i += snprint(buf+i, sizeof(buf)-i-1, "<%s%S", srbra, tname);			for(a = t->attr; a != nil; a = a->next) {				aname = attrnames[a->attid];				i += snprint(buf+i, sizeof(buf)-i-1, " %S", aname);				if(a->value != nil)					i += snprint(buf+i, sizeof(buf)-i-1, "=%S", a->value);			}			i += snprint(buf+i, sizeof(buf)-i-1, ">");		}		buf[i] = 0;	}	return fmtstrcpy(f, buf);}// Attrs own their constituent strings, but build may eventually// transfer some values to its items and nil them out in the Attr.static Attr*newattr(int attid, Rune* value, Attr* link){	Attr* ans;	ans = (Attr*)emalloc(sizeof(Attr));	ans->attid = attid;	ans->value = value;	ans->next = link;	return ans;}// Free list of Attrs linked through next fieldstatic voidfreeattrs(Attr* ahead){	Attr* a;	Attr* nexta;	a = ahead;	while(a != nil) {		nexta = a->next;		free(a->value);		free(a);		a = nexta;	}}// Free array of Tokens.// Allocated space might have room for more than n tokens,// but only n of them are initialized.// If caller has transferred ownership of constitutent strings// or attributes, it must have nil'd out the pointers in the Tokens.void_freetokens(Token* tarray, int n){	int i;	Token* t;	if(tarray == nil)		return;	for(i = 0; i < n; i++) {		t = &tarray[i];		free(t->text);		freeattrs(t->attr);	}	free(tarray);}
lex.c - 源码说明

本页面展示了「这是一个同样来自贝尔实验室的和UNIX有着渊源的操作系统, 其简洁的设计和实现易于我们学习和理解」中的 lex.c 源码文件，采用 C 语言编写，共 1,498 行代码。您可以在线阅读完整代码内容，也可以返回资源详情页下载完整源码包进行本地学习和开发。
虫虫下载站收录了大量与UNIX相关的技术资源，包括源代码、技术文档、电路图等，是电子工程师和嵌入式开发者的专业学习平台。
⌨️ 快捷键说明

复制代码Ctrl + C
搜索代码Ctrl + F
全屏模式F11
增大字号Ctrl + =
减小字号Ctrl + -
显示快捷键?