⭐ 欢迎来到虫虫下载站! | 📦 资源下载 📁 资源专辑 ℹ️ 关于我们
⭐ 虫虫下载站

📄 lex.c

📁 这是一个同样来自贝尔实验室的和UNIX有着渊源的操作系统, 其简洁的设计和实现易于我们学习和理解
💻 C
📖 第 1 页 / 共 2 页
字号:
#include <u.h>
#include <libc.h>
#include <draw.h>
#include <ctype.h>
#include <html.h>
#include "impl.h"

// Lexer for HTML and plain-text input: turns a byte buffer into
// an array of Tokens (see _gettoks).

typedef struct TokenSource TokenSource;

// State of one lexing pass over an input buffer.
struct TokenSource
{
	int			i;		// index of next byte to use
	uchar*		data;		// all the data
	int			edata;	// data[0:edata] is valid
	int			chset;	// one of US_Ascii, etc.
	int			mtype;	// TextHtml or TextPlain
};

// Negative sentinel values, distinct from any character value.
enum {
	EOF = -2,
	EOB = -1
};

// True if c may appear in a name: a letter, digit, '-' or '.'.
// The (c)<256 guard keeps wide characters away from the ctype macros.
#define ISNAMCHAR(c)	((c)<256 && (isalpha(c) || isdigit(c) || (c) == '-' || (c) == '.'))

// Element counts of the on-stack Rune buffers used while gathering text.
#define SMALLBUFSIZE 240
#define BIGBUFSIZE 2000

// HTML 4.0 tag names.
// Keep sorted, and in correspondence with enum in iparse.h.
// NOTE(review): this file includes impl.h, not iparse.h -- confirm which
// header actually holds the tag enum.
Rune* tagnames[] = {
	L" ",
	L"!",
	L"a",
	L"abbr",
	L"acronym",
	L"address",
	L"applet",
	L"area",
	L"b",
	L"base",
	L"basefont",
	L"bdo",
	L"big",
	L"blink",
	L"blockquote",
	L"body",
	L"bq",
	L"br",
	L"button",
	L"caption",
	L"center",
	L"cite",
	L"code",
	L"col",
	L"colgroup",
	L"dd",
	L"del",
	L"dfn",
	L"dir",
	L"div",
	L"dl",
	L"dt",
	L"em",
	L"fieldset",
	L"font",
	L"form",
	L"frame",
	L"frameset",
	L"h1",
	L"h2",
	L"h3",
	L"h4",
	L"h5",
	L"h6",
	L"head",
	L"hr",
	L"html",
	L"i",
	L"iframe",
	L"img",
	L"input",
	L"ins",
	L"isindex",
	L"kbd",
	L"label",
	L"legend",
	L"li",
	L"link",
	L"map",
	L"menu",
	L"meta",
	L"nobr",
	L"noframes",
	L"noscript",
	L"object",
	L"ol",
	L"optgroup",
	L"option",
	L"p",
	L"param",
	L"pre",
	L"q",
	L"s",
	L"samp",
	L"script",
	L"select",
	L"small",
	L"span",
	L"strike",
	L"strong",
	L"style",
	L"sub",
	L"sup",
	L"table",
	L"tbody",
	L"td",
	L"textarea",
	L"tfoot",
	L"th",
	L"thead",
	L"title",
	L"tr",
	L"tt",
	L"u",
	L"ul",
	L"var"
};

// HTML 4.0 attribute names.
// Keep sorted, and in correspondence with enum in impl.h.
// NOTE(review): HTML 4.0 spells the attribute "accesskey"; the entry
// "access-key" below is kept as found -- confirm against the enum and callers.
Rune* attrnames[] = {
	L"abbr",
	L"accept-charset",
	L"access-key",
	L"action",
	L"align",
	L"alink",
	L"alt",
	L"archive",
	L"axis",
	L"background",
	L"bgcolor",
	L"border",
	L"cellpadding",
	L"cellspacing",
	L"char",
	L"charoff",
	L"charset",
	L"checked",
	L"cite",
	L"class",
	L"classid",
	L"clear",
	L"code",
	L"codebase",
	L"codetype",
	L"color",
	L"cols",
	L"colspan",
	L"compact",
	L"content",
	L"coords",
	L"data",
	L"datetime",
	L"declare",
	L"defer",
	L"dir",
	L"disabled",
	L"enctype",
	L"face",
	L"for",
	L"frame",
	L"frameborder",
	L"headers",
	L"height",
	L"href",
	L"hreflang",
	L"hspace",
	L"http-equiv",
	L"id",
	L"ismap",
	L"label",
	L"lang",
	L"link",
	L"longdesc",
	L"marginheight",
	L"marginwidth",
	L"maxlength",
	L"media",
	L"method",
	L"multiple",
	L"name",
	L"nohref",
	L"noresize",
	L"noshade",
	L"nowrap",
	L"object",
	L"onblur",
	L"onchange",
	L"onclick",
	L"ondblclick",
	L"onfocus",
	L"onkeypress",
	L"onkeyup",
	L"onload",
	L"onmousedown",
	L"onmousemove",
	L"onmouseout",
	L"onmouseover",
	L"onmouseup",
	L"onreset",
	L"onselect",
	L"onsubmit",
	L"onunload",
	L"profile",
	L"prompt",
	L"readonly",
	L"rel",
	L"rev",
	L"rows",
	L"rowspan",
	L"rules",
	L"scheme",
	L"scope",
	L"scrolling",
	L"selected",
	L"shape",
	L"size",
	L"span",
	L"src",
	L"standby",
	L"start",
	L"style",
	L"summary",
	L"tabindex",
	L"target",
	L"text",
	L"title",
	L"type",
	L"usemap",
	L"valign",
	L"value",
	L"valuetype",
	L"version",
	L"vlink",
	L"vspace",
	L"width"
};

// Character entity to unicode character number map.
// Keep sorted by name.
// NOTE(review): "varepsilon" carries the same value as "isin" (8712);
// kept as found -- confirm whether that is intended.
StringInt	chartab[]= {
	{L"AElig", 198},	{L"Aacute", 193},	{L"Acirc", 194},	{L"Agrave", 192},
	{L"Alpha", 913},	{L"Aring", 197},	{L"Atilde", 195},	{L"Auml", 196},
	{L"Beta", 914},	{L"Ccedil", 199},	{L"Chi", 935},	{L"Dagger", 8225},
	{L"Delta", 916},	{L"ETH", 208},	{L"Eacute", 201},	{L"Ecirc", 202},
	{L"Egrave", 200},	{L"Epsilon", 917},	{L"Eta", 919},	{L"Euml", 203},
	{L"Gamma", 915},	{L"Iacute", 205},	{L"Icirc", 206},	{L"Igrave", 204},
	{L"Iota", 921},	{L"Iuml", 207},	{L"Kappa", 922},	{L"Lambda", 923},
	{L"Mu", 924},	{L"Ntilde", 209},	{L"Nu", 925},	{L"OElig", 338},
	{L"Oacute", 211},	{L"Ocirc", 212},	{L"Ograve", 210},	{L"Omega", 937},
	{L"Omicron", 927},	{L"Oslash", 216},	{L"Otilde", 213},	{L"Ouml", 214},
	{L"Phi", 934},	{L"Pi", 928},	{L"Prime", 8243},	{L"Psi", 936},
	{L"Rho", 929},	{L"Scaron", 352},	{L"Sigma", 931},	{L"THORN", 222},
	{L"Tau", 932},	{L"Theta", 920},	{L"Uacute", 218},	{L"Ucirc", 219},
	{L"Ugrave", 217},	{L"Upsilon", 933},	{L"Uuml", 220},	{L"Xi", 926},
	{L"Yacute", 221},	{L"Yuml", 376},	{L"Zeta", 918},	{L"aacute", 225},
	{L"acirc", 226},	{L"acute", 180},	{L"aelig", 230},	{L"agrave", 224},
	{L"alefsym", 8501},	{L"alpha", 945},	{L"amp", 38},	{L"and", 8743},
	{L"ang", 8736},	{L"aring", 229},	{L"asymp", 8776},	{L"atilde", 227},
	{L"auml", 228},	{L"bdquo", 8222},	{L"beta", 946},	{L"brvbar", 166},
	{L"bull", 8226},	{L"cap", 8745},	{L"ccedil", 231},	{L"cdots", 8943},
	{L"cedil", 184},	{L"cent", 162},	{L"chi", 967},	{L"circ", 710},
	{L"clubs", 9827},	{L"cong", 8773},	{L"copy", 169},	{L"crarr", 8629},
	{L"cup", 8746},	{L"curren", 164},	{L"dArr", 8659},	{L"dagger", 8224},
	{L"darr", 8595},	{L"ddots", 8945},	{L"deg", 176},	{L"delta", 948},
	{L"diams", 9830},	{L"divide", 247},	{L"eacute", 233},	{L"ecirc", 234},
	{L"egrave", 232},
	{L"emdash", 8212},	/* non-standard but commonly used */
	{L"empty", 8709},	{L"emsp", 8195},
	{L"endash", 8211},	/* non-standard but commonly used */
	{L"ensp", 8194},	{L"epsilon", 949},	{L"equiv", 8801},	{L"eta", 951},
	{L"eth", 240},	{L"euml", 235},	{L"euro", 8364},	{L"exist", 8707},
	{L"fnof", 402},	{L"forall", 8704},	{L"frac12", 189},	{L"frac14", 188},
	{L"frac34", 190},	{L"frasl", 8260},	{L"gamma", 947},	{L"ge", 8805},
	{L"gt", 62},	{L"hArr", 8660},	{L"harr", 8596},	{L"hearts", 9829},
	{L"hellip", 8230},	{L"iacute", 237},	{L"icirc", 238},	{L"iexcl", 161},
	{L"igrave", 236},	{L"image", 8465},	{L"infin", 8734},	{L"int", 8747},
	{L"iota", 953},	{L"iquest", 191},	{L"isin", 8712},	{L"iuml", 239},
	{L"kappa", 954},	{L"lArr", 8656},	{L"lambda", 955},	{L"lang", 9001},
	{L"laquo", 171},	{L"larr", 8592},	{L"lceil", 8968},	{L"ldots", 8230},
	{L"ldquo", 8220},	{L"le", 8804},	{L"lfloor", 8970},	{L"lowast", 8727},
	{L"loz", 9674},	{L"lrm", 8206},	{L"lsaquo", 8249},	{L"lsquo", 8216},
	{L"lt", 60},	{L"macr", 175},	{L"mdash", 8212},	{L"micro", 181},
	{L"middot", 183},	{L"minus", 8722},	{L"mu", 956},	{L"nabla", 8711},
	{L"nbsp", 160},	{L"ndash", 8211},	{L"ne", 8800},	{L"ni", 8715},
	{L"not", 172},	{L"notin", 8713},	{L"nsub", 8836},	{L"ntilde", 241},
	{L"nu", 957},	{L"oacute", 243},	{L"ocirc", 244},	{L"oelig", 339},
	{L"ograve", 242},	{L"oline", 8254},	{L"omega", 969},	{L"omicron", 959},
	{L"oplus", 8853},	{L"or", 8744},	{L"ordf", 170},	{L"ordm", 186},
	{L"oslash", 248},	{L"otilde", 245},	{L"otimes", 8855},	{L"ouml", 246},
	{L"para", 182},	{L"part", 8706},	{L"permil", 8240},	{L"perp", 8869},
	{L"phi", 966},	{L"pi", 960},	{L"piv", 982},	{L"plusmn", 177},
	{L"pound", 163},	{L"prime", 8242},	{L"prod", 8719},	{L"prop", 8733},
	{L"psi", 968},	{L"quad", 8193},	{L"quot", 34},	{L"rArr", 8658},
	{L"radic", 8730},	{L"rang", 9002},	{L"raquo", 187},	{L"rarr", 8594},
	{L"rceil", 8969},	{L"rdquo", 8221},	{L"real", 8476},	{L"reg", 174},
	{L"rfloor", 8971},	{L"rho", 961},	{L"rlm", 8207},	{L"rsaquo", 8250},
	{L"rsquo", 8217},	{L"sbquo", 8218},	{L"scaron", 353},	{L"sdot", 8901},
	{L"sect", 167},	{L"shy", 173},	{L"sigma", 963},	{L"sigmaf", 962},
	{L"sim", 8764},	{L"sp", 8194},	{L"spades", 9824},	{L"sub", 8834},
	{L"sube", 8838},	{L"sum", 8721},	{L"sup", 8835},	{L"sup1", 185},
	{L"sup2", 178},	{L"sup3", 179},	{L"supe", 8839},	{L"szlig", 223},
	{L"tau", 964},	{L"there4", 8756},	{L"theta", 952},	{L"thetasym", 977},
	{L"thinsp", 8201},	{L"thorn", 254},	{L"tilde", 732},	{L"times", 215},
	{L"trade", 8482},	{L"uArr", 8657},	{L"uacute", 250},	{L"uarr", 8593},
	{L"ucirc", 251},	{L"ugrave", 249},	{L"uml", 168},	{L"upsih", 978},
	{L"upsilon", 965},	{L"uuml", 252},	{L"varepsilon", 8712},	{L"varphi", 981},
	{L"varpi", 982},	{L"varrho", 1009},	{L"vdots", 8942},	{L"vsigma", 962},
	{L"vtheta", 977},	{L"weierp", 8472},	{L"xi", 958},	{L"yacute", 253},
	{L"yen", 165},	{L"yuml", 255},	{L"zeta", 950},	{L"zwj", 8205},
	{L"zwnj", 8204}
};
#define NCHARTAB (sizeof(chartab)/sizeof(chartab[0]))

// Characters Winstart..Winend are those that Windows
// uses interpolated into the Latin1 set.
// They aren't supposed to appear in HTML, but they do....
enum {
	Winstart = 127,
	Winend = 159
};

// One replacement code point per value in Winstart..Winend (33 entries).
static int	winchars[]= { 8226,	// 8226 is a bullet
	8226, 8226, 8218, 402, 8222, 8230, 8224, 8225,
	710, 8240, 352, 8249, 338, 8226, 8226, 8226,
	8226, 8216, 8217, 8220, 8221, 8226, 8211, 8212,
	732, 8482, 353, 8250, 339, 8226, 8226, 376};

static StringInt*	tagtable;		// initialized from tagnames
static StringInt*	attrtable;		// initialized from attrnames

static void		lexinit(void);
static int		getplaindata(TokenSource* ts, Token* a, int* pai);
static int		getdata(TokenSource* ts, int firstc, int starti, Token* a, int* pai);
static int		getscriptdata(TokenSource* ts, int firstc, int starti, Token* a, int* pai, int findtag);
static int		gettag(TokenSource* ts, int starti, Token* a, int* pai);
static Rune*	buftostr(Rune* s, Rune* buf, int j);
static int		comment(TokenSource* ts);
static int		findstr(TokenSource* ts, Rune* s);
static int		ampersand(TokenSource* ts);
static int		lowerc(int c);
static int		getchar(TokenSource* ts);
static void		ungetchar(TokenSource* ts, int c);
static void		backup(TokenSource* ts, int savei);
static void		freeinsidetoken(Token* t);
static void		freeattrs(Attr* ahead);
static Attr*	newattr(int attid, Rune* value, Attr* link);
static int		Tconv(Fmt* f);

int	dbglex = 0;		// when non-zero, the lexer prints trace output to fd 2
static int lexinited = 0;	// set once lexinit has run

// One-time setup: build the tag and attribute lookup tables from the
// name arrays and install the 'T' print verb (handled by Tconv).
static void
lexinit(void)
{
	tagtable = _makestrinttab(tagnames, Numtags);
	attrtable = _makestrinttab(attrnames, Numattrs);
	fmtinstall('T', Tconv);
	lexinited = 1;
}

// Allocate a TokenSource positioned at the start of data[0:edata].
// chset must be one of US_Ascii, ISO_8859_1, UTF_8 or Unicode (asserted).
// The data pointer is stored, not copied.
static TokenSource*
newtokensource(uchar* data, int edata, int chset, int mtype)
{
	TokenSource*	ans;

	assert(chset == US_Ascii || chset == ISO_8859_1 ||
			chset == UTF_8 || chset == Unicode);
	ans = (TokenSource*)emalloc(sizeof(TokenSource));
	ans->i = 0;
	ans->data = data;
	ans->edata = edata;
	ans->chset = chset;
	ans->mtype = mtype;
	return ans;
}

// The token array grows by this many elements at a time.
enum {
	ToksChunk = 500
};

// Call this to get the tokens.
//  The number of returned tokens is returned in *plen.
// Returns the token array, or nil (with *plen == 0) when the input
// produced no tokens.
Token*
_gettoks(uchar* data, int datalen, int chset, int mtype, int* plen)
{
	TokenSource*	ts;
	Token*		a;
	int	alen;
	int	ai;
	int	starti;
	int	c;
	int	tag;

	if(!lexinited)
		lexinit();
	ts = newtokensource(data, datalen, chset, mtype);
	alen = ToksChunk;
	a = (Token*)emalloc(alen * sizeof(Token));
	ai = 0;
	if(dbglex)
		fprint(2, "_gettoks starts, ts.i=%d, ts.edata=%d\n", ts->i, ts->edata);
	if(ts->mtype == TextHtml) {
		for(;;) {
			// make sure there is room for at least one more token
			if(ai == alen) {
				a = (Token*)erealloc(a, (alen+ToksChunk)*sizeof(Token));
				alen += ToksChunk;
			}
			starti = ts->i;
			c = getchar(ts);
			if(c < 0)
				break;
			if(c == '<') {
				tag = gettag(ts, starti, a, &ai);
				if(tag == Tscript || tag == Tstyle) {
					// special rules for getting Data after....
					starti = ts->i;
					c = getchar(ts);
					tag = getscriptdata(ts, c, starti, a, &ai, tag);
				}
			}
			else
				tag = getdata(ts, c, starti, a, &ai);
			if(tag == -1)
				break;
			else if(dbglex > 1 && tag != Comment)
				fprint(2, "lex: got token %T\n", &a[ai-1]);
		}
	}
	else {
		// plain text (non-html) tokens
		for(;;) {
			if(ai == alen) {
				a = (Token*)erealloc(a, (alen+ToksChunk)*sizeof(Token));
				alen += ToksChunk;
			}
			tag = getplaindata(ts, a, &ai);
			if(tag == -1)
				break;
			// getplaindata has already bumped ai, so the token just
			// produced is a[ai-1]; a[ai] is not filled in yet.
			if(dbglex > 1)
				fprint(2, "lex: got token %T\n", &a[ai-1]);
		}
	}
	free(ts);
	if(dbglex)
		fprint(2, "lex: returning %d tokens\n", ai);
	*plen = ai;
	if(ai == 0) {
		// nothing to hand back: release the unused array rather than leak it
		free(a);
		return nil;
	}
	return a;
}

// For case where source isn't HTML.
// Just make data tokens, one per line (or partial line,
// at end of buffer), ignoring non-whitespace control
// characters and dumping \r's.
// If find non-empty token, fill in a[*pai], bump *pai, and return Data.
// Otherwise return -1;
static int
getplaindata(TokenSource* ts, Token* a, int* pai)
{
	Rune*	s;
	int	j;
	int	starti;
	int	c;
	Token*	tok;
	Rune	buf[BIGBUFSIZE];

	s = nil;
	j = 0;
	starti = ts->i;
	for(c = getchar(ts); c >= 0; c = getchar(ts)) {
		if(c < ' ') {
			if(isspace(c)) {
				if(c == '\r') {
					// ignore it unless no following '\n',
					// in which case treat it like '\n'
					c = getchar(ts);
					if(c != '\n') {
						if(c >= 0)
							ungetchar(ts, c);
						c = '\n';
					}
				}
			}
			else
				c = 0;
		}
		if(c != 0) {
			buf[j++] = c;
			// Flush when only the terminator slot is left.  The limit is
			// the element count of buf, not sizeof(buf): buftostr writes
			// buf[j], so j must stay below BIGBUFSIZE.
			if(j == BIGBUFSIZE-1) {
				s = buftostr(s, buf, j);
				j = 0;
			}
		}
		if(c == '\n')
			break;
	}
	s = buftostr(s, buf, j);
	if(s == nil)
		return -1;
	tok = &a[(*pai)++];
	tok->tag = Data;
	tok->text = s;
	tok->attr = nil;
	tok->starti = starti;
	return Data;
}

// Return concatenation of s and buf[0:j]
// Writes a terminator at buf[j], so buf must hold at least j+1 Runes.
// NOTE(review): whether _Strdup2 releases the old s is not visible here;
// if it does not, each flush of a long run leaks the previous string.
static Rune*
buftostr(Rune* s, Rune* buf, int j)
{
	buf[j] = 0;
	if(s == nil)
		s = _Strndup(buf, j);
	else
		s = _Strdup2(s, buf);
	return s;
}

// Gather data up to next start-of-tag or end-of-buffer.
// Translate entity references (&amp;).
// Ignore non-whitespace control characters and get rid of \r's.
// If find non-empty token, fill in a[*pai], bump *pai, and return Data.
// Otherwise return -1;
// (Only the head of this function is present in this listing; the rest
// of the body lies beyond the end of this page.)
static int
getdata(TokenSource* ts, int firstc, int starti, Token* a, int* pai)
{
	Rune*	s;
	int	j;
	int	c;
	Token*	tok;
	Rune	buf[BIGBUFSIZE];

	s = nil;
	j = 0;
	c = firstc;
	while(c >= 0) {
		if(c == '&') {
			c = ampersand(ts);

⌨️ 快捷键说明

复制代码 Ctrl + C
搜索代码 Ctrl + F
全屏模式 F11
切换主题 Ctrl + Shift + D
显示快捷键 ?
增大字号 Ctrl + =
减小字号 Ctrl + -