lex.c

来自「这是一个同样来自贝尔实验室的和UNIX有着渊源的操作系统, 其简洁的设计和实现易」· C语言代码 · 共 1,498 行 · 第 1/2 页
1,498 行
#include <u.h>
#include <libc.h>
#include <draw.h>
#include <ctype.h>
#include <html.h>
#include "impl.h"

typedef struct TokenSource TokenSource;

// Cursor over the raw input bytes being lexed.
struct TokenSource
{
	int			i;		// index of next byte to use
	uchar*		data;		// all the data
	int			edata;	// data[0:edata] is valid
	int			chset;	// one of US_Ascii, etc.
	int			mtype;	// TextHtml or TextPlain
};

enum {
	EOF = -2,
	EOB = -1
};

#define ISNAMCHAR(c)	((c)<256 && (isalpha(c) || isdigit(c) || (c) == '-' || (c) == '.'))

#define SMALLBUFSIZE 240
#define BIGBUFSIZE 2000

// HTML 4.0 tag names.
// Keep sorted, and in correspondence with enum in iparse.h.
Rune* tagnames[] = {
	L" ",
	L"!",
	L"a", 
	L"abbr",
	L"acronym",
	L"address",
	L"applet", 
	L"area",
	L"b",
	L"base",
	L"basefont",
	L"bdo",
	L"big",
	L"blink",
	L"blockquote",
	L"body",
	L"bq",
	L"br",
	L"button",
	L"caption",
	L"center",
	L"cite",
	L"code",
	L"col",
	L"colgroup",
	L"dd",
	L"del",
	L"dfn",
	L"dir",
	L"div",
	L"dl",
	L"dt",
	L"em",
	L"fieldset",
	L"font",
	L"form",
	L"frame",
	L"frameset",
	L"h1",
	L"h2",
	L"h3",
	L"h4",
	L"h5",
	L"h6",
	L"head",
	L"hr",
	L"html",
	L"i",
	L"iframe",
	L"img",
	L"input",
	L"ins",
	L"isindex",
	L"kbd",
	L"label",
	L"legend",
	L"li",
	L"link",
	L"map",
	L"menu",
	L"meta",
	L"nobr",
	L"noframes",
	L"noscript",
	L"object",
	L"ol",
	L"optgroup",
	L"option",
	L"p",
	L"param",
	L"pre",
	L"q",
	L"s",
	L"samp",
	L"script",
	L"select",
	L"small",
	L"span",
	L"strike",
	L"strong",
	L"style",
	L"sub",
	L"sup",
	L"table",
	L"tbody",
	L"td",
	L"textarea",
	L"tfoot",
	L"th",
	L"thead",
	L"title",
	L"tr",
	L"tt",
	L"u",
	L"ul",
	L"var"
};

// HTML 4.0 attribute names.
// Keep sorted, and in correspondence with enum in impl.h.
Rune* attrnames[] = {
	L"abbr",
	L"accept-charset",
	L"access-key",
	L"action",
	L"align",
	L"alink",
	L"alt",
	L"archive",
	L"axis",
	L"background",
	L"bgcolor",
	L"border",
	L"cellpadding",
	L"cellspacing",
	L"char",
	L"charoff",
	L"charset",
	L"checked",
	L"cite",
	L"class",
	L"classid",
	L"clear",
	L"code",
	L"codebase",
	L"codetype",
	L"color",
	L"cols",
	L"colspan",
	L"compact",
	L"content",
	L"coords",
	L"data",
	L"datetime",
	L"declare",
	L"defer",
	L"dir",
	L"disabled",
	L"enctype",
	L"face",
	L"for",
	L"frame",
	L"frameborder",
	L"headers",
	L"height",
	L"href",
	L"hreflang",
	L"hspace",
	L"http-equiv",
	L"id",
	L"ismap",
	L"label",
	L"lang",
	L"link",
	L"longdesc",
	L"marginheight",
	L"marginwidth",
	L"maxlength",
	L"media",
	L"method",
	L"multiple",
	L"name",
	L"nohref",
	L"noresize",
	L"noshade",
	L"nowrap",
	L"object",
	L"onblur",
	L"onchange",
	L"onclick",
	L"ondblclick",
	L"onfocus",
	L"onkeypress",
	L"onkeyup",
	L"onload",
	L"onmousedown",
	L"onmousemove",
	L"onmouseout",
	L"onmouseover",
	L"onmouseup",
	L"onreset",
	L"onselect",
	L"onsubmit",
	L"onunload",
	L"profile",
	L"prompt",
	L"readonly",
	L"rel",
	L"rev",
	L"rows",
	L"rowspan",
	L"rules",
	L"scheme",
	L"scope",
	L"scrolling",
	L"selected",
	L"shape",
	L"size",
	L"span",
	L"src",
	L"standby",
	L"start",
	L"style",
	L"summary",
	L"tabindex",
	L"target",
	L"text",
	L"title",
	L"type",
	L"usemap",
	L"valign",
	L"value",
	L"valuetype",
	L"version",
	L"vlink",
	L"vspace",
	L"width"
};

// Character entity to unicode character number map.
// Keep sorted by name.
StringInt	chartab[]= {
	{L"AElig", 198},
	{L"Aacute", 193},
	{L"Acirc", 194},
	{L"Agrave", 192},
	{L"Alpha", 913},
	{L"Aring", 197},
	{L"Atilde", 195},
	{L"Auml", 196},
	{L"Beta", 914},
	{L"Ccedil", 199},
	{L"Chi", 935},
	{L"Dagger", 8225},
	{L"Delta", 916},
	{L"ETH", 208},
	{L"Eacute", 201},
	{L"Ecirc", 202},
	{L"Egrave", 200},
	{L"Epsilon", 917},
	{L"Eta", 919},
	{L"Euml", 203},
	{L"Gamma", 915},
	{L"Iacute", 205},
	{L"Icirc", 206},
	{L"Igrave", 204},
	{L"Iota", 921},
	{L"Iuml", 207},
	{L"Kappa", 922},
	{L"Lambda", 923},
	{L"Mu", 924},
	{L"Ntilde", 209},
	{L"Nu", 925},
	{L"OElig", 338},
	{L"Oacute", 211},
	{L"Ocirc", 212},
	{L"Ograve", 210},
	{L"Omega", 937},
	{L"Omicron", 927},
	{L"Oslash", 216},
	{L"Otilde", 213},
	{L"Ouml", 214},
	{L"Phi", 934},
	{L"Pi", 928},
	{L"Prime", 8243},
	{L"Psi", 936},
	{L"Rho", 929},
	{L"Scaron", 352},
	{L"Sigma", 931},
	{L"THORN", 222},
	{L"Tau", 932},
	{L"Theta", 920},
	{L"Uacute", 218},
	{L"Ucirc", 219},
	{L"Ugrave", 217},
	{L"Upsilon", 933},
	{L"Uuml", 220},
	{L"Xi", 926},
	{L"Yacute", 221},
	{L"Yuml", 376},
	{L"Zeta", 918},
	{L"aacute", 225},
	{L"acirc", 226},
	{L"acute", 180},
	{L"aelig", 230},
	{L"agrave", 224},
	{L"alefsym", 8501},
	{L"alpha", 945},
	{L"amp", 38},
	{L"and", 8743},
	{L"ang", 8736},
	{L"aring", 229},
	{L"asymp", 8776},
	{L"atilde", 227},
	{L"auml", 228},
	{L"bdquo", 8222},
	{L"beta", 946},
	{L"brvbar", 166},
	{L"bull", 8226},
	{L"cap", 8745},
	{L"ccedil", 231},
	{L"cdots", 8943},
	{L"cedil", 184},
	{L"cent", 162},
	{L"chi", 967},
	{L"circ", 710},
	{L"clubs", 9827},
	{L"cong", 8773},
	{L"copy", 169},
	{L"crarr", 8629},
	{L"cup", 8746},
	{L"curren", 164},
	{L"dArr", 8659},
	{L"dagger", 8224},
	{L"darr", 8595},
	{L"ddots", 8945},
	{L"deg", 176},
	{L"delta", 948},
	{L"diams", 9830},
	{L"divide", 247},
	{L"eacute", 233},
	{L"ecirc", 234},
	{L"egrave", 232},
	{L"emdash", 8212},	/* non-standard but commonly used */
	{L"empty", 8709},
	{L"emsp", 8195},
	{L"endash", 8211},	/* non-standard but commonly used */
	{L"ensp", 8194},
	{L"epsilon", 949},
	{L"equiv", 8801},
	{L"eta", 951},
	{L"eth", 240},
	{L"euml", 235},
	{L"euro", 8364},
	{L"exist", 8707},
	{L"fnof", 402},
	{L"forall", 8704},
	{L"frac12", 189},
	{L"frac14", 188},
	{L"frac34", 190},
	{L"frasl", 8260},
	{L"gamma", 947},
	{L"ge", 8805},
	{L"gt", 62},
	{L"hArr", 8660},
	{L"harr", 8596},
	{L"hearts", 9829},
	{L"hellip", 8230},
	{L"iacute", 237},
	{L"icirc", 238},
	{L"iexcl", 161},
	{L"igrave", 236},
	{L"image", 8465},
	{L"infin", 8734},
	{L"int", 8747},
	{L"iota", 953},
	{L"iquest", 191},
	{L"isin", 8712},
	{L"iuml", 239},
	{L"kappa", 954},
	{L"lArr", 8656},
	{L"lambda", 955},
	{L"lang", 9001},
	{L"laquo", 171},
	{L"larr", 8592},
	{L"lceil", 8968},
	{L"ldots", 8230},
	{L"ldquo", 8220},
	{L"le", 8804},
	{L"lfloor", 8970},
	{L"lowast", 8727},
	{L"loz", 9674},
	{L"lrm", 8206},
	{L"lsaquo", 8249},
	{L"lsquo", 8216},
	{L"lt", 60},
	{L"macr", 175},
	{L"mdash", 8212},
	{L"micro", 181},
	{L"middot", 183},
	{L"minus", 8722},
	{L"mu", 956},
	{L"nabla", 8711},
	{L"nbsp", 160},
	{L"ndash", 8211},
	{L"ne", 8800},
	{L"ni", 8715},
	{L"not", 172},
	{L"notin", 8713},
	{L"nsub", 8836},
	{L"ntilde", 241},
	{L"nu", 957},
	{L"oacute", 243},
	{L"ocirc", 244},
	{L"oelig", 339},
	{L"ograve", 242},
	{L"oline", 8254},
	{L"omega", 969},
	{L"omicron", 959},
	{L"oplus", 8853},
	{L"or", 8744},
	{L"ordf", 170},
	{L"ordm", 186},
	{L"oslash", 248},
	{L"otilde", 245},
	{L"otimes", 8855},
	{L"ouml", 246},
	{L"para", 182},
	{L"part", 8706},
	{L"permil", 8240},
	{L"perp", 8869},
	{L"phi", 966},
	{L"pi", 960},
	{L"piv", 982},
	{L"plusmn", 177},
	{L"pound", 163},
	{L"prime", 8242},
	{L"prod", 8719},
	{L"prop", 8733},
	{L"psi", 968},
	{L"quad", 8193},
	{L"quot", 34},
	{L"rArr", 8658},
	{L"radic", 8730},
	{L"rang", 9002},
	{L"raquo", 187},
	{L"rarr", 8594},
	{L"rceil", 8969},
	{L"rdquo", 8221},
	{L"real", 8476},
	{L"reg", 174},
	{L"rfloor", 8971},
	{L"rho", 961},
	{L"rlm", 8207},
	{L"rsaquo", 8250},
	{L"rsquo", 8217},
	{L"sbquo", 8218},
	{L"scaron", 353},
	{L"sdot", 8901},
	{L"sect", 167},
	{L"shy", 173},
	{L"sigma", 963},
	{L"sigmaf", 962},
	{L"sim", 8764},
	{L"sp", 8194},
	{L"spades", 9824},
	{L"sub", 8834},
	{L"sube", 8838},
	{L"sum", 8721},
	{L"sup", 8835},
	{L"sup1", 185},
	{L"sup2", 178},
	{L"sup3", 179},
	{L"supe", 8839},
	{L"szlig", 223},
	{L"tau", 964},
	{L"there4", 8756},
	{L"theta", 952},
	{L"thetasym", 977},
	{L"thinsp", 8201},
	{L"thorn", 254},
	{L"tilde", 732},
	{L"times", 215},
	{L"trade", 8482},
	{L"uArr", 8657},
	{L"uacute", 250},
	{L"uarr", 8593},
	{L"ucirc", 251},
	{L"ugrave", 249},
	{L"uml", 168},
	{L"upsih", 978},
	{L"upsilon", 965},
	{L"uuml", 252},
	{L"varepsilon", 8712},
	{L"varphi", 981},
	{L"varpi", 982},
	{L"varrho", 1009},
	{L"vdots", 8942},
	{L"vsigma", 962},
	{L"vtheta", 977},
	{L"weierp", 8472},
	{L"xi", 958},
	{L"yacute", 253},
	{L"yen", 165},
	{L"yuml", 255},
	{L"zeta", 950},
	{L"zwj", 8205},
	{L"zwnj", 8204}
};
#define NCHARTAB (sizeof(chartab)/sizeof(chartab[0]))

// Characters Winstart..Winend are those that Windows
// uses interpolated into the Latin1 set.
// They aren't supposed to appear in HTML, but they do....
enum {
	Winstart = 127,
	Winend = 159
};

// One replacement code point per character in Winstart..Winend.
static int	winchars[]= { 8226,	// 8226 is a bullet
	8226, 8226, 8218, 402, 8222, 8230, 8224, 8225,
	710, 8240, 352, 8249, 338, 8226, 8226, 8226,
	8226, 8216, 8217, 8220, 8221, 8226, 8211, 8212,
	732, 8482, 353, 8250, 339, 8226, 8226, 376};

static StringInt*	tagtable;		// initialized from tagnames
static StringInt*	attrtable;		// initialized from attrnames

static void		lexinit(void);
static int		getplaindata(TokenSource* ts, Token* a, int* pai);
static int		getdata(TokenSource* ts, int firstc, int starti, Token* a, int* pai);
static int		getscriptdata(TokenSource* ts, int firstc, int starti, Token* a, int* pai, int findtag);
static int		gettag(TokenSource* ts, int starti, Token* a, int* pai);
static Rune*	buftostr(Rune* s, Rune* buf, int j);
static int		comment(TokenSource* ts);
static int		findstr(TokenSource* ts, Rune* s);
static int		ampersand(TokenSource* ts);
static int		lowerc(int c);
static int		getchar(TokenSource* ts);
static void		ungetchar(TokenSource* ts, int c);
static void		backup(TokenSource* ts, int savei);
static void		freeinsidetoken(Token* t);
static void		freeattrs(Attr* ahead);
static Attr*	newattr(int attid, Rune* value, Attr* link);
static int		Tconv(Fmt* f);

int	dbglex = 0;
static int lexinited = 0;

// One-time setup: build the tag and attribute lookup tables from
// tagnames/attrnames and install the %T print verb used by the
// debugging output below.
static void
lexinit(void)
{
	tagtable = _makestrinttab(tagnames, Numtags);
	attrtable = _makestrinttab(attrnames, Numattrs);
	fmtinstall('T', Tconv);
	lexinited = 1;
}

// Allocate a TokenSource positioned at the start of data[0:edata].
// chset must be one of the four character sets asserted below.
// The caller frees the result; data itself is only referenced, not copied.
static TokenSource*
newtokensource(uchar* data, int edata, int chset, int mtype)
{
	TokenSource*	ans;

	assert(chset == US_Ascii || chset == ISO_8859_1 ||
			chset == UTF_8 || chset == Unicode);
	ans = (TokenSource*)emalloc(sizeof(TokenSource));
	ans->i = 0;
	ans->data = data;
	ans->edata = edata;
	ans->chset = chset;
	ans->mtype = mtype;
	return ans;
}

enum {
	ToksChunk = 500
};

// Make sure there is room for a token at a[ai]: when ai has reached
// the current capacity *palen, grow the array by ToksChunk entries.
// Returns the (possibly moved) array.
static Token*
growtoks(Token* a, int ai, int* palen)
{
	if(ai == *palen) {
		a = (Token*)erealloc(a, (*palen+ToksChunk)*sizeof(Token));
		*palen += ToksChunk;
	}
	return a;
}

// Call this to get the tokens.
//  The number of returned tokens is returned in *plen.
//  data[0:datalen] is lexed as HTML when mtype is TextHtml and
//  as plain text (one Data token per line) otherwise.
//  Returns the token array, or nil (with *plen == 0) when no
//  tokens were produced.
Token*
_gettoks(uchar* data, int datalen, int chset, int mtype, int* plen)
{
	TokenSource*	ts;
	Token*		a;
	int	alen;
	int	ai;
	int	starti;
	int	c;
	int	tag;

	if(!lexinited)
		lexinit();
	ts = newtokensource(data, datalen, chset, mtype);
	alen = ToksChunk;
	a = (Token*)emalloc(alen * sizeof(Token));
	ai = 0;
	if(dbglex)
		fprint(2, "_gettoks starts, ts.i=%d, ts.edata=%d\n", ts->i, ts->edata);
	if(ts->mtype == TextHtml) {
		for(;;) {
			a = growtoks(a, ai, &alen);
			starti = ts->i;
			c = getchar(ts);
			if(c < 0)
				break;
			if(c == '<') {
				tag = gettag(ts, starti, a, &ai);
				if(tag == Tscript || tag == Tstyle) {
					// special rules for getting Data after....
					// gettag may have used the last free slot, so
					// check capacity again before the second append
					// of this iteration.
					a = growtoks(a, ai, &alen);
					starti = ts->i;
					c = getchar(ts);
					tag = getscriptdata(ts, c, starti, a, &ai, tag);
				}
			}
			else
				tag = getdata(ts, c, starti, a, &ai);
			if(tag == -1)
				break;
			else if(dbglex > 1 && tag != Comment)
				fprint(2, "lex: got token %T\n", &a[ai-1]);
		}
	}
	else {
		// plain text (non-html) tokens
		for(;;) {
			a = growtoks(a, ai, &alen);
			tag = getplaindata(ts, a, &ai);
			if(tag == -1)
				break;
			// getplaindata has already bumped ai past the new token
			if(dbglex > 1)
				fprint(2, "lex: got token %T\n", &a[ai-1]);
		}
	}
	free(ts);
	if(dbglex)
		fprint(2, "lex: returning %d tokens\n", ai);
	*plen = ai;
	if(ai == 0) {
		// nothing to hand back: don't leak the empty array
		free(a);
		return nil;
	}
	return a;
}

// For case where source isn't HTML.
// Just make data tokens, one per line (or partial line,
// at end of buffer), ignoring non-whitespace control
// characters and dumping \r's.
// If find non-empty token, fill in a[*pai], bump *pai, and return Data.
// Otherwise return -1;
static int
getplaindata(TokenSource* ts, Token* a, int* pai)
{
	Rune*	s;
	int	j;
	int	starti;
	int	c;
	Token*	tok;
	Rune	buf[BIGBUFSIZE];

	s = nil;
	j = 0;
	starti = ts->i;
	for(c = getchar(ts); c >= 0; c = getchar(ts)) {
		if(c < ' ') {
			if(isspace(c)) {
				if(c == '\r') {
					// ignore it unless no following '\n',
					// in which case treat it like '\n'
					c = getchar(ts);
					if(c != '\n') {
						if(c >= 0)
							ungetchar(ts, c);
						c = '\n';
					}
				}
			}
			else
				c = 0;
		}
		if(c != 0) {
			buf[j++] = c;
			// Flush when only the slot for buftostr's terminator is left.
			// The limit is counted in Runes, not bytes: sizeof(buf) alone
			// would let j run past the end of buf.
			if(j == (sizeof(buf)/sizeof(buf[0]))-1) {
				s = buftostr(s, buf, j);
				j = 0;
			}
		}
		if(c == '\n')
			break;
	}
	s = buftostr(s, buf, j);
	if(s == nil)
		return -1;
	tok = &a[(*pai)++];
	tok->tag = Data;
	tok->text = s;
	tok->attr = nil;
	tok->starti = starti;
	return Data;
}

// Return concatenation of s and buf[0:j]
// Writes a terminating 0 at buf[j], so buf must have room for j+1 Runes.
// NOTE(review): when s is non-nil the old s is replaced by the result of
// _Strdup2; if _Strdup2 does not free its first argument the old string
// leaks here -- confirm against its definition.
static Rune*
buftostr(Rune* s, Rune* buf, int j)
{
	buf[j] = 0;
	if(s == nil)
		s = _Strndup(buf, j);
	else 
		s = _Strdup2(s, buf);
	return s;
}

// Gather data up to next start-of-tag or end-of-buffer.
// Translate entity references (&amp;).
// Ignore non-whitespace control characters and get rid of \r's.
// If find non-empty token, fill in a[*pai], bump *pai, and return Data.
// Otherwise return -1;
static int
getdata(TokenSource* ts, int firstc, int starti, Token* a, int* pai)
{
	Rune*	s;
	int	j;
	int	c;
	Token*	tok;
	Rune	buf[BIGBUFSIZE];

	s = nil;
	j = 0;
	c = firstc;
	while(c >= 0) {
		if(c == '&') {
			c = ampersand(ts);
lex.c - 源码说明

本页面展示了「这是一个同样来自贝尔实验室的和UNIX有着渊源的操作系统, 其简洁的设计和实现易于我们学习和理解」中的 lex.c 源码文件，采用 C语言编程语言编写，共 1,498 行代码。您可以在线阅读完整代码内容，也可以返回资源详情页下载完整源码包进行本地学习和开发。
虫虫下载站收录了大量与UNIX相关的技术资源，包括源代码、技术文档、电路图等，是电子工程师和嵌入式开发者的专业学习平台。
⌨️ 快捷键说明

复制代码Ctrl + C
搜索代码Ctrl + F
全屏模式F11
增大字号Ctrl + =
减小字号Ctrl + -
显示快捷键?