📄 lex.c
字号:
#include <u.h>
#include <libc.h>
#include <draw.h>
#include <ctype.h>
#include <html.h>
#include "impl.h"

// Cursor over the raw input buffer handed to the lexer.
typedef struct TokenSource TokenSource;
struct TokenSource
{
	int	i;		// index of next byte to use
	uchar*	data;		// all the data
	int	edata;		// data[0:edata] is valid
	int	chset;		// one of US_Ascii, etc.
	int	mtype;		// TextHtml or TextPlain
};

// Negative sentinel values, distinct from any character code.
enum {
	EOF = -2,
	EOB = -1
};

// True for characters accepted in a name: letters, digits, '-' and '.'
// (only codes below 256 are considered).
#define ISNAMCHAR(c) ((c)<256 && (isalpha(c) || isdigit(c) || (c) == '-' || (c) == '.'))

#define SMALLBUFSIZE 240
#define BIGBUFSIZE 2000

// HTML 4.0 tag names.
// Keep sorted, and in correspondence with enum in iparse.h.
Rune* tagnames[] = {
	L" ",
	L"!",
	L"a",
	L"abbr",
	L"acronym",
	L"address",
	L"applet",
	L"area",
	L"b",
	L"base",
	L"basefont",
	L"bdo",
	L"big",
	L"blink",
	L"blockquote",
	L"body",
	L"bq",
	L"br",
	L"button",
	L"caption",
	L"center",
	L"cite",
	L"code",
	L"col",
	L"colgroup",
	L"dd",
	L"del",
	L"dfn",
	L"dir",
	L"div",
	L"dl",
	L"dt",
	L"em",
	L"fieldset",
	L"font",
	L"form",
	L"frame",
	L"frameset",
	L"h1",
	L"h2",
	L"h3",
	L"h4",
	L"h5",
	L"h6",
	L"head",
	L"hr",
	L"html",
	L"i",
	L"iframe",
	L"img",
	L"input",
	L"ins",
	L"isindex",
	L"kbd",
	L"label",
	L"legend",
	L"li",
	L"link",
	L"map",
	L"menu",
	L"meta",
	L"nobr",
	L"noframes",
	L"noscript",
	L"object",
	L"ol",
	L"optgroup",
	L"option",
	L"p",
	L"param",
	L"pre",
	L"q",
	L"s",
	L"samp",
	L"script",
	L"select",
	L"small",
	L"span",
	L"strike",
	L"strong",
	L"style",
	L"sub",
	L"sup",
	L"table",
	L"tbody",
	L"td",
	L"textarea",
	L"tfoot",
	L"th",
	L"thead",
	L"title",
	L"tr",
	L"tt",
	L"u",
	L"ul",
	L"var"
};

// HTML 4.0 attribute names.
// Keep sorted, and in correspondence with enum in impl.h.
Rune* attrnames[] = {
	L"abbr",
	L"accept-charset",
	L"access-key",
	L"action",
	L"align",
	L"alink",
	L"alt",
	L"archive",
	L"axis",
	L"background",
	L"bgcolor",
	L"border",
	L"cellpadding",
	L"cellspacing",
	L"char",
	L"charoff",
	L"charset",
	L"checked",
	L"cite",
	L"class",
	L"classid",
	L"clear",
	L"code",
	L"codebase",
	L"codetype",
	L"color",
	L"cols",
	L"colspan",
	L"compact",
	L"content",
	L"coords",
	L"data",
	L"datetime",
	L"declare",
	L"defer",
	L"dir",
	L"disabled",
	L"enctype",
	L"face",
	L"for",
	L"frame",
	L"frameborder",
	L"headers",
	L"height",
	L"href",
	L"hreflang",
	L"hspace",
	L"http-equiv",
	L"id",
	L"ismap",
	L"label",
	L"lang",
	L"link",
	L"longdesc",
	L"marginheight",
	L"marginwidth",
	L"maxlength",
	L"media",
	L"method",
	L"multiple",
	L"name",
	L"nohref",
	L"noresize",
	L"noshade",
	L"nowrap",
	L"object",
	L"onblur",
	L"onchange",
	L"onclick",
	L"ondblclick",
	L"onfocus",
	L"onkeypress",
	L"onkeyup",
	L"onload",
	L"onmousedown",
	L"onmousemove",
	L"onmouseout",
	L"onmouseover",
	L"onmouseup",
	L"onreset",
	L"onselect",
	L"onsubmit",
	L"onunload",
	L"profile",
	L"prompt",
	L"readonly",
	L"rel",
	L"rev",
	L"rows",
	L"rowspan",
	L"rules",
	L"scheme",
	L"scope",
	L"scrolling",
	L"selected",
	L"shape",
	L"size",
	L"span",
	L"src",
	L"standby",
	L"start",
	L"style",
	L"summary",
	L"tabindex",
	L"target",
	L"text",
	L"title",
	L"type",
	L"usemap",
	L"valign",
	L"value",
	L"valuetype",
	L"version",
	L"vlink",
	L"vspace",
	L"width"
};

// Character entity to unicode character number map.
// Keep sorted by name.
StringInt chartab[]= {
	{L"AElig", 198},
	{L"Aacute", 193},
	{L"Acirc", 194},
	{L"Agrave", 192},
	{L"Alpha", 913},
	{L"Aring", 197},
	{L"Atilde", 195},
	{L"Auml", 196},
	{L"Beta", 914},
	{L"Ccedil", 199},
	{L"Chi", 935},
	{L"Dagger", 8225},
	{L"Delta", 916},
	{L"ETH", 208},
	{L"Eacute", 201},
	{L"Ecirc", 202},
	{L"Egrave", 200},
	{L"Epsilon", 917},
	{L"Eta", 919},
	{L"Euml", 203},
	{L"Gamma", 915},
	{L"Iacute", 205},
	{L"Icirc", 206},
	{L"Igrave", 204},
	{L"Iota", 921},
	{L"Iuml", 207},
	{L"Kappa", 922},
	{L"Lambda", 923},
	{L"Mu", 924},
	{L"Ntilde", 209},
	{L"Nu", 925},
	{L"OElig", 338},
	{L"Oacute", 211},
	{L"Ocirc", 212},
	{L"Ograve", 210},
	{L"Omega", 937},
	{L"Omicron", 927},
	{L"Oslash", 216},
	{L"Otilde", 213},
	{L"Ouml", 214},
	{L"Phi", 934},
	{L"Pi", 928},
	{L"Prime", 8243},
	{L"Psi", 936},
	{L"Rho", 929},
	{L"Scaron", 352},
	{L"Sigma", 931},
	{L"THORN", 222},
	{L"Tau", 932},
	{L"Theta", 920},
	{L"Uacute", 218},
	{L"Ucirc", 219},
	{L"Ugrave", 217},
	{L"Upsilon", 933},
	{L"Uuml", 220},
	{L"Xi", 926},
	{L"Yacute", 221},
	{L"Yuml", 376},
	{L"Zeta", 918},
	{L"aacute", 225},
	{L"acirc", 226},
	{L"acute", 180},
	{L"aelig", 230},
	{L"agrave", 224},
	{L"alefsym", 8501},
	{L"alpha", 945},
	{L"amp", 38},
	{L"and", 8743},
	{L"ang", 8736},
	{L"aring", 229},
	{L"asymp", 8776},
	{L"atilde", 227},
	{L"auml", 228},
	{L"bdquo", 8222},
	{L"beta", 946},
	{L"brvbar", 166},
	{L"bull", 8226},
	{L"cap", 8745},
	{L"ccedil", 231},
	{L"cdots", 8943},
	{L"cedil", 184},
	{L"cent", 162},
	{L"chi", 967},
	{L"circ", 710},
	{L"clubs", 9827},
	{L"cong", 8773},
	{L"copy", 169},
	{L"crarr", 8629},
	{L"cup", 8746},
	{L"curren", 164},
	{L"dArr", 8659},
	{L"dagger", 8224},
	{L"darr", 8595},
	{L"ddots", 8945},
	{L"deg", 176},
	{L"delta", 948},
	{L"diams", 9830},
	{L"divide", 247},
	{L"eacute", 233},
	{L"ecirc", 234},
	{L"egrave", 232},
	{L"emdash", 8212},	/* non-standard but commonly used */
	{L"empty", 8709},
	{L"emsp", 8195},
	{L"endash", 8211},	/* non-standard but commonly used */
	{L"ensp", 8194},
	{L"epsilon", 949},
	{L"equiv", 8801},
	{L"eta", 951},
	{L"eth", 240},
	{L"euml", 235},
	{L"euro", 8364},
	{L"exist", 8707},
	{L"fnof", 402},
	{L"forall", 8704},
	{L"frac12", 189},
	{L"frac14", 188},
	{L"frac34", 190},
	{L"frasl", 8260},
	{L"gamma", 947},
	{L"ge", 8805},
	{L"gt", 62},
	{L"hArr", 8660},
	{L"harr", 8596},
	{L"hearts", 9829},
	{L"hellip", 8230},
	{L"iacute", 237},
	{L"icirc", 238},
	{L"iexcl", 161},
	{L"igrave", 236},
	{L"image", 8465},
	{L"infin", 8734},
	{L"int", 8747},
	{L"iota", 953},
	{L"iquest", 191},
	{L"isin", 8712},
	{L"iuml", 239},
	{L"kappa", 954},
	{L"lArr", 8656},
	{L"lambda", 955},
	{L"lang", 9001},
	{L"laquo", 171},
	{L"larr", 8592},
	{L"lceil", 8968},
	{L"ldots", 8230},
	{L"ldquo", 8220},
	{L"le", 8804},
	{L"lfloor", 8970},
	{L"lowast", 8727},
	{L"loz", 9674},
	{L"lrm", 8206},
	{L"lsaquo", 8249},
	{L"lsquo", 8216},
	{L"lt", 60},
	{L"macr", 175},
	{L"mdash", 8212},
	{L"micro", 181},
	{L"middot", 183},
	{L"minus", 8722},
	{L"mu", 956},
	{L"nabla", 8711},
	{L"nbsp", 160},
	{L"ndash", 8211},
	{L"ne", 8800},
	{L"ni", 8715},
	{L"not", 172},
	{L"notin", 8713},
	{L"nsub", 8836},
	{L"ntilde", 241},
	{L"nu", 957},
	{L"oacute", 243},
	{L"ocirc", 244},
	{L"oelig", 339},
	{L"ograve", 242},
	{L"oline", 8254},
	{L"omega", 969},
	{L"omicron", 959},
	{L"oplus", 8853},
	{L"or", 8744},
	{L"ordf", 170},
	{L"ordm", 186},
	{L"oslash", 248},
	{L"otilde", 245},
	{L"otimes", 8855},
	{L"ouml", 246},
	{L"para", 182},
	{L"part", 8706},
	{L"permil", 8240},
	{L"perp", 8869},
	{L"phi", 966},
	{L"pi", 960},
	{L"piv", 982},
	{L"plusmn", 177},
	{L"pound", 163},
	{L"prime", 8242},
	{L"prod", 8719},
	{L"prop", 8733},
	{L"psi", 968},
	{L"quad", 8193},
	{L"quot", 34},
	{L"rArr", 8658},
	{L"radic", 8730},
	{L"rang", 9002},
	{L"raquo", 187},
	{L"rarr", 8594},
	{L"rceil", 8969},
	{L"rdquo", 8221},
	{L"real", 8476},
	{L"reg", 174},
	{L"rfloor", 8971},
	{L"rho", 961},
	{L"rlm", 8207},
	{L"rsaquo", 8250},
	{L"rsquo", 8217},
	{L"sbquo", 8218},
	{L"scaron", 353},
	{L"sdot", 8901},
	{L"sect", 167},
	{L"shy", 173},
	{L"sigma", 963},
	{L"sigmaf", 962},
	{L"sim", 8764},
	{L"sp", 8194},
	{L"spades", 9824},
	{L"sub", 8834},
	{L"sube", 8838},
	{L"sum", 8721},
	{L"sup", 8835},
	{L"sup1", 185},
	{L"sup2", 178},
	{L"sup3", 179},
	{L"supe", 8839},
	{L"szlig", 223},
	{L"tau", 964},
	{L"there4", 8756},
	{L"theta", 952},
	{L"thetasym", 977},
	{L"thinsp", 8201},
	{L"thorn", 254},
	{L"tilde", 732},
	{L"times", 215},
	{L"trade", 8482},
	{L"uArr", 8657},
	{L"uacute", 250},
	{L"uarr", 8593},
	{L"ucirc", 251},
	{L"ugrave", 249},
	{L"uml", 168},
	{L"upsih", 978},
	{L"upsilon", 965},
	{L"uuml", 252},
	{L"varepsilon", 8712},
	{L"varphi", 981},
	{L"varpi", 982},
	{L"varrho", 1009},
	{L"vdots", 8942},
	{L"vsigma", 962},
	{L"vtheta", 977},
	{L"weierp", 8472},
	{L"xi", 958},
	{L"yacute", 253},
	{L"yen", 165},
	{L"yuml", 255},
	{L"zeta", 950},
	{L"zwj", 8205},
	{L"zwnj", 8204}
};
#define NCHARTAB (sizeof(chartab)/sizeof(chartab[0]))

// Characters Winstart..Winend are those that Windows
// uses interpolated into the Latin1 set.
// They aren't supposed to appear in HTML, but they do....
enum {
	Winstart = 127,
	Winend = 159
};

// Replacement code for each character in Winstart..Winend, in order.
static int winchars[]= {
	8226,	// 8226 is a bullet
	8226, 8226, 8218, 402, 8222, 8230, 8224, 8225, 710, 8240, 352, 8249, 338, 8226, 8226,
8226, 8226, 8216, 8217, 8220, 8221, 8226, 8211, 8212, 732, 8482, 353, 8250, 339, 8226, 8226, 376};static StringInt* tagtable; // initialized from tagnamesstatic StringInt* attrtable; // initialized from attrnamesstatic void lexinit();static int getplaindata(TokenSource* ts, Token* a, int* pai);static int getdata(TokenSource* ts, int firstc, int starti, Token* a, int* pai);static int getscriptdata(TokenSource* ts, int firstc, int starti, Token* a, int* pai, int findtag);static int gettag(TokenSource* ts, int starti, Token* a, int* pai);static Rune* buftostr(Rune* s, Rune* buf, int j);static int comment(TokenSource* ts);static int findstr(TokenSource* ts, Rune* s);static int ampersand(TokenSource* ts);static int lowerc(int c);static int getchar(TokenSource* ts);static void ungetchar(TokenSource* ts, int c);static void backup(TokenSource* ts, int savei);static void freeinsidetoken(Token* t);static void freeattrs(Attr* ahead);static Attr* newattr(int attid, Rune* value, Attr* link);static int Tconv(Fmt* f);int dbglex = 0;static int lexinited = 0;static voidlexinit(void){ tagtable = _makestrinttab(tagnames, Numtags); attrtable = _makestrinttab(attrnames, Numattrs); fmtinstall('T', Tconv); lexinited = 1;}static TokenSource*newtokensource(uchar* data, int edata, int chset, int mtype){ TokenSource* ans; assert(chset == US_Ascii || chset == ISO_8859_1 || chset == UTF_8 || chset == Unicode); ans = (TokenSource*)emalloc(sizeof(TokenSource)); ans->i = 0; ans->data = data; ans->edata = edata; ans->chset = chset; ans->mtype = mtype; return ans;}enum { ToksChunk = 500};// Call this to get the tokens.// The number of returned tokens is returned in *plen.Token*_gettoks(uchar* data, int datalen, int chset, int mtype, int* plen){ TokenSource* ts; Token* a; int alen; int ai; int starti; int c; int tag; if(!lexinited) lexinit(); ts = newtokensource(data, datalen, chset, mtype); alen = ToksChunk; a = (Token*)emalloc(alen * sizeof(Token)); ai = 0; if(dbglex) fprint(2, "_gettoks starts, 
ts.i=%d, ts.edata=%d\n", ts->i, ts->edata); if(ts->mtype == TextHtml) { for(;;) { if(ai == alen) { a = (Token*)erealloc(a, (alen+ToksChunk)*sizeof(Token)); alen += ToksChunk; } starti = ts->i; c = getchar(ts); if(c < 0) break; if(c == '<') { tag = gettag(ts, starti, a, &ai); if(tag == Tscript || tag == Tstyle) { // special rules for getting Data after.... starti = ts->i; c = getchar(ts); tag = getscriptdata(ts, c, starti, a, &ai, tag); } } else tag = getdata(ts, c, starti, a, &ai); if(tag == -1) break; else if(dbglex > 1 && tag != Comment) fprint(2, "lex: got token %T\n", &a[ai-1]); } } else { // plain text (non-html) tokens for(;;) { if(ai == alen) { a = (Token*)erealloc(a, (alen+ToksChunk)*sizeof(Token)); alen += ToksChunk; } tag = getplaindata(ts, a, &ai); if(tag == -1) break; if(dbglex > 1) fprint(2, "lex: got token %T\n", &a[ai]); } } free(ts); if(dbglex) fprint(2, "lex: returning %d tokens\n", ai); *plen = ai; if(ai == 0) return nil; return a;}// For case where source isn't HTML.// Just make data tokens, one per line (or partial line,// at end of buffer), ignoring non-whitespace control// characters and dumping \r's.// If find non-empty token, fill in a[*pai], bump *pai, and return Data.// Otherwise return -1;static intgetplaindata(TokenSource* ts, Token* a, int* pai){ Rune* s; int j; int starti; int c; Token* tok; Rune buf[BIGBUFSIZE]; s = nil; j = 0; starti = ts->i; for(c = getchar(ts); c >= 0; c = getchar(ts)) { if(c < ' ') { if(isspace(c)) { if(c == '\r') { // ignore it unless no following '\n', // in which case treat it like '\n' c = getchar(ts); if(c != '\n') { if(c >= 0) ungetchar(ts, c); c = '\n'; } } } else c = 0; } if(c != 0) { buf[j++] = c; if(j == sizeof(buf)-1) { s = buftostr(s, buf, j); j = 0; } } if(c == '\n') break; } s = buftostr(s, buf, j); if(s == nil) return -1; tok = &a[(*pai)++]; tok->tag = Data; tok->text = s; tok->attr = nil; tok->starti = starti; return Data;}// Return concatenation of s and buf[0:j]static Rune*buftostr(Rune* s, Rune* 
buf, int j){ buf[j] = 0; if(s == nil) s = _Strndup(buf, j); else s = _Strdup2(s, buf); return s;}// Gather data up to next start-of-tag or end-of-buffer.// Translate entity references (&).// Ignore non-whitespace control characters and get rid of \r's.// If find non-empty token, fill in a[*pai], bump *pai, and return Data.// Otherwise return -1;static intgetdata(TokenSource* ts, int firstc, int starti, Token* a, int* pai){ Rune* s; int j; int c; Token* tok; Rune buf[BIGBUFSIZE]; s = nil; j = 0; c = firstc; while(c >= 0) { if(c == '&') { c = ampersand(ts);
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -