📄 lex.c
字号:
if(c < 0) break; } else if(c < ' ') { if(isspace(c)) { if(c == '\r') { // ignore it unless no following '\n', // in which case treat it like '\n' c = getchar(ts); if(c != '\n') { if(c >= 0) ungetchar(ts, c); c = '\n'; } } } else { if(warn) fprint(2, "warning: non-whitespace control character %d ignored\n", c); c = 0; } } else if(c == '<') { ungetchar(ts, c); break; } if(c != 0) { buf[j++] = c; if(j == BIGBUFSIZE-1) { s = buftostr(s, buf, j); j = 0; } } c = getchar(ts); } s = buftostr(s, buf, j); if(s == nil) return -1; tok = &a[(*pai)++]; tok->tag = Data; tok->text = s; tok->attr = nil; tok->starti = starti; return Data;}// The rules for lexing scripts are different (ugh).// Gather up everything until see an "</" tagnames[tok] ">"static intgetscriptdata(TokenSource* ts, int firstc, int starti, Token* a, int* pai, int findtag){ Rune* s; int j; int tstarti; int savei; int c; int tag; int done; Token* tok; Rune buf[BIGBUFSIZE]; s = nil; j = 0; tstarti = starti; c = firstc; done = 0; while(c >= 0) { if(c == '<') { // other browsers ignore stuff to end of line after <! savei = ts->i; c = getchar(ts); if(c == '!') {// while(c >= 0 && c != '\n' && c != '\r')// c = getchar(ts); if(comment(ts) == -1) break; if(c == '\r') c = getchar(ts); if(c == '\n') c = getchar(ts); } else if(c >= 0) { backup(ts, savei); tag = gettag(ts, tstarti, a, pai); if(tag == -1) break; if(tag != Comment) (*pai)--; backup(ts, tstarti); if(tag == findtag + RBRA) { done = 1; break; } // here tag was not the one we were looking for, so take as regular data c = getchar(ts); } } if(c < 0) break; if(c != 0) { buf[j++] = c; if(j == BIGBUFSIZE-1) { s = buftostr(s, buf, j); j = 0; } } tstarti = ts->i; c = getchar(ts); } if(done || ts->i == ts->edata) { s = buftostr(s, buf, j); tok = &a[(*pai)++]; tok->tag = Data; tok->text = s; tok->attr = nil; tok->starti = starti; return Data; } backup(ts, starti); return -1;}// We've just seen a '<'. Gather up stuff to closing '>' (if buffer// ends before then, return -1).// If it's a tag, look up the name, gather the attributes, and return// the appropriate token.// Else it's either just plain data or some kind of ignorable stuff:// return Data or Comment as appropriate.// If it's not a Comment, put it in a[*pai] and bump *pai.static intgettag(TokenSource* ts, int starti, Token* a, int* pai){ int rbra; int ans; Attr* al; int nexti; int c; int ti; int afnd; int attid; int quote; Rune* val; int nv; int i; int tag; Token* tok; Rune buf[BIGBUFSIZE]; rbra = 0; nexti = ts->i; tok = &a[*pai]; tok->tag = Notfound; tok->text = nil; tok->attr = nil; tok->starti = starti; c = getchar(ts); if(c == '/') { rbra = RBRA; c = getchar(ts); } if(c < 0) goto eob_done; if(c >= 256 || !isalpha(c)) { // not a tag if(c == '!') { ans = comment(ts); if(ans != -1) return ans; goto eob_done; } else { backup(ts, nexti); tok->tag = Data; tok->text = _Strdup(L"<"); (*pai)++; return Data; } } // c starts a tagname buf[0] = c; i = 1; while(1) { c = getchar(ts); if(c < 0) goto eob_done; if(!ISNAMCHAR(c)) break; // if name is bigger than buf it won't be found anyway... if(i < BIGBUFSIZE) buf[i++] = c; } if(_lookup(tagtable, Numtags, buf, i, &tag)) tok->tag = tag + rbra; else tok->text = _Strndup(buf, i); // for warning print, in build // attribute gathering loop al = nil; while(1) { // look for "ws name" or "ws name ws = ws val" (ws=whitespace) // skip whitespaceattrloop_continue: while(c < 256 && isspace(c)) { c = getchar(ts); if(c < 0) goto eob_done; } if(c == '>') goto attrloop_done; if(c == '<') { if(warn) fprint(2, "warning: unclosed tag\n"); ungetchar(ts, c); goto attrloop_done; } if(c >= 256 || !isalpha(c)) { if(warn) fprint(2, "warning: expected attribute name\n"); // skipt to next attribute name while(1) { c = getchar(ts); if(c < 0) goto eob_done; if(c < 256 && isalpha(c)) goto attrloop_continue; if(c == '<') { if(warn) fprint(2, "warning: unclosed tag\n"); ungetchar(ts, 60); goto attrloop_done; } if(c == '>') goto attrloop_done; } } // gather attribute name buf[0] = c; i = 1; while(1) { c = getchar(ts); if(c < 0) goto eob_done; if(!ISNAMCHAR(c)) break; if(i < BIGBUFSIZE-1) buf[i++] = c; } afnd = _lookup(attrtable, Numattrs, buf, i, &attid); if(warn && !afnd) { buf[i] = 0; fprint(2, "warning: unknown attribute name %S\n", buf); } // skip whitespace while(c < 256 && isspace(c)) { c = getchar(ts); if(c < 0) goto eob_done; } if(c != '=') { if(afnd) al = newattr(attid, nil, al); goto attrloop_continue; } //# c is '=' here; skip whitespace while(1) { c = getchar(ts); if(c < 0) goto eob_done; if(c >= 256 || !isspace(c)) break; } quote = 0; if(c == '\'' || c == '"') { quote = c; c = getchar(ts); if(c < 0) goto eob_done; } val = nil; nv = 0; while(1) {valloop_continue: if(c < 0) goto eob_done; if(c == '>') { if(quote) { // c might be part of string (though not good style) // but if line ends before close quote, assume // there was an unmatched quote ti = ts->i; while(1) { c = getchar(ts); if(c < 0) goto eob_done; if(c == quote) { backup(ts, ti); buf[nv++] = '>'; if(nv == BIGBUFSIZE-1) { val = buftostr(val, buf, nv); nv = 0; } c = getchar(ts); goto valloop_continue; } if(c == '\n') { if(warn) fprint(2, "warning: apparent unmatched quote\n"); backup(ts, ti); c = '>'; goto valloop_done; } } } else goto valloop_done; } if(quote) { if(c == quote) { c = getchar(ts); if(c < 0) goto eob_done; goto valloop_done; } if(c == '\r') { c = getchar(ts); goto valloop_continue; } if(c == '\t' || c == '\n') c = ' '; } else { if(c < 256 && isspace(c)) goto valloop_done; } if(c == '&') { c = ampersand(ts); if(c == -1) goto eob_done; } buf[nv++] = c; if(nv == BIGBUFSIZE-1) { val = buftostr(val, buf, nv); nv = 0; } c = getchar(ts); }valloop_done: if(afnd) { val = buftostr(val, buf, nv); al = newattr(attid, val, al); } }attrloop_done: tok->attr = al; (*pai)++; return tok->tag;eob_done: if(warn) fprint(2, "warning: incomplete tag at end of page\n"); backup(ts, nexti); tok->tag = Data; tok->text = _Strdup(L"<"); return Data;}// We've just read a '<!' at position starti,// so this may be a comment or other ignored section, or it may// be just a literal string if there is no close before end of file// (other browsers do that).// The accepted practice seems to be (note: contrary to SGML spec!):// If see <!--, look for --> to close, or if none, > to close.// If see <!(not --), look for > to close.// If no close before end of file, leave original characters in as literal data.//// If we see ignorable stuff, return Comment.// Else return nil (caller should back up and try again when more data arrives,// unless at end of file, in which case caller should just make '<' a data token).static intcomment(TokenSource* ts){ int nexti; int havecomment; int c; nexti = ts->i; havecomment = 0; c = getchar(ts); if(c == '-') { c = getchar(ts); if(c == '-') { if(findstr(ts, L"-->")) havecomment = 1; else backup(ts, nexti); } } if(!havecomment) { if(c == '>') havecomment = 1; else if(c >= 0) { if(findstr(ts, L">")) havecomment = 1; } } if(havecomment) return Comment; return -1;}// Look for string s in token source.// If found, return 1, with buffer at next char after s,// else return 0 (caller should back up).static intfindstr(TokenSource* ts, Rune* s){ int c0; int n; int nexti; int i; int c; c0 = s[0]; n = runestrlen(s); while(1) { c = getchar(ts); if(c < 0) break; if(c == c0) { if(n == 1) return 1; nexti = ts->i; for(i = 1; i < n; i++) { c = getchar(ts); if(c < 0) goto mainloop_done; if(c != s[i]) break; } if(i == n) return 1; backup(ts, nexti); } }mainloop_done: return 0;}// We've just read an '&'; look for an entity reference// name, and if found, return translated char.// if there is a complete entity name but it isn't known,// back up to just past the '&' and return '&'.// If the entity can't be completed in the current buffer, back up// to the '&' and return -1.static intampersand(TokenSource* ts){ int savei; int c; int fnd; int ans; int v; int k; Rune buf[SMALLBUFSIZE]; savei = ts->i; c = getchar(ts); fnd = 0; ans = -1; if(c == '#') { c = getchar(ts); v = 0; if(c == 'X' || c == 'x') for(c = getchar(ts); c < 256; c = getchar(ts)) if(c >= '0' && c <= '9') v = v*16+c-'0'; else if(c >= 'A' && c<= 'F') v = v*16+c-'A'+10; else if(c >= 'a' && c <= 'f') v = v*16+c-'a'+10; else break; else while(c >= 0) { if(!(c < 256 && isdigit(c))) break; v = v*10 + c - 48; c = getchar(ts); } if(c >= 0) { if(!(c == ';' || c == '\n' || c == '\r')) ungetchar(ts, c); c = v; if(c == 160) c = 160; if(c >= Winstart && c <= Winend) { c = winchars[c - Winstart]; } ans = c; fnd = 1; } } else if(c < 256 && isalpha(c)) { buf[0] = c; k = 1; while(1) { c = getchar(ts); if(c < 0) break; if(c < 256 && (isalpha(c) || isdigit(c))) { if(k < SMALLBUFSIZE-1) buf[k++] = c; } else { if(!(c == ';' || c == '\n' || c == '\r')) ungetchar(ts, c); break; } } if(c >= 256 || c != '=' && !(isalpha(c) || isdigit(c))) fnd = _lookup(chartab, NCHARTAB, buf, k, &ans); } if(!fnd) { backup(ts, savei); ans = '&'; } return ans;}// Get next char, obeying ts.chset.// Returns -1 if no complete character left before current end of data.static intgetchar(TokenSource* ts){ uchar* buf; int c; int n; int ok; Rune r; if(ts->i >= ts->edata) return -1; buf = ts->data; c = buf[ts->i]; switch(ts->chset) { case ISO_8859_1: if(c >= Winstart && c <= Winend) c = winchars[c - Winstart]; ts->i++; break; case US_Ascii: if(c > 127) { if(warn) fprint(2, "non-ascii char (%x) when US-ASCII specified\n", c); } ts->i++; break; case UTF_8: ok = fullrune((char*)(buf+ts->i), ts->edata-ts->i); n = chartorune(&r, (char*)(buf+ts->i)); if(ok) { if(warn && c == 0x80) fprint(2, "warning: invalid utf-8 sequence (starts with %x)\n", ts->data[ts->i]); ts->i += n; c = r; } else { // not enough bytes in buf to complete utf-8 char ts->i = ts->edata; // mark "all used" c = -1; } break; case Unicode: if(ts->i < ts->edata - 1) { //standards say most-significant byte first c = (c << 8)|(buf[ts->i + 1]); ts->i += 2; } else { ts->i = ts->edata; // mark "all used" c = -1; } break; } return c;}// Assuming c was the last character returned by getchar, set// things up so that next getchar will get that same character// followed by the current 'next character', etc.static voidungetchar(TokenSource* ts, int c){ int n; Rune r; char a[UTFmax]; n = 1; switch(ts->chset) { case UTF_8: if(c >= 128) { r = c; n = runetochar(a, &r); } break; case Unicode: n = 2; break; } ts->i -= n;}// Restore ts so that it is at the state where the index was savei.static voidbackup(TokenSource* ts, int savei){ if(dbglex) fprint(2, "lex: backup; i=%d, savei=%d\n", ts->i, savei); ts->i = savei;}// Look for value associated with attribute attid in token t.// If there is one, return 1 and put the value in *pans,// else return 0.// If xfer is true, transfer ownership of the string to the caller// (nil it out here); otherwise, caller must duplicate the answer// if it needs to save it.// OK to have pans==0, in which case this is just looking// to see if token is present.int_tokaval(Token* t, int attid, Rune** pans, int xfer){ Attr* attr; attr = t->attr; while(attr != nil) { if(attr->attid == attid) { if(pans != nil) *pans = attr->value; if(xfer) attr->value = nil; return 1; } attr = attr->next; } if(pans != nil) *pans = nil; return 0;}static intTconv(Fmt *f){ Token* t; int i; int tag; char* srbra; Rune* aname; Rune* tname; Attr* a; char buf[BIGBUFSIZE]; t = va_arg(f->args, Token*); if(t == nil) sprint(buf, "<null>"); else { i = 0; if(dbglex > 1) i = snprint(buf, sizeof(buf), "[%d]", t->starti); tag = t->tag; if(tag == Data) { i += snprint(buf+i, sizeof(buf)-i-1, "'%S'", t->text); } else { srbra = ""; if(tag >= RBRA) { tag -= RBRA; srbra = "/"; } tname = tagnames[tag]; if(tag == Notfound) tname = L"?"; i += snprint(buf+i, sizeof(buf)-i-1, "<%s%S", srbra, tname); for(a = t->attr; a != nil; a = a->next) { aname = attrnames[a->attid]; i += snprint(buf+i, sizeof(buf)-i-1, " %S", aname); if(a->value != nil) i += snprint(buf+i, sizeof(buf)-i-1, "=%S", a->value); } i += snprint(buf+i, sizeof(buf)-i-1, ">"); } buf[i] = 0; } return fmtstrcpy(f, buf);}// Attrs own their constituent strings, but build may eventually// transfer some values to its items and nil them out in the Attr.static Attr*newattr(int attid, Rune* value, Attr* link){ Attr* ans; ans = (Attr*)emalloc(sizeof(Attr)); ans->attid = attid; ans->value = value; ans->next = link; return ans;}// Free list of Attrs linked through next fieldstatic voidfreeattrs(Attr* ahead){ Attr* a; Attr* nexta; a = ahead; while(a != nil) { nexta = a->next; free(a->value); free(a); a = nexta; }}// Free array of Tokens.// Allocated space might have room for more than n tokens,// but only n of them are initialized.// If caller has transferred ownership of constitutent strings// or attributes, it must have nil'd out the pointers in the Tokens.void_freetokens(Token* tarray, int n){ int i; Token* t; if(tarray == nil) return; for(i = 0; i < n; i++) { t = &tarray[i]; free(t->text); freeattrs(t->attr); } free(tarray);}
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -