📄 regcomp.c
字号:
ret = regnode(BRANCH); chain = NULL; while (regparse < regxend && *regparse != '|' && *regparse != ')') { latest = regpiece(&flags); if (latest == NULL) return(NULL); *flagp |= flags&HASWIDTH; if (chain == NULL) /* First piece. */ *flagp |= flags&SPSTART; else regtail(chain, latest); chain = latest; } if (chain == NULL) /* Loop ran zero times. */ (void) regnode(NOTHING); return(ret);}/* - regpiece - something followed by possible [*+?] * * Note that the branching code sequences used for ? and the general cases * of * and + are somewhat optimized: they use the same NOTHING node as * both the endmarker for their branch list and the body of the last branch. * It might seem that this node could be dispensed with entirely, but the * endmarker role is not redundant. */static char *regpiece(flagp)int *flagp;{ register char *ret; register char op; register char *next; int flags; char *origparse = regparse; int orignpar = regnpar; char *max; int iter; char ch; ret = regatom(&flags); if (ret == NULL) return(NULL); op = *regparse; /* Here's a total kludge: if after the atom there's a {\d+,?\d*} * then we decrement the first number by one and reset our * parsing back to the beginning of the same atom. If the first number * is down to 0, decrement the second number instead and fake up * a ? after it. Given the way this compiler doesn't keep track * of offsets on the first pass, this is the only way to replicate * a piece of code. Sigh. */ if (op == '{' && regcurly(regparse)) { next = regparse + 1; max = Nullch; while (isDIGIT(*next) || *next == ',') { if (*next == ',') { if (max) break; else max = next; } next++; } if (*next == '}') { /* got one */ if (!max) max = next; regparse++; iter = atoi(regparse); if (flags&SIMPLE) { /* we can do it right after all */ int tmp; reginsert(CURLY, ret); if (iter > 0) *flagp = (WORST|HASWIDTH); if (*max == ',') max++; else max = regparse; tmp = atoi(max); if (!tmp && *max != '0') tmp = 32767; /* meaning "infinity" */ if (tmp && tmp < iter) fatal("Can't do {n,m} with n > m"); if (regcode != ®dummy) {#ifdef REGALIGN *(unsigned short *)(ret+3) = iter; *(unsigned short *)(ret+5) = tmp;#else ret[3] = iter >> 8; ret[4] = iter & 0377; ret[5] = tmp >> 8; ret[6] = tmp & 0377;#endif } regparse = next; goto nest_check; } regsawbracket++; /* remember we clobbered exp */ if (iter > 0) { ch = *max; sprintf(regparse,"%.*d", max-regparse, iter - 1); *max = ch; if (*max == ',' && max[1] != '}') { if (atoi(max+1) <= 0) fatal("Can't do {n,m} with n > m"); ch = *next; sprintf(max+1,"%.*d", next-(max+1), atoi(max+1) - 1); *next = ch; } if (iter != 1 || *max == ',') { regparse = origparse; /* back up input pointer */ regnpar = orignpar; /* don't make more parens */ } else { regparse = next; goto nest_check; } *flagp = flags; return ret; } if (*max == ',') { max++; iter = atoi(max); if (max == next) { /* any number more? */ regparse = next; op = '*'; /* fake up one with a star */ } else if (iter > 0) { op = '?'; /* fake up optional atom */ ch = *next; sprintf(max,"%.*d", next-max, iter - 1); *next = ch; if (iter == 1) regparse = next; else { regparse = origparse - 1; /* offset ++ below */ regnpar = orignpar; } } else fatal("Can't do {n,0}"); } else fatal("Can't do {0}"); } } if (!ISMULT1(op)) { *flagp = flags; return(ret); } if (!(flags&HASWIDTH) && op != '?') FAIL("regexp *+ operand could be empty"); *flagp = (op != '+') ? (WORST|SPSTART) : (WORST|HASWIDTH); if (op == '*' && (flags&SIMPLE)) reginsert(STAR, ret); else if (op == '*') { /* Emit x* as (x&|), where & means "self". */ reginsert(BRANCH, ret); /* Either x */ regoptail(ret, regnode(BACK)); /* and loop */ regoptail(ret, ret); /* back */ regtail(ret, regnode(BRANCH)); /* or */ regtail(ret, regnode(NOTHING)); /* null. */ } else if (op == '+' && (flags&SIMPLE)) reginsert(PLUS, ret); else if (op == '+') { /* Emit x+ as x(&|), where & means "self". */ next = regnode(BRANCH); /* Either */ regtail(ret, next); regtail(regnode(BACK), ret); /* loop back */ regtail(next, regnode(BRANCH)); /* or */ regtail(ret, regnode(NOTHING)); /* null. */ } else if (op == '?') { /* Emit x? as (x|) */ reginsert(BRANCH, ret); /* Either x */ regtail(ret, regnode(BRANCH)); /* or */ next = regnode(NOTHING); /* null. */ regtail(ret, next); regoptail(ret, next); } nest_check: regparse++; if (ISMULT2(regparse)) FAIL("nested *?+ in regexp"); return(ret);}/* - regatom - the lowest level * * Optimization: gobbles an entire sequence of ordinary characters so that * it can turn them into a single node, which is smaller to store and * faster to run. Backslashed characters are exceptions, each becoming a * separate node; the code is simpler that way and it's not worth fixing. * * [Yes, it is worth fixing, some scripts can run twice the speed.] */static char *regatom(flagp)int *flagp;{ register char *ret; int flags; *flagp = WORST; /* Tentatively. */ switch (*regparse++) { case '^': ret = regnode(BOL); break; case '$': ret = regnode(EOL); break; case '.': ret = regnode(ANY); *flagp |= HASWIDTH|SIMPLE; break; case '[': ret = regclass(); *flagp |= HASWIDTH|SIMPLE; break; case '(': ret = reg(1, &flags); if (ret == NULL) return(NULL); *flagp |= flags&(HASWIDTH|SPSTART); break; case '|': case ')': FAIL("internal urp in regexp"); /* Supposed to be caught earlier. */ break; case '?': case '+': case '*': FAIL("?+* follows nothing in regexp"); break; case '\\': switch (*regparse) { case 'w': ret = regnode(ALNUM); *flagp |= HASWIDTH|SIMPLE; regparse++; break; case 'W': ret = regnode(NALNUM); *flagp |= HASWIDTH|SIMPLE; regparse++; break; case 'b': ret = regnode(BOUND); *flagp |= SIMPLE; regparse++; break; case 'B': ret = regnode(NBOUND); *flagp |= SIMPLE; regparse++; break; case 's': ret = regnode(SPACE); *flagp |= HASWIDTH|SIMPLE; regparse++; break; case 'S': ret = regnode(NSPACE); *flagp |= HASWIDTH|SIMPLE; regparse++; break; case 'd': ret = regnode(DIGIT); *flagp |= HASWIDTH|SIMPLE; regparse++; break; case 'D': ret = regnode(NDIGIT); *flagp |= HASWIDTH|SIMPLE; regparse++; break; case 'n': case 'r': case 't': case 'f': case 'e': case 'a': case 'x': case 'c': case '0': goto defchar; case '1': case '2': case '3': case '4': case '5': case '6': case '7': case '8': case '9': { int num = atoi(regparse); if (num > 9 && num >= regnpar) goto defchar; else { regsawback = 1; ret = reganode(REF, num); while (isDIGIT(*regparse)) regparse++; *flagp |= SIMPLE; } } break; case '\0': if (regparse >= regxend) FAIL("trailing \\ in regexp"); /* FALL THROUGH */ default: goto defchar; } break; default: { register int len; register char ender; register char *p; char *oldp; int numlen; defchar: ret = regnode(EXACTLY); regc(0); /* save spot for len */ for (len=0, p=regparse-1; len < 127 && p < regxend; len++) { oldp = p; switch (*p) { case '^': case '$': case '.': case '[': case '(': case ')': case '|': goto loopdone; case '\\': switch (*++p) { case 'w': case 'W': case 'b': case 'B': case 's': case 'S': case 'd': case 'D': --p; goto loopdone; case 'n': ender = '\n'; p++; break; case 'r': ender = '\r'; p++; break; case 't': ender = '\t'; p++; break; case 'f': ender = '\f'; p++; break; case 'e': ender = '\033'; p++; break; case 'a': ender = '\007'; p++; break; case 'x': ender = scanhex(++p, 2, &numlen); p += numlen; break; case 'c': p++; ender = *p++; if (isLOWER(ender)) ender = toupper(ender); ender ^= 64; break; case '0': case '1': case '2': case '3':case '4': case '5': case '6': case '7': case '8':case '9': if (*p == '0' || (isDIGIT(p[1]) && atoi(p) >= regnpar) ) { ender = scanoct(p, 3, &numlen); p += numlen; } else { --p; goto loopdone; } break; case '\0': if (p >= regxend) FAIL("trailing \\ in regexp"); /* FALL THROUGH */ default: ender = *p++; break; } break; default: ender = *p++; break; } if (regfold && isUPPER(ender)) ender = tolower(ender); if (ISMULT2(p)) { /* Back off on ?+*. */ if (len) p = oldp; else { len++; regc(ender); } break; } regc(ender); } loopdone: regparse = p; if (len <= 0) FAIL("internal disaster in regexp"); *flagp |= HASWIDTH; if (len == 1) *flagp |= SIMPLE; if (regcode != ®dummy) *OPERAND(ret) = len; regc('\0'); } break; } return(ret);}static voidregset(bits,def,c)char *bits;int def;register int c;{ if (regcode == ®dummy) return; c &= 255; if (def) bits[c >> 3] &= ~(1 << (c & 7)); else bits[c >> 3] |= (1 << (c & 7));}static char *regclass(){ register char *bits; register int class; register int lastclass; register int range = 0; register char *ret; register int def; int numlen; ret = regnode(ANYOF); if (*regparse == '^') { /* Complement of range. */ regparse++; def = 0; } else { def = 255; } bits = regcode; for (class = 0; class < 32; class++) regc(def); if (*regparse == ']' || *regparse == '-') goto skipcond; /* allow 1st char to be ] or - */ while (regparse < regxend && *regparse != ']') { skipcond: class = UCHARAT(regparse++); if (class == '\\') { class = UCHARAT(regparse++); switch (class) { case 'w': for (class = 0; class < 256; class++) if (isALNUM(class))
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -