📄 research.cxx
字号:
case '*': /* match 0 or more... */ case '+': /* match 1 or more... */ if (p == pat) return badpat("Empty closure"); lp = sp; /* previous opcode */ if (*lp == CLO) /* equivalence... */ break; switch(*lp) { case BOL: case BOT: case EOT: case BOW: case EOW: case REF: return badpat("Illegal closure"); default: break; } if (*p == '+') for (sp = mp; lp < sp; lp++) *mp++ = *lp; *mp++ = END; *mp++ = END; sp = mp; while (--mp > lp) *mp = mp[-1]; *mp = CLO; mp = sp; break; case '\\': /* tags, backrefs... */ i++; switch(*++p) { case '<': *mp++ = BOW; break; case '>': if (*sp == BOW) return badpat("Null pattern inside \\<\\>"); *mp++ = EOW; break; case '1': case '2': case '3': case '4': case '5': case '6': case '7': case '8': case '9': n = *p-'0'; if (tagi > 0 && tagstk[tagi] == n) return badpat("Cyclical reference"); if (tagc > n) { *mp++ = static_cast<char>(REF); *mp++ = static_cast<char>(n); } else return badpat("Undetermined reference"); break; case 'a': case 'b': case 'n': case 'f': case 'r': case 't': case 'v': *mp++ = CHR; *mp++ = escapeValue(*p); break; default: if (!posix && *p == '(') { if (tagc < MAXTAG) { tagstk[++tagi] = tagc; *mp++ = BOT; *mp++ = static_cast<char>(tagc++); } else return badpat("Too many \\(\\) pairs"); } else if (!posix && *p == ')') { if (*sp == BOT) return badpat("Null pattern inside \\(\\)"); if (tagi > 0) { *mp++ = static_cast<char>(EOT); *mp++ = static_cast<char>(tagstk[tagi--]); } else return badpat("Unmatched \\)"); } else { *mp++ = CHR; *mp++ = *p; } } break; default : /* an ordinary char */ if (posix && *p == '(') { if (tagc < MAXTAG) { tagstk[++tagi] = tagc; *mp++ = BOT; *mp++ = static_cast<char>(tagc++); } else return badpat("Too many () pairs"); } else if (posix && *p == ')') { if (*sp == BOT) return badpat("Null pattern inside ()"); if (tagi > 0) { *mp++ = static_cast<char>(EOT); *mp++ = static_cast<char>(tagstk[tagi--]); } else return badpat("Unmatched )"); } else if (caseSensitive) { *mp++ = CHR; *mp++ = *p; } else { *mp++ = CCL; 
mask = 0; ChSetWithCase(*p, false); for (n = 0; n < BITBLK; bittab[n++] = (char) 0) *mp++ = static_cast<char>(mask ^ bittab[n]); } break; } sp = lp; } if (tagi > 0) return badpat((posix ? "Unmatched (" : "Unmatched \\(")); *mp = END; sta = OKP; return 0;}/* * RESearch::Execute: * execute nfa to find a match. * * special cases: (nfa[0]) * BOL * Match only once, starting from the * beginning. * CHR * First locate the character without * calling PMatch, and if found, call * PMatch for the remaining string. * END * RESearch::Compile failed, poor luser did not * check for it. Fail fast. * * If a match is found, bopat[0] and eopat[0] are set * to the beginning and the end of the matched fragment, * respectively. * */int RESearch::Execute(CharacterIndexer &ci, int lp, int endp) { char c; int ep = NOTFOUND; char *ap = nfa; bol = lp; failure = 0; Clear(); switch(*ap) { case BOL: /* anchored: match from BOL only */ ep = PMatch(ci, lp, endp, ap); break; case EOL: /* just searching for end of line normal path doesn't work */ if (*(ap+1) == END) { lp = endp; ep = lp; break; } else { return 0; } case CHR: /* ordinary char: locate it fast */ c = *(ap+1); while ((lp < endp) && (ci.CharAt(lp) != c)) lp++; if (lp >= endp) /* if EOS, fail, else fall thru. */ return 0; default: /* regular matching all the way. */ while (lp < endp) { ep = PMatch(ci, lp, endp, ap); if (ep != NOTFOUND) break; lp++; } break; case END: /* munged automaton. fail always */ return 0; } if (ep == NOTFOUND) return 0; bopat[0] = lp; eopat[0] = ep; return 1;}/* * PMatch: internal routine for the hard part * * This code is partly snarfed from an early grep written by * David Conroy. The backref and tag stuff, and various other * innovations are by oz. * * special case optimizations: (nfa[n], nfa[n+1]) * CLO ANY * We KNOW .* will match everything upto the * end of line. Thus, directly go to the end of * line, without recursive PMatch calls. 
As in * the other closure cases, the remaining pattern * must be matched by moving backwards on the * string recursively, to find a match for xy * (x is ".*" and y is the remaining pattern) * where the match satisfies the LONGEST match for * x followed by a match for y. * CLO CHR * We can again scan the string forward for the * single char and at the point of failure, we * execute the remaining nfa recursively, same as * above. * * At the end of a successful match, bopat[n] and eopat[n] * are set to the beginning and end of subpatterns matched * by tagged expressions (n = 1 to 9). */extern void re_fail(char *,char);#define isinset(x,y) ((x)[((y)&BLKIND)>>3] & bitarr[(y)&BITIND])/* * skip values for CLO XXX to skip past the closure */#define ANYSKIP 2 /* [CLO] ANY END */#define CHRSKIP 3 /* [CLO] CHR chr END */#define CCLSKIP 34 /* [CLO] CCL 32 bytes END */int RESearch::PMatch(CharacterIndexer &ci, int lp, int endp, char *ap) { int op, c, n; int e; /* extra pointer for CLO */ int bp; /* beginning of subpat... */ int ep; /* ending of subpat... */ int are; /* to save the line ptr. 
*/ while ((op = *ap++) != END) switch(op) { case CHR: if (ci.CharAt(lp++) != *ap++) return NOTFOUND; break; case ANY: if (lp++ >= endp) return NOTFOUND; break; case CCL: c = ci.CharAt(lp++); if (!isinset(ap,c)) return NOTFOUND; ap += BITBLK; break; case BOL: if (lp != bol) return NOTFOUND; break; case EOL: if (lp < endp) return NOTFOUND; break; case BOT: bopat[*ap++] = lp; break; case EOT: eopat[*ap++] = lp; break; case BOW: if (lp!=bol && iswordc(ci.CharAt(lp-1)) || !iswordc(ci.CharAt(lp))) return NOTFOUND; break; case EOW: if (lp==bol || !iswordc(ci.CharAt(lp-1)) || iswordc(ci.CharAt(lp))) return NOTFOUND; break; case REF: n = *ap++; bp = bopat[n]; ep = eopat[n]; while (bp < ep) if (ci.CharAt(bp++) != ci.CharAt(lp++)) return NOTFOUND; break; case CLO: are = lp; switch(*ap) { case ANY: while (lp < endp) lp++; n = ANYSKIP; break; case CHR: c = *(ap+1); while ((lp < endp) && (c == ci.CharAt(lp))) lp++; n = CHRSKIP; break; case CCL: while ((lp < endp) && isinset(ap+1,ci.CharAt(lp))) lp++; n = CCLSKIP; break; default: failure = true; //re_fail("closure: bad nfa.", *ap); return NOTFOUND; } ap += n; while (lp >= are) { if ((e = PMatch(ci, lp, endp, ap)) != NOTFOUND) return e; --lp; } return NOTFOUND; default: //re_fail("RESearch::Execute: bad nfa.", static_cast<char>(op)); return NOTFOUND; } return lp;}/* * RESearch::Substitute: * substitute the matched portions of the src in dst. * * & substitute the entire matched pattern. * * \digit substitute a subpattern, with the given tag number. * Tags are numbered from 1 to 9. If the particular * tagged subpattern does not exist, null is substituted. 
*/
/*
 * Parameters:
 *	ci	supplies the characters of the searched text through
 *		ci.CharAt(pos).
 *	src	NUL-terminated replacement template.
 *	dst	output buffer that receives the expanded, NUL-terminated
 *		text.  No capacity is passed in, so the caller must
 *		provide a buffer that is large enough.
 *
 * A backslash followed by a non-digit copies that character literally.
 * A backslash that is the last character of src is dropped (the
 * template ends there); previously the scan ran on past the
 * terminator of src.
 *
 * Returns 1 on success.  Returns 0, without terminating dst, when src
 * is empty, when bopat[0] is zero, or when a NUL character is read
 * from ci before the end of a referenced span.
 *
 * NOTE(review): a stored position of 0 in bopat[]/eopat[] is treated
 * as "not set", so a match or tag that starts at offset 0 is not
 * substituted.  Which sentinel Clear() stores is not visible from this
 * function - confirm before changing these tests.
 */
int RESearch::Substitute(CharacterIndexer &ci, char *src, char *dst) {
	char c;

	if (!*src || !bopat[0])
		return 0;

	while ((c = *src++) != 0) {
		int pin = 0;

		switch (c) {

		case '&':
			/* whole match */
			pin = 0;
			break;

		case '\\':
			c = *src++;
			if (c == 0) {
				/* lone trailing backslash: the terminator has just been
				   consumed, so stop here instead of reading beyond it */
				*dst = '\0';
				return 1;
			}
			if (c >= '0' && c <= '9') {
				pin = c - '0';
				break;
			}
			/* escaped ordinary character: copy it literally */
			/* FALLTHROUGH */

		default:
			*dst++ = c;
			continue;
		}

		int bp = bopat[pin];
		if (bp == 0)
			continue;
		const int ep = eopat[pin];
		if (ep == 0)
			continue;

		/* test the bound first so the character at ep is never fetched */
		while (bp < ep && ci.CharAt(bp))
			*dst++ = ci.CharAt(bp++);
		if (bp < ep)
			return 0;	/* hit a NUL inside the span */
	}
	*dst = '\0';
	return 1;
}
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -