📄 research.cxx
字号:
case '*': /* match 0 or more... */
case '+': /* match 1 or more... */
if (p == pat)
return badpat("Empty closure");
lp = sp; /* previous opcode */
if (*lp == CLO) /* equivalence... */
break;
switch(*lp) {
case BOL:
case BOT:
case EOT:
case BOW:
case EOW:
case REF:
return badpat("Illegal closure");
default:
break;
}
if (*p == '+')
for (sp = mp; lp < sp; lp++)
*mp++ = *lp;
*mp++ = END;
*mp++ = END;
sp = mp;
while (--mp > lp)
*mp = mp[-1];
*mp = CLO;
mp = sp;
break;
case '\\': /* tags, backrefs... */
i++;
switch(*++p) {
case '<':
*mp++ = BOW;
break;
case '>':
if (*sp == BOW)
return badpat("Null pattern inside \\<\\>");
*mp++ = EOW;
break;
case '1':
case '2':
case '3':
case '4':
case '5':
case '6':
case '7':
case '8':
case '9':
n = *p-'0';
if (tagi > 0 && tagstk[tagi] == n)
return badpat("Cyclical reference");
if (tagc > n) {
*mp++ = static_cast<char>(REF);
*mp++ = static_cast<char>(n);
}
else
return badpat("Undetermined reference");
break;
case 'a':
case 'b':
case 'n':
case 'f':
case 'r':
case 't':
case 'v':
*mp++ = CHR;
*mp++ = escapeValue(*p);
break;
default:
if (!posix && *p == '(') {
if (tagc < MAXTAG) {
tagstk[++tagi] = tagc;
*mp++ = BOT;
*mp++ = static_cast<char>(tagc++);
}
else
return badpat("Too many \\(\\) pairs");
} else if (!posix && *p == ')') {
if (*sp == BOT)
return badpat("Null pattern inside \\(\\)");
if (tagi > 0) {
*mp++ = static_cast<char>(EOT);
*mp++ = static_cast<char>(tagstk[tagi--]);
}
else
return badpat("Unmatched \\)");
} else {
*mp++ = CHR;
*mp++ = *p;
}
}
break;
default : /* an ordinary char */
if (posix && *p == '(') {
if (tagc < MAXTAG) {
tagstk[++tagi] = tagc;
*mp++ = BOT;
*mp++ = static_cast<char>(tagc++);
}
else
return badpat("Too many () pairs");
} else if (posix && *p == ')') {
if (*sp == BOT)
return badpat("Null pattern inside ()");
if (tagi > 0) {
*mp++ = static_cast<char>(EOT);
*mp++ = static_cast<char>(tagstk[tagi--]);
}
else
return badpat("Unmatched )");
} else if (caseSensitive) {
*mp++ = CHR;
*mp++ = *p;
} else {
*mp++ = CCL;
mask = 0;
ChSetWithCase(*p, false);
for (n = 0; n < BITBLK; bittab[n++] = (char) 0)
*mp++ = static_cast<char>(mask ^ bittab[n]);
}
break;
}
sp = lp;
}
if (tagi > 0)
return badpat((posix ? "Unmatched (" : "Unmatched \\("));
*mp = END;
sta = OKP;
return 0;
}
/*
* RESearch::Execute:
* execute nfa to find a match.
*
* special cases: (nfa[0])
* BOL
* Match only once, starting from the
* beginning.
* CHR
* First locate the character without
* calling PMatch, and if found, call
* PMatch for the remaining string.
* END
* RESearch::Compile failed, poor luser did not
* check for it. Fail fast.
*
* If a match is found, bopat[0] and eopat[0] are set
* to the beginning and the end of the matched fragment,
* respectively.
*
*/
/*
 * Search the text range [lp, endp) for the compiled pattern held in nfa.
 *
 * ci    supplies the characters of the text being searched.
 * lp    position at which the search starts; also recorded as bol.
 * endp  position at which the search stops.
 *
 * Returns 1 when a match is found, after storing its start in bopat[0]
 * and its end in eopat[0]; returns 0 otherwise.
 */
int RESearch::Execute(CharacterIndexer &ci, int lp, int endp) {
    char *const program = nfa;
    int matchEnd = NOTFOUND;

    bol = lp;
    failure = 0;
    Clear();

    const char firstOp = *program;

    /* A program that starts with END never matches. */
    if (firstOp == END)
        return 0;

    if (firstOp == BOL) {
        /* Anchored pattern: a single attempt at the starting position. */
        matchEnd = PMatch(ci, lp, endp, program);
    } else if (firstOp == EOL) {
        /* Only a bare end-of-line pattern is handled here; it matches at endp. */
        if (program[1] != END)
            return 0;
        lp = endp;
        matchEnd = lp;
    } else {
        if (firstOp == CHR) {
            /* Skip ahead to the first occurrence of the leading literal. */
            const char wanted = program[1];
            while ((lp < endp) && (ci.CharAt(lp) != wanted))
                ++lp;
            if (lp >= endp)
                return 0;
        }
        /* Try each remaining start position until one matches. */
        for (; lp < endp; ++lp) {
            matchEnd = PMatch(ci, lp, endp, program);
            if (matchEnd != NOTFOUND)
                break;
        }
    }

    if (matchEnd == NOTFOUND)
        return 0;

    bopat[0] = lp;
    eopat[0] = matchEnd;
    return 1;
}
/*
* PMatch: internal routine for the hard part
*
* This code is partly snarfed from an early grep written by
* David Conroy. The backref and tag stuff, and various other
* innovations are by oz.
*
* special case optimizations: (nfa[n], nfa[n+1])
* CLO ANY
* We KNOW .* will match everything up to the
* end of line. Thus, directly go to the end of
* line, without recursive PMatch calls. As in
* the other closure cases, the remaining pattern
* must be matched by moving backwards on the
* string recursively, to find a match for xy
* (x is ".*" and y is the remaining pattern)
* where the match satisfies the LONGEST match for
* x followed by a match for y.
* CLO CHR
* We can again scan the string forward for the
* single char and at the point of failure, we
* execute the remaining nfa recursively, same as
* above.
*
* At the end of a successful match, bopat[n] and eopat[n]
* are set to the beginning and end of subpatterns matched
* by tagged expressions (n = 1 to 9).
*/
/* Error-reporting hook; in the code visible here it is only referenced from commented-out calls. */
extern void re_fail(char *,char);
/*
 * isinset(x,y): non-zero when the bit selected by character y is set in
 * the bit table x. BLKIND masks y before the shift that picks the byte,
 * and bitarr[y & BITIND] supplies the mask for the bit inside that byte.
 */
#define isinset(x,y) ((x)[((y)&BLKIND)>>3] & bitarr[(y)&BITIND])
/*
 * Skip values for CLO XXX: the number of program bytes to step over,
 * counted from the opcode that follows CLO, to get past the closure
 * operand and its trailing END.
 */
#define ANYSKIP 2 /* [CLO] ANY END */
#define CHRSKIP 3 /* [CLO] CHR chr END */
#define CCLSKIP 34 /* [CLO] CCL 32 bytes END */
/*
 * Run the compiled program starting at ap against the text starting at lp.
 *
 * ci    supplies the characters of the text being matched.
 * lp    position in the text where matching starts.
 * endp  end of the range being searched.
 * ap    current position in the compiled program.
 *
 * Returns the position just past the matched text, or NOTFOUND when the
 * program does not match at lp. BOT/EOT opcodes record tag boundaries in
 * bopat[] and eopat[] as a side effect; a malformed closure additionally
 * sets failure.
 */
int RESearch::PMatch(CharacterIndexer &ci, int lp, int endp, char *ap) {
    int op, c, n;
    int e;      /* result of matching the rest of the program after a CLO */
    int bp;     /* beginning of subpat... */
    int ep;     /* ending of subpat... */
    int are;    /* position where the closure started */

    while ((op = *ap++) != END)
        switch(op) {

        case CHR:
            /* Literal: the next program byte must equal the text char.
             * There is no end-of-range test on this opcode. */
            if (ci.CharAt(lp++) != *ap++)
                return NOTFOUND;
            break;
        case ANY:
            /* Any character, as long as one is left in the range. */
            if (lp++ >= endp)
                return NOTFOUND;
            break;
        case CCL:
            /* A character class consumes one character, so like ANY it
             * must not match at or beyond the end of the range. */
            if (lp >= endp)
                return NOTFOUND;
            c = ci.CharAt(lp++);
            if (!isinset(ap,c))
                return NOTFOUND;
            ap += BITBLK;
            break;
        case BOL:
            if (lp != bol)
                return NOTFOUND;
            break;
        case EOL:
            if (lp < endp)
                return NOTFOUND;
            break;
        case BOT:
            /* Start of a tagged group: the next byte is the tag number. */
            bopat[*ap++] = lp;
            break;
        case EOT:
            /* End of a tagged group: the next byte is the tag number. */
            eopat[*ap++] = lp;
            break;
        case BOW:
            /* Beginning of word: fail if the previous char (when there
             * is one) is a word char, or the current char is not. */
            if ((lp!=bol && iswordc(ci.CharAt(lp-1))) || !iswordc(ci.CharAt(lp)))
                return NOTFOUND;
            break;
        case EOW:
            /* End of word: needs a word char before lp and a non-word
             * char at lp. */
            if (lp==bol || !iswordc(ci.CharAt(lp-1)) || iswordc(ci.CharAt(lp)))
                return NOTFOUND;
            break;
        case REF:
            /* Back reference: the text at lp must repeat the text that
             * tag n captured. */
            n = *ap++;
            bp = bopat[n];
            ep = eopat[n];
            while (bp < ep)
                if (ci.CharAt(bp++) != ci.CharAt(lp++))
                    return NOTFOUND;
            break;
        case CLO:
            /* Closure: consume as many repetitions as possible, then
             * back off one character at a time until the rest of the
             * program matches. */
            are = lp;
            switch(*ap) {

            case ANY:
                while (lp < endp)
                    lp++;
                n = ANYSKIP;
                break;
            case CHR:
                c = *(ap+1);
                while ((lp < endp) && (c == ci.CharAt(lp)))
                    lp++;
                n = CHRSKIP;
                break;
            case CCL:
                while ((lp < endp) && isinset(ap+1,ci.CharAt(lp)))
                    lp++;
                n = CCLSKIP;
                break;
            default:
                failure = true;
                //re_fail("closure: bad nfa.", *ap);
                return NOTFOUND;
            }

            ap += n;

            while (lp >= are) {
                if ((e = PMatch(ci, lp, endp, ap)) != NOTFOUND)
                    return e;
                --lp;
            }
            return NOTFOUND;
        default:
            //re_fail("RESearch::Execute: bad nfa.", static_cast<char>(op));
            return NOTFOUND;
        }
    return lp;
}
/*
* RESearch::Substitute:
* substitute the matched portions of the src in dst.
*
* & substitute the entire matched pattern.
*
* \digit substitute a subpattern, with the given tag number.
* Tags are numbered from 1 to 9. If the particular
* tagged subpattern does not exist, null is substituted.
*/
/*
 * Expand the replacement template src into dst using the match and tag
 * positions stored in bopat[] / eopat[].
 *
 * ci   supplies the characters of the text that was matched.
 * src  NUL-terminated template: '&' inserts the whole match, a backslash
 *      followed by a digit inserts that tagged subpattern, a backslash
 *      followed by any other character inserts that character, and
 *      everything else is copied as is.
 * dst  output buffer; it is written without any length check, so the
 *      caller must make it large enough. NUL-terminated on success.
 *
 * Returns 1 on success. Returns 0 when src is empty, when no match is
 * recorded, or when a NUL character is met inside a range being copied.
 */
int RESearch::Substitute(CharacterIndexer &ci, char *src, char *dst) {
    /* Positions are int offsets and 0 is a valid one (Execute stores the
     * match start in bopat[0]), so "no match" is tested against NOTFOUND
     * instead of zero.
     * NOTE(review): assumes Clear() resets bopat[]/eopat[] to NOTFOUND - confirm. */
    if (!*src || bopat[0] == NOTFOUND)
        return 0;

    char c;
    while ((c = *src++) != 0) {
        int pin;

        if (c == '&') {
            pin = 0;
        } else if (c == '\\') {
            c = *src;
            if (c == 0)
                break;      /* trailing backslash: do not step past the terminator */
            src++;
            if (c < '0' || c > '9') {
                *dst++ = c; /* escaped ordinary character */
                continue;
            }
            pin = c - '0';
        } else {
            *dst++ = c;
            continue;
        }

        int bp = bopat[pin];
        const int ep = eopat[pin];
        if (bp != NOTFOUND && ep != NOTFOUND) {
            /* Copy the tagged text, stopping early at a NUL character. */
            while (bp < ep && ci.CharAt(bp))
                *dst++ = ci.CharAt(bp++);
            if (bp < ep)
                return 0;
        }
    }
    *dst = '\0';
    return 1;
}
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -