regexp.c
来自「开放源码的编译器open watcom 1.6.0版的源代码」· C语言 代码 · 共 1,192 行 · 第 1/3 页
C
1,192 行
register char* br;
register char* ender;
register int parno;
int flags;
*flagp = HASWIDTH; // Tentatively.
// Make an OPEN node, if parenthesized.
if (paren) {
if (regnpar >= NSUBEXP) {
//RAISE (Error, SYM(CoolRegexp), SYM(Too_Many_Parens),
printf ("CoolRegexp::compile(): Too many parentheses.\n");
abort ();
}
parno = regnpar;
regnpar++;
ret = regnode(OPEN + parno);
}
else
ret = NULL;
// Pick up the branches, linking them together.
br = regbranch(&flags);
if (br == NULL)
return (NULL);
if (ret != NULL)
regtail(ret, br); // OPEN -> first.
else
ret = br;
if (!(flags & HASWIDTH))
*flagp &= ~HASWIDTH;
*flagp |= flags & SPSTART;
while (*regparse == '|') {
regparse++;
br = regbranch(&flags);
if (br == NULL)
return (NULL);
regtail(ret, br); // BRANCH -> BRANCH.
if (!(flags & HASWIDTH))
*flagp &= ~HASWIDTH;
*flagp |= flags & SPSTART;
}
// Make a closing node, and hook it on the end.
ender = regnode((paren) ? CLOSE + parno : END);
regtail(ret, ender);
// Hook the tails of the branches to the closing node.
for (br = ret; br != NULL; br = regnext(br))
regoptail(br, ender);
// Check for proper termination.
if (paren && *regparse++ != ')') {
//RAISE (Error, SYM(CoolRegexp), SYM(Unmatched_Parens),
printf ("CoolRegexp::compile(): Unmatched parentheses.\n");
abort ();
}
else if (!paren && *regparse != '\0') {
if (*regparse == ')') {
//RAISE (Error, SYM(CoolRegexp), SYM(Unmatched_Parens),
printf ("CoolRegexp::compile(): Unmatched parentheses.\n");
abort ();
}
else {
//RAISE (Error, SYM(CoolRegexp), SYM(Internal_Error),
printf ("CoolRegexp::compile(): Internal error.\n");
abort ();
}
// NOTREACHED
}
return (ret);
}
/*
- regbranch - one alternative of an | operator
*
* Implements the concatenation operator.
*/
static char* regbranch (int *flagp) {
register char* ret;
register char* chain;
register char* latest;
int flags;
*flagp = WORST; // Tentatively.
ret = regnode(BRANCH);
chain = NULL;
while (*regparse != '\0' && *regparse != '|' && *regparse != ')') {
latest = regpiece(&flags);
if (latest == NULL)
return (NULL);
*flagp |= flags & HASWIDTH;
if (chain == NULL) // First piece.
*flagp |= flags & SPSTART;
else
regtail(chain, latest);
chain = latest;
}
if (chain == NULL) // Loop ran zero times.
regnode(NOTHING);
return (ret);
}
/*
- regpiece - something followed by possible [*+?]
*
* Note that the branching code sequences used for ? and the general cases
* of * and + are somewhat optimized: they use the same NOTHING node as
* both the endmarker for their branch list and the body of the last branch.
* It might seem that this node could be dispensed with entirely, but the
* endmarker role is not redundant.
*/
static char* regpiece (int *flagp) {
register char* ret;
register char op;
register char* next;
int flags;
ret = regatom(&flags);
if (ret == NULL)
return (NULL);
op = *regparse;
if (!ISMULT(op)) {
*flagp = flags;
return (ret);
}
if (!(flags & HASWIDTH) && op != '?') {
//RAISE (Error, SYM(CoolRegexp), SYM(Empty_Operand),
printf ("CoolRegexp::compile() : *+ operand could be empty.\n");
abort ();
}
*flagp = (op != '+') ? (WORST | SPSTART) : (WORST | HASWIDTH);
if (op == '*' && (flags & SIMPLE))
reginsert(STAR, ret);
else if (op == '*') {
// Emit x* as (x&|), where & means "self".
reginsert(BRANCH, ret); // Either x
regoptail(ret, regnode(BACK)); // and loop
regoptail(ret, ret); // back
regtail(ret, regnode(BRANCH)); // or
regtail(ret, regnode(NOTHING)); // null.
}
else if (op == '+' && (flags & SIMPLE))
reginsert(PLUS, ret);
else if (op == '+') {
// Emit x+ as x(&|), where & means "self".
next = regnode(BRANCH); // Either
regtail(ret, next);
regtail(regnode(BACK), ret); // loop back
regtail(next, regnode(BRANCH)); // or
regtail(ret, regnode(NOTHING)); // null.
}
else if (op == '?') {
// Emit x? as (x|)
reginsert(BRANCH, ret); // Either x
regtail(ret, regnode(BRANCH)); // or
next = regnode(NOTHING);// null.
regtail(ret, next);
regoptail(ret, next);
}
regparse++;
if (ISMULT(*regparse)) {
//RAISE (Error, SYM(CoolRegexp), SYM(Nested_Operand),
printf ("CoolRegexp::compile(): Nested *?+.\n");
abort ();
}
return (ret);
}
/*
- regatom - the lowest level
*
* Optimization: gobbles an entire sequence of ordinary characters so that
* it can turn them into a single node, which is smaller to store and
* faster to run. Backslashed characters are exceptions, each becoming a
* separate node; the code is simpler that way and it's not worth fixing.
*/
static char* regatom (int *flagp) {
register char* ret;
int flags;
*flagp = WORST; // Tentatively.
switch (*regparse++) {
case '^':
ret = regnode(BOL);
break;
case '$':
ret = regnode(EOL);
break;
case '.':
ret = regnode(ANY);
*flagp |= HASWIDTH | SIMPLE;
break;
case '[':{
register int rxpclass;
register int rxpclassend;
if (*regparse == '^') { // Complement of range.
ret = regnode(ANYBUT);
regparse++;
}
else
ret = regnode(ANYOF);
if (*regparse == ']' || *regparse == '-')
regc(*regparse++);
while (*regparse != '\0' && *regparse != ']') {
if (*regparse == '-') {
regparse++;
if (*regparse == ']' || *regparse == '\0')
regc('-');
else {
rxpclass = UCHARAT(regparse - 2) + 1;
rxpclassend = UCHARAT(regparse);
if (rxpclass > rxpclassend + 1) {
//RAISE (Error, SYM(CoolRegexp), SYM(Invalid_Range),
printf ("CoolRegexp::compile(): Invalid range in [].\n");
abort ();
}
for (; rxpclass <= rxpclassend; rxpclass++)
regc(rxpclass);
regparse++;
}
}
else
regc(*regparse++);
}
regc('\0');
if (*regparse != ']') {
//RAISE (Error, SYM(CoolRegexp), SYM(Unmatched_Bracket),
printf ("CoolRegexp::compile(): Unmatched [].\n");
abort ();
}
regparse++;
*flagp |= HASWIDTH | SIMPLE;
}
break;
case '(':
ret = reg(1, &flags);
if (ret == NULL)
return (NULL);
*flagp |= flags & (HASWIDTH | SPSTART);
break;
case '\0':
case '|':
case ')':
//RAISE (Error, SYM(CoolRegexp), SYM(Internal_Error),
printf ("CoolRegexp::compile(): Internal error.\n"); // Never here
abort ();
case '?':
case '+':
case '*':
//RAISE (Error, SYM(CoolRegexp), SYM(No_Operand),
printf ("CoolRegexp::compile(): ?+* follows nothing.\n");
abort ();
case '\\':
if (*regparse == '\0') {
//RAISE (Error, SYM(CoolRegexp), SYM(Trailing_Backslash),
printf ("CoolRegexp::compile(): Trailing backslash.\n");
abort ();
}
ret = regnode(EXACTLY);
regc(*regparse++);
regc('\0');
*flagp |= HASWIDTH | SIMPLE;
break;
default:{
register int len;
register char ender;
regparse--;
len = strcspn(regparse, META);
if (len <= 0) {
//RAISE (Error, SYM(CoolRegexp), SYM(Internal_Error),
printf ("CoolRegexp::compile(): Internal error.\n");
abort ();
}
ender = *(regparse + len);
if (len > 1 && ISMULT(ender))
len--; // Back off clear of ?+* operand.
*flagp |= HASWIDTH;
if (len == 1)
*flagp |= SIMPLE;
ret = regnode(EXACTLY);
while (len > 0) {
regc(*regparse++);
len--;
}
regc('\0');
}
break;
}
return (ret);
}
/*
- regnode - emit a node
Location.
*/
static char* regnode (char op) {
register char* ret;
register char* ptr;
ret = regcode;
if (ret == ®dummy) {
regsize += 3;
return (ret);
}
ptr = ret;
*ptr++ = op;
*ptr++ = '\0'; // Null "next" pointer.
*ptr++ = '\0';
regcode = ptr;
return (ret);
}
/*
- regc - emit (if appropriate) a byte of code
*/
static void regc (char b) {
if (regcode != ®dummy)
*regcode++ = b;
else
regsize++;
}
/*
- reginsert - insert an operator in front of already-emitted operand
*
* Means relocating the operand.
*/
static void reginsert (char op, char* opnd) {
register char* src;
register char* dst;
register char* place;
if (regcode == ®dummy) {
regsize += 3;
return;
}
src = regcode;
regcode += 3;
dst = regcode;
while (src > opnd)
*--dst = *--src;
place = opnd; // Op node, where operand used to be.
*place++ = op;
*place++ = '\0';
*place++ = '\0';
}
/*
- regtail - set the next-pointer at the end of a node chain
*/
static void regtail (char* p, const char* val) {
register char* scan;
register char* temp;
register int offset;
if (p == ®dummy)
return;
// Find last node.
scan = p;
for (;;) {
temp = regnext(scan);
if (temp == NULL)
break;
scan = temp;
}
if (OP(scan) == BACK)
offset = (const char*)scan - val;
else
offset = val - scan;
*(scan + 1) = (offset >> 8) & 0377;
*(scan + 2) = offset & 0377;
}
/*
- regoptail - regtail on operand of first argument; nop if operandless
⌨️ 快捷键说明
复制代码Ctrl + C
搜索代码Ctrl + F
全屏模式F11
增大字号Ctrl + =
减小字号Ctrl + -
显示快捷键?