regexp.c

来自「开放源码的编译器open watcom 1.6.0版的源代码」· C语言 代码 · 共 1,192 行 · 第 1/3 页

C
1,192
字号
    register char* br;
    register char* ender;
    register int   parno;
             int   flags;

    *flagp = HASWIDTH;          // Tentatively.

    // Make an OPEN node, if parenthesized.
    if (paren) {
        if (regnpar >= NSUBEXP) {
          //RAISE (Error, SYM(CoolRegexp), SYM(Too_Many_Parens),
          printf ("CoolRegexp::compile(): Too many parentheses.\n");
          abort ();
        }
        parno = regnpar;
        regnpar++;
        ret = regnode(OPEN + parno);
    }
    else
        ret = NULL;

    // Pick up the branches, linking them together.
    br = regbranch(&flags);
    if (br == NULL)
        return (NULL);
    if (ret != NULL)
        regtail(ret, br);       // OPEN -> first.
    else
        ret = br;
    if (!(flags & HASWIDTH))
        *flagp &= ~HASWIDTH;
    *flagp |= flags & SPSTART;
    while (*regparse == '|') {
        regparse++;
        br = regbranch(&flags);
        if (br == NULL)
            return (NULL);
        regtail(ret, br);       // BRANCH -> BRANCH.
        if (!(flags & HASWIDTH))
            *flagp &= ~HASWIDTH;
        *flagp |= flags & SPSTART;
      }

    // Make a closing node, and hook it on the end.
    ender = regnode((paren) ? CLOSE + parno : END);
    regtail(ret, ender);

    // Hook the tails of the branches to the closing node.
    for (br = ret; br != NULL; br = regnext(br))
        regoptail(br, ender);

    // Check for proper termination.
    if (paren && *regparse++ != ')') {
        //RAISE (Error, SYM(CoolRegexp), SYM(Unmatched_Parens),
        printf ("CoolRegexp::compile(): Unmatched parentheses.\n");
        abort ();
    }
    else if (!paren && *regparse != '\0') {
        if (*regparse == ')') {
            //RAISE (Error, SYM(CoolRegexp), SYM(Unmatched_Parens),
            printf ("CoolRegexp::compile(): Unmatched parentheses.\n");
            abort ();
        }
        else {
            //RAISE (Error, SYM(CoolRegexp), SYM(Internal_Error),
            printf ("CoolRegexp::compile(): Internal error.\n");
            abort ();
        }
        // NOTREACHED
    }
    return (ret);
}


/*
 - regbranch - one alternative of an | operator
 *
 * Implements the concatenation operator.
 */
static char* regbranch (int *flagp) {
    register char* ret;
    register char* chain;
    register char* latest;
    int                  flags;

    *flagp = WORST;             // Tentatively.

    ret = regnode(BRANCH);
    chain = NULL;
    while (*regparse != '\0' && *regparse != '|' && *regparse != ')') {
        latest = regpiece(&flags);
        if (latest == NULL)
            return (NULL);
        *flagp |= flags & HASWIDTH;
        if (chain == NULL)      // First piece.
            *flagp |= flags & SPSTART;
        else
            regtail(chain, latest);
        chain = latest;
    }
    if (chain == NULL)          // Loop ran zero times.
        regnode(NOTHING);

    return (ret);
}


/*
 - regpiece - something followed by possible [*+?]
 *
 * Note that the branching code sequences used for ? and the general cases
 * of * and + are somewhat optimized:  they use the same NOTHING node as
 * both the endmarker for their branch list and the body of the last branch.
 * It might seem that this node could be dispensed with entirely, but the
 * endmarker role is not redundant.
 */
static char* regpiece (int *flagp) {
    register char* ret;
    register char  op;
    register char* next;
    int            flags;

    ret = regatom(&flags);
    if (ret == NULL)
        return (NULL);

    op = *regparse;
    if (!ISMULT(op)) {
        *flagp = flags;
        return (ret);
    }

    if (!(flags & HASWIDTH) && op != '?') {
        //RAISE (Error, SYM(CoolRegexp), SYM(Empty_Operand),
        printf ("CoolRegexp::compile() : *+ operand could be empty.\n");
        abort ();
    }
    *flagp = (op != '+') ? (WORST | SPSTART) : (WORST | HASWIDTH);

    if (op == '*' && (flags & SIMPLE))
        reginsert(STAR, ret);
    else if (op == '*') {
        // Emit x* as (x&|), where & means "self".
        reginsert(BRANCH, ret); // Either x
        regoptail(ret, regnode(BACK));  // and loop
        regoptail(ret, ret);    // back
        regtail(ret, regnode(BRANCH));  // or
        regtail(ret, regnode(NOTHING)); // null.
    }
    else if (op == '+' && (flags & SIMPLE))
        reginsert(PLUS, ret);
    else if (op == '+') {
        // Emit x+ as x(&|), where & means "self".
        next = regnode(BRANCH); // Either
        regtail(ret, next);
        regtail(regnode(BACK), ret);    // loop back
        regtail(next, regnode(BRANCH)); // or
        regtail(ret, regnode(NOTHING)); // null.
    }
    else if (op == '?') {
        // Emit x? as (x|)
        reginsert(BRANCH, ret); // Either x
        regtail(ret, regnode(BRANCH));  // or
        next = regnode(NOTHING);// null.
        regtail(ret, next);
        regoptail(ret, next);
    }
    regparse++;
    if (ISMULT(*regparse)) {
        //RAISE (Error, SYM(CoolRegexp), SYM(Nested_Operand),
        printf ("CoolRegexp::compile(): Nested *?+.\n");
        abort ();
    }
    return (ret);
}


/*
 - regatom - the lowest level
 *
 * Optimization:  gobbles an entire sequence of ordinary characters so that
 * it can turn them into a single node, which is smaller to store and
 * faster to run.  Backslashed characters are exceptions, each becoming a
 * separate node; the code is simpler that way and it's not worth fixing.
 */
static char* regatom (int *flagp) {
    register char* ret;
             int   flags;

    *flagp = WORST;             // Tentatively.

    switch (*regparse++) {
        case '^':
            ret = regnode(BOL);
            break;
        case '$':
            ret = regnode(EOL);
            break;
        case '.':
            ret = regnode(ANY);
            *flagp |= HASWIDTH | SIMPLE;
            break;
        case '[':{
                register int    rxpclass;
                register int    rxpclassend;

                if (*regparse == '^') { // Complement of range.
                    ret = regnode(ANYBUT);
                    regparse++;
                }
                else
                    ret = regnode(ANYOF);
                if (*regparse == ']' || *regparse == '-')
                    regc(*regparse++);
                while (*regparse != '\0' && *regparse != ']') {
                    if (*regparse == '-') {
                        regparse++;
                        if (*regparse == ']' || *regparse == '\0')
                            regc('-');
                        else {
                            rxpclass = UCHARAT(regparse - 2) + 1;
                            rxpclassend = UCHARAT(regparse);
                            if (rxpclass > rxpclassend + 1) {
                               //RAISE (Error, SYM(CoolRegexp), SYM(Invalid_Range),
                               printf ("CoolRegexp::compile(): Invalid range in [].\n");
                               abort ();
                            }
                            for (; rxpclass <= rxpclassend; rxpclass++)
                                regc(rxpclass);
                            regparse++;
                        }
                    }
                    else
                        regc(*regparse++);
                }
                regc('\0');
                if (*regparse != ']') {
                    //RAISE (Error, SYM(CoolRegexp), SYM(Unmatched_Bracket),
                    printf ("CoolRegexp::compile(): Unmatched [].\n");
                    abort ();
                }
                regparse++;
                *flagp |= HASWIDTH | SIMPLE;
            }
            break;
        case '(':
            ret = reg(1, &flags);
            if (ret == NULL)
                return (NULL);
            *flagp |= flags & (HASWIDTH | SPSTART);
            break;
        case '\0':
        case '|':
        case ')':
            //RAISE (Error, SYM(CoolRegexp), SYM(Internal_Error),
            printf ("CoolRegexp::compile(): Internal error.\n"); // Never here
            abort ();
        case '?':
        case '+':
        case '*':
            //RAISE (Error, SYM(CoolRegexp), SYM(No_Operand),
            printf ("CoolRegexp::compile(): ?+* follows nothing.\n");
            abort ();
        case '\\':
            if (*regparse == '\0') {
                //RAISE (Error, SYM(CoolRegexp), SYM(Trailing_Backslash),
                printf ("CoolRegexp::compile(): Trailing backslash.\n");
                abort ();
            }
            ret = regnode(EXACTLY);
            regc(*regparse++);
            regc('\0');
            *flagp |= HASWIDTH | SIMPLE;
            break;
        default:{
                register int    len;
                register char   ender;

                regparse--;
                len = strcspn(regparse, META);
                if (len <= 0) {
                    //RAISE (Error, SYM(CoolRegexp), SYM(Internal_Error),
                    printf ("CoolRegexp::compile(): Internal error.\n");
                    abort ();
                }
                ender = *(regparse + len);
                if (len > 1 && ISMULT(ender))
                    len--;      // Back off clear of ?+* operand.
                *flagp |= HASWIDTH;
                if (len == 1)
                    *flagp |= SIMPLE;
                ret = regnode(EXACTLY);
                while (len > 0) {
                    regc(*regparse++);
                    len--;
                }
                regc('\0');
            }
            break;
    }
    return (ret);
}


/*
 - regnode - emit a node
   Location.
 */
static char* regnode (char op) {
    register char* ret;
    register char* ptr;

    ret = regcode;
    if (ret == &regdummy) {
        regsize += 3;
        return (ret);
    }

    ptr = ret;
    *ptr++ = op;
    *ptr++ = '\0';              // Null "next" pointer.
    *ptr++ = '\0';
    regcode = ptr;

    return (ret);
}


/*
 - regc - emit (if appropriate) a byte of code
 */
static void regc (char b) {
    if (regcode != &regdummy)
        *regcode++ = b;
    else
        regsize++;
}


/*
 - reginsert - insert an operator in front of already-emitted operand
 *
 * Means relocating the operand.
 */
static void reginsert (char op, char* opnd) {
    register char* src;
    register char* dst;
    register char* place;

    if (regcode == &regdummy) {
        regsize += 3;
        return;
    }

    src = regcode;
    regcode += 3;
    dst = regcode;
    while (src > opnd)
        *--dst = *--src;

    place = opnd;               // Op node, where operand used to be.
    *place++ = op;
    *place++ = '\0';
    *place++ = '\0';
}


/*
 - regtail - set the next-pointer at the end of a node chain
 */
static void regtail (char* p, const char* val) {
    register char* scan;
    register char* temp;
    register int   offset;

    if (p == &regdummy)
        return;

    // Find last node.
    scan = p;
    for (;;) {
        temp = regnext(scan);
        if (temp == NULL)
            break;
        scan = temp;
    }

    if (OP(scan) == BACK)
        offset = (const char*)scan - val;
    else
        offset = val - scan;
    *(scan + 1) = (offset >> 8) & 0377;
    *(scan + 2) = offset & 0377;
}


/*
 - regoptail - regtail on operand of first argument; nop if operandless

⌨️ 快捷键说明

复制代码Ctrl + C
搜索代码Ctrl + F
全屏模式F11
增大字号Ctrl + =
减小字号Ctrl + -
显示快捷键?