regex.c

来自「开放源码的编译器open watcom 1.6.0版的源代码」· C语言 代码 · 共 1,220 行 · 第 1/3 页

C
1,220
字号
        *flagp = (op != '+') ? (WORST|SPSTART) : (WORST|HASWIDTH);

        if (op == '*' && (flags&SIMPLE))
                reginsert(STAR, ret);
        else if (op == '*') {
                /* Emit x* as (x&|), where & means "self". */
                reginsert(BRANCH, ret);                 /* Either x */
                regoptail(ret, regnode(BACK));          /* and loop */
                regoptail(ret, ret);                    /* back */
                regtail(ret, regnode(BRANCH));          /* or */
                regtail(ret, regnode(NOTHING));         /* null. */
        } else if (op == '+' && (flags&SIMPLE))
                reginsert(PLUS, ret);
        else if (op == '+') {
                /* Emit x+ as x(&|), where & means "self". */
                next = regnode(BRANCH);                 /* Either */
                regtail(ret, next);
                regtail(regnode(BACK), ret);            /* loop back */
                regtail(next, regnode(BRANCH));         /* or */
                regtail(ret, regnode(NOTHING));         /* null. */
        } else if (op == '?') {
                /* Emit x? as (x|) */
                reginsert(BRANCH, ret);                 /* Either x */
                regtail(ret, regnode(BRANCH));          /* or */
                next = regnode(NOTHING);                /* null. */
                regtail(ret, next);
                regoptail(ret, next);
        }
        regparse++;
        if (ISMULT(*regparse))
                FAIL("nested *?+");

        return(ret);
}

/*
 - regatom - the lowest level
 *
 * Optimization:  gobbles an entire sequence of ordinary characters so that
 * it can turn them into a single node, which is smaller to store and
 * faster to run.  Backslashed characters are exceptions, each becoming a
 * separate node; the code is simpler that way and it's not worth fixing.
 */
static char *
regatom(flagp)
int *flagp;
{
        register char *ret;
        int flags;

        *flagp = WORST;         /* Tentatively. */

        switch (*regparse++) {
        case '^':
                ret = regnode(BOL);
                break;
        case '$':
                ret = regnode(EOL);
                break;
        case '.':
                ret = regnode(ANY);
                *flagp |= HASWIDTH|SIMPLE;
                break;
        case '[': {
                        register int class;
                        register int classend;

                        if (*regparse == '^') { /* Complement of range. */
                                ret = regnode(ANYBUT);
                                regparse++;
                        } else
                                ret = regnode(ANYOF);
                        if (*regparse == ']' || *regparse == '-')
                                regc(*regparse++);
                        while (*regparse != '\0' && *regparse != ']') {
                                if (*regparse == '-') {
                                        regparse++;
                                        if (*regparse == ']' || *regparse == '\0')
                                                regc('-');
                                        else {
                                                class = UCHARAT(regparse-2)+1;
                                                classend = UCHARAT(regparse);
                                                if (class > classend+1)
                                                        FAIL("invalid [] range");
                                                for (; class <= classend; class++)
                                                        regc(class);
                                                regparse++;
                                        }
                                } else
                                        regc(*regparse++);
                        }
                        regc('\0');
                        if (*regparse != ']')
                                FAIL("unmatched []");
                        regparse++;
                        *flagp |= HASWIDTH|SIMPLE;
                }
                break;
        case '(':
                ret = reg(1, &flags);
                if (ret == NULL)
                        return(NULL);
                *flagp |= flags&(HASWIDTH|SPSTART);
                break;
        case '\0':
        case '|':
        case ')':
                FAIL("internal urp");   /* Supposed to be caught earlier. */
                break;
        case '?':
        case '+':
        case '*':
                FAIL("?+* follows nothing");
                break;
        case '\\':
                if (*regparse == '\0')
                        FAIL("trailing \\");
                ret = regnode(EXACTLY);
                regc(*regparse++);
                regc('\0');
                *flagp |= HASWIDTH|SIMPLE;
                break;
        default: {
                        register int len;
                        register char ender;

                        regparse--;
                        len = strcspn(regparse, META);
                        if (len <= 0)
                                FAIL("internal disaster");
                        ender = *(regparse+len);
                        if (len > 1 && ISMULT(ender))
                                len--;          /* Back off clear of ?+* operand. */
                        *flagp |= HASWIDTH;
                        if (len == 1)
                                *flagp |= SIMPLE;
                        ret = regnode(EXACTLY);
                        while (len > 0) {
                                regc(*regparse++);
                                len--;
                        }
                        regc('\0');
                }
                break;
        }

        return(ret);
}

/*
 - regnode - emit a node
 */
static char *                   /* Location. */
regnode(op)
char op;
{
        register char *ret;
        register char *ptr;

        ret = regcode;
        if (ret == &regdummy) {
                regsize += 3;
                return(ret);
        }

        ptr = ret;
        *ptr++ = op;
        *ptr++ = '\0';          /* Null "next" pointer. */
        *ptr++ = '\0';
        regcode = ptr;

        return(ret);
}

/*
 - regc - emit (if appropriate) a byte of code
 */
static void
regc(b)
char b;
{
        if (regcode != &regdummy)
                *regcode++ = b;
        else
                regsize++;
}

/*
 - reginsert - insert an operator in front of already-emitted operand
 *
 * Means relocating the operand.
 */
static void
reginsert(op, opnd)
char op;
char *opnd;
{
        register char *src;
        register char *dst;
        register char *place;

        if (regcode == &regdummy) {
                regsize += 3;
                return;
        }

        src = regcode;
        regcode += 3;
        dst = regcode;
        while (src > opnd)
                *--dst = *--src;

        place = opnd;           /* Op node, where operand used to be. */
        *place++ = op;
        *place++ = '\0';
        *place++ = '\0';
}

/*
 - regtail - set the next-pointer at the end of a node chain
 */
static void
regtail(p, val)
char *p;
char *val;
{
        register char *scan;
        register char *temp;
        register int offset;

        if (p == &regdummy)
                return;

        /* Find last node. */
        scan = p;
        for (;;) {
                temp = regnext(scan);
                if (temp == NULL)
                        break;
                scan = temp;
        }

        if (OP(scan) == BACK)
                offset = scan - val;
        else
                offset = val - scan;
        *(scan+1) = (offset>>8)&0377;
        *(scan+2) = offset&0377;
}

/*
 - regoptail - regtail on operand of first argument; nop if operandless
 */
static void
regoptail(p, val)
char *p;
char *val;
{
        /* "Operandless" and "op != BRANCH" are synonymous in practice. */
        if (p == NULL || p == &regdummy || OP(p) != BRANCH)
                return;
        regtail(OPERAND(p), val);
}

/*
 * regexec and friends
 */

/*
 * Global work variables for regexec().
 */
static char *reginput;          /* String-input pointer. */
static char *regbol;            /* Beginning of input, for ^ check. */
static char **regstartp;        /* Pointer to startp array. */
static char **regendp;          /* Ditto for endp. */

/*
 * Forwards.
 */
STATIC int regtry();
STATIC int regmatch();
STATIC int regrepeat();

#ifdef DEBUG
int regnarrate = 0;
void regdump();
STATIC char *regprop();
#endif

/*
 - regexec - match a regexp against a string
 */
int
regexec(prog, string)
register regexp *prog;
register char *string;
{
        register char *s;
        extern char *strchr();

        /* Be paranoid... */
        if (prog == NULL || string == NULL) {
                regerror("NULL parameter");
                return(0);
        }

        /* Check validity of program. */
        if (UCHARAT(prog->program) != MAGIC) {
                regerror("corrupted program");
                return(0);
        }

        /* If there is a "must appear" string, look for it. */
        if (prog->regmust != NULL) {
                s = string;
                while ((s = strchr(s, prog->regmust[0])) != NULL) {
                        if (strncmp(s, prog->regmust, prog->regmlen) == 0)
                                break;  /* Found it. */
                        s++;
                }
                if (s == NULL)  /* Not present. */
                        return(0);
        }

        /* Mark beginning of line for ^ . */
        regbol = string;

        /* Simplest case:  anchored match need be tried only once. */
        if (prog->reganch)
                return(regtry(prog, string));

        /* Messy cases:  unanchored match. */
        s = string;
        if (prog->regstart != '\0')
                /* We know what char it must start with. */
                while ((s = strchr(s, prog->regstart)) != NULL) {
                        if (regtry(prog, s))
                                return(1);
                        s++;
                }
        else
                /* We don't -- general case. */
                do {
                        if (regtry(prog, s))
                                return(1);
                } while (*s++ != '\0');

        /* Failure. */
        return(0);
}

/*
 - regtry - try match at specific point
 */
static int                      /* 0 failure, 1 success */
regtry(prog, string)
regexp *prog;
char *string;
{
        register int i;
        register char **sp;
        register char **ep;

        reginput = string;
        regstartp = prog->startp;
        regendp = prog->endp;

        sp = prog->startp;
        ep = prog->endp;
        for (i = NSUBEXP; i > 0; i--) {
                *sp++ = NULL;
                *ep++ = NULL;
        }
        if (regmatch(prog->program + 1)) {
                prog->startp[0] = string;
                prog->endp[0] = reginput;
                return(1);
        } else
                return(0);
}

/*
 - regmatch - main matching routine
 *
 * Conceptually the strategy is simple:  check to see whether the current
 * node matches, call self recursively to see whether the rest matches,
 * and then act accordingly.  In practice we make some effort to avoid
 * recursion, in particular by going through "ordinary" nodes (that don't
 * need to know whether the rest of the match failed) by a loop instead of
 * by recursion.
 */
static int                      /* 0 failure, 1 success */
regmatch(prog)
char *prog;
{
        register char *scan;    /* Current node. */
        char *next;             /* Next node. */
        extern char *strchr();

        scan = prog;
#ifdef DEBUG
        if (scan != NULL && regnarrate)
                fprintf(stderr, "%s(\n", regprop(scan));
#endif
        while (scan != NULL) {
#ifdef DEBUG
                if (regnarrate)

⌨️ 快捷键说明

复制代码Ctrl + C
搜索代码Ctrl + F
全屏模式F11
增大字号Ctrl + =
减小字号Ctrl + -
显示快捷键?