regex.c

来自「开放源码的编译器open watcom 1.6.0版的源代码」· C语言 代码 · 共 1,201 行 · 第 1/3 页

C
1,201
字号
/*
 * regcomp and regexec -- regsub and regerror are elsewhere
 *
 *      Copyright (c) 1986 by University of Toronto.
 *      Written by Henry Spencer.  Not derived from licensed software.
 *
 *      Permission is granted to anyone to use this software for any
 *      purpose on any computer system, and to redistribute it freely,
 *      subject to the following restrictions:
 *
 *      1. The author is not responsible for the consequences of use of
 *              this software, no matter how awful, even if they arise
 *              from defects in it.
 *
 *      2. The origin of this software must not be misrepresented, either
 *              by explicit claim or by omission.
 *
 *      3. Altered versions must be plainly marked as such, and must not
 *              be misrepresented as being the original software.
 *
 * Beware that some of this code is subtly aware of the way operator
 * precedence is structured in regular expressions.  Serious changes in
 * regular-expression syntax might require a total rethink.
 */
#include <stdio.h>
#include <stdlib.h>
#include "regex.h"
#include "regmagic.h"

/*
 * The "internal use only" fields in regexp.h are present to pass info from
 * compile to execute that permits the execute phase to run lots faster on
 * simple cases.  They are:
 *
 * regstart     char that must begin a match; '\0' if none obvious
 * reganch      is the match anchored (at beginning-of-line only)?
 * regmust      string (pointer into program) that match must include, or NULL
 * regmlen      length of regmust string
 *
 * Regstart and reganch permit very fast decisions on suitable starting points
 * for a match, cutting down the work a lot.  Regmust permits fast rejection
 * of lines that cannot possibly match.  The regmust tests are costly enough
 * that regcomp() supplies a regmust only if the r.e. contains something
 * potentially expensive (at present, the only such thing detected is * or +
 * at the start of the r.e., which can involve a lot of backup).  Regmlen is
 * supplied because the test in regexec() needs it and regcomp() is computing
 * it anyway.
 */

/*
 * Structure for regexp "program".  This is essentially a linear encoding
 * of a nondeterministic finite-state machine (aka syntax charts or
 * "railroad normal form" in parsing technology).  Each node is an opcode
 * plus a "next" pointer, possibly plus an operand.  "Next" pointers of
 * all nodes except BRANCH implement concatenation; a "next" pointer with
 * a BRANCH on both ends of it is connecting two alternatives.  (Here we
 * have one of the subtle syntax dependencies:  an individual BRANCH (as
 * opposed to a collection of them) is never concatenated with anything
 * because of operator precedence.)  The operand of some types of node is
 * a literal string; for others, it is a node leading into a sub-FSM.  In
 * particular, the operand of a BRANCH node is the first node of the branch.
 * (NB this is *not* a tree structure:  the tail of the branch connects
 * to the thing following the set of BRANCHes.)  The opcodes are:
 */

/* definition   number  opnd?   meaning */
#define END     0       /* no   End of program. */
#define BOL     1       /* no   Match "" at beginning of line. */
#define EOL     2       /* no   Match "" at end of line. */
#define ANY     3       /* no   Match any one character. */
#define ANYOF   4       /* str  Match any character in this string. */
#define ANYBUT  5       /* str  Match any character not in this string. */
#define BRANCH  6       /* node Match this alternative, or the next... */
#define BACK    7       /* no   Match "", "next" ptr points backward. */
#define EXACTLY 8       /* str  Match this string. */
#define NOTHING 9       /* no   Match empty string. */
#define STAR    10      /* node Match this (simple) thing 0 or more times. */
#define PLUS    11      /* node Match this (simple) thing 1 or more times. */
#define OPEN    20      /* no   Mark this point in input as start of #n. */
                        /*      OPEN+1 is number 1, etc. */
#define CLOSE   30      /* no   Analogous to OPEN. */

/*
 * Opcode notes:
 *
 * BRANCH       The set of branches constituting a single choice are hooked
 *              together with their "next" pointers, since precedence prevents
 *              anything being concatenated to any individual branch.  The
 *              "next" pointer of the last BRANCH in a choice points to the
 *              thing following the whole choice.  This is also where the
 *              final "next" pointer of each individual branch points; each
 *              branch starts with the operand node of a BRANCH node.
 *
 * BACK         Normal "next" pointers all implicitly point forward; BACK
 *              exists to make loop structures possible.
 *
 * STAR,PLUS    '?', and complex '*' and '+', are implemented as circular
 *              BRANCH structures using BACK.  Simple cases (one character
 *              per match) are implemented with STAR and PLUS for speed
 *              and to minimize recursive plunges.
 *
 * OPEN,CLOSE   ...are numbered at compile time.
 */

/*
 * A node is one char of opcode followed by two chars of "next" pointer.
 * "Next" pointers are stored as two 8-bit pieces, high order first.  The
 * value is a positive offset from the opcode of the node containing it.
 * An operand, if any, simply follows the node.  (Note that much of the
 * code generation knows about this implicit relationship.)
 *
 * Using two bytes for the "next" pointer is vast overkill for most things,
 * but allows patterns to get big without disasters.
 */
#define OP(p)   (*(p))
#define NEXT(p) (((*((p)+1)&0377)<<8) + (*((p)+2)&0377))
#define OPERAND(p)      ((p) + 3)

/*
 * See regmagic.h for one further detail of program structure.
 */


/*
 * Utility definitions.
 */
#ifndef CHARBITS
#define UCHARAT(p)      ((int)*(unsigned char *)(p))
#else
#define UCHARAT(p)      ((int)*(p)&CHARBITS)
#endif

#define FAIL(m) { regerror(m); return(NULL); }
#define ISMULT(c)       ((c) == '*' || (c) == '+' || (c) == '?')
#define META    "^$.[()|?+*\\"

/*
 * Flags to be passed up and down.
 */
#define HASWIDTH        01      /* Known never to match null string. */
#define SIMPLE          02      /* Simple enough to be STAR/PLUS operand. */
#define SPSTART         04      /* Starts with * or +. */
#define WORST           0       /* Worst case. */

/*
 * Global work variables for regcomp().
 */
static char *regparse;          /* Input-scan pointer. */
static int regnpar;             /* () count. */
static char regdummy;
static char *regcode;           /* Code-emit pointer; &regdummy = don't. */
static long regsize;            /* Code size. */

/*
 * Forward declarations for regcomp()'s friends.
 */
#ifndef STATIC
#define STATIC  static
#endif
STATIC char *reg();
STATIC char *regbranch();
STATIC char *regpiece();
STATIC char *regatom();
STATIC char *regnode();
STATIC char *regnext();
STATIC void regc();
STATIC void reginsert();
STATIC void regtail();
STATIC void regoptail();
#ifdef STRCSPN
STATIC int strcspn();
#endif

/*
 - regcomp - compile a regular expression into internal code
 *
 * We can't allocate space until we know how big the compiled form will be,
 * but we can't compile it (and thus know how big it is) until we've got a
 * place to put the code.  So we cheat:  we compile it twice, once with code
 * generation turned off and size counting turned on, and once "for real".
 * This also means that we don't allocate space until we are sure that the
 * thing really will compile successfully, and we never have to move the
 * code and thus invalidate pointers into it.  (Note that it has to be in
 * one piece because free() must be able to free it all.)
 *
 * Beware that the optimization-preparation code in here knows about some
 * of the structure of the compiled regexp.
 */
regexp *
regcomp( char *exp)
{
        register regexp *r;
        register char *scan;
        register char *longest;
        register int len;
        int flags;

        if (exp == NULL)
                FAIL("NULL argument");

        /* First pass: determine size, legality. */
        regparse = exp;
        regnpar = 1;
        regsize = 0L;
        regcode = &regdummy;
        regc(MAGIC);
        if (reg(0, &flags) == NULL)
                return(NULL);

        /* Small enough for pointer-storage convention? */
        if (regsize >= 32767L)          /* Probably could be 65535L. */
                FAIL("regexp too big");

        /* Allocate space. */
        r = (regexp *)malloc(sizeof(regexp) + (unsigned)regsize);
        if (r == NULL)
                FAIL("out of space");

        /* Second pass: emit code. */
        regparse = exp;
        regnpar = 1;
        regcode = r->program;
        regc(MAGIC);
        if (reg(0, &flags) == NULL)
                return(NULL);

        /* Dig out information for optimizations. */
        r->regstart = '\0';     /* Worst-case defaults. */
        r->reganch = 0;
        r->regmust = NULL;
        r->regmlen = 0;
        scan = r->program+1;                    /* First BRANCH. */
        if (OP(regnext(scan)) == END) {         /* Only one top-level choice. */
                scan = OPERAND(scan);

                /* Starting-point info. */
                if (OP(scan) == EXACTLY)
                        r->regstart = *OPERAND(scan);
                else if (OP(scan) == BOL)
                        r->reganch++;

                /*
                 * If there's something expensive in the r.e., find the
                 * longest literal string that must appear and make it the
                 * regmust.  Resolve ties in favor of later strings, since
                 * the regstart check works with the beginning of the r.e.
                 * and avoiding duplication strengthens checking.  Not a
                 * strong reason, but sufficient in the absence of others.
                 */
                if (flags&SPSTART) {
                        longest = NULL;
                        len = 0;
                        for (; scan != NULL; scan = regnext(scan))
                                if (OP(scan) == EXACTLY && strlen(OPERAND(scan)) >= len) {
                                        longest = OPERAND(scan);
                                        len = strlen(OPERAND(scan));
                                }
                        r->regmust = longest;
                        r->regmlen = len;
                }
        }

        return(r);
}

/*
 - reg - regular expression, i.e. main body or parenthesized thing
 *
 * Caller must absorb opening parenthesis.
 *
 * Combining parenthesis handling with the base level of regular expression
 * is a trifle forced, but the need to tie the tails of the branches to what
 * follows makes it hard to avoid.
 */
static char *
reg(int paren, int *flagp)
{
        register char *ret;
        register char *br;
        register char *ender;
        register int parno;
        int flags;

        *flagp = HASWIDTH;      /* Tentatively. */

        /* Make an OPEN node, if parenthesized. */
        if (paren) {
                if (regnpar >= NSUBEXP)
                        FAIL("too many ()");
                parno = regnpar;
                regnpar++;
                ret = regnode(OPEN+parno);
        } else
                ret = NULL;

        /* Pick up the branches, linking them together. */
        br = regbranch(&flags);
        if (br == NULL)
                return(NULL);
        if (ret != NULL)
                regtail(ret, br);       /* OPEN -> first. */
        else
                ret = br;
        if (!(flags&HASWIDTH))
                *flagp &= ~HASWIDTH;
        *flagp |= flags&SPSTART;
        while (*regparse == '|') {
                regparse++;
                br = regbranch(&flags);
                if (br == NULL)
                        return(NULL);
                regtail(ret, br);       /* BRANCH -> BRANCH. */
                if (!(flags&HASWIDTH))
                        *flagp &= ~HASWIDTH;
                *flagp |= flags&SPSTART;
        }

        /* Make a closing node, and hook it on the end. */
        ender = regnode((paren) ? CLOSE+parno : END);   
        regtail(ret, ender);

        /* Hook the tails of the branches to the closing node. */
        for (br = ret; br != NULL; br = regnext(br))
                regoptail(br, ender);

        /* Check for proper termination. */
        if (paren && *regparse++ != ')') {
                FAIL("unmatched ()");
        } else if (!paren && *regparse != '\0') {
                if (*regparse == ')') {
                        FAIL("unmatched ()");
                } else
                        FAIL("junk on end");    /* "Can't happen". */
                /* NOTREACHED */
        }

        return(ret);
}

/*
 - regbranch - one alternative of an | operator
 *
 * Implements the concatenation operator.
 */
static char *
regbranch(int *flagp)
{
        register char *ret;
        register char *chain;
        register char *latest;
        int flags;

        *flagp = WORST;         /* Tentatively. */

        ret = regnode(BRANCH);
        chain = NULL;
        while (*regparse != '\0' && *regparse != '|' && *regparse != ')') {
                latest = regpiece(&flags);
                if (latest == NULL)
                        return(NULL);
                *flagp |= flags&HASWIDTH;
                if (chain == NULL)      /* First piece. */
                        *flagp |= flags&SPSTART;
                else
                        regtail(chain, latest);
                chain = latest;
        }
        if (chain == NULL)      /* Loop ran zero times. */
                (void) regnode(NOTHING);

        return(ret);
}

/*
 - regpiece - something followed by possible [*+?]
 *
 * Note that the branching code sequences used for ? and the general cases
 * of * and + are somewhat optimized:  they use the same NOTHING node as
 * both the endmarker for their branch list and the body of the last branch.
 * It might seem that this node could be dispensed with entirely, but the
 * endmarker role is not redundant.
 */
static char *
regpiece(int *flagp)
{
        register char *ret;
        register char op;
        register char *next;
        int flags;

        ret = regatom(&flags);
        if (ret == NULL)
                return(NULL);

        op = *regparse;
        if (!ISMULT(op)) {
                *flagp = flags;
                return(ret);
        }

        if (!(flags&HASWIDTH) && op != '?')

⌨️ 快捷键说明

复制代码Ctrl + C
搜索代码Ctrl + F
全屏模式F11
增大字号Ctrl + =
减小字号Ctrl + -
显示快捷键?