regexp.c

来自「开放源码的编译器open watcom 1.6.0版的源代码」· C语言 代码 · 共 1,314 行 · 第 1/3 页

C
1,314
字号
/****************************************************************************
*
*                            Open Watcom Project
*
*    Portions Copyright (c) 1983-2002 Sybase, Inc. All Rights Reserved.
*
*  ========================================================================
*
*    This file contains Original Code and/or Modifications of Original
*    Code as defined in and that are subject to the Sybase Open Watcom
*    Public License version 1.0 (the 'License'). You may not use this file
*    except in compliance with the License. BY USING THIS FILE YOU AGREE TO
*    ALL TERMS AND CONDITIONS OF THE LICENSE. A copy of the License is
*    provided with the Original Code and Modifications, and is also
*    available at www.sybase.com/developer/opensource.
*
*    The Original Code and all software distributed under the License are
*    distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
*    EXPRESS OR IMPLIED, AND SYBASE AND ALL CONTRIBUTORS HEREBY DISCLAIM
*    ALL SUCH WARRANTIES, INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF
*    MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR
*    NON-INFRINGEMENT. Please see the License for the specific language
*    governing rights and limitations under the License.
*
*  ========================================================================
*
* Description:  RegComp and RegExec
*
****************************************************************************/

/*
 * RegComp and RegExec
 *
 *      Copyright (c) 1986 by University of Toronto.
 *      Written by Henry Spencer.  Not derived from licensed software.
 *
 *      Modified By Craig Eisler
 *              - editor specific code/errors
 *              - case insensitive search
 *              - CurrentRegComp
 *              - nomagic
 *              - MakeExpressionNonRegular
 *              - case sensitivity atoms (~ and @)
 *              - SetMajickString
 *              - bug fixes, fun fun fun
 *
 *      Permission is granted to anyone to use this software for any
 *      purpose on any computer system, and to redistribute it freely,
 *      subject to the following restrictions:
 *
 *      1. The author is not responsible for the consequences of use of
 *              this software, no matter how awful, even if they arise
 *              from defects in it.
 *
 *      2. The origin of this software must not be misrepresented, either
 *              by explicit claim or by omission.
 *
 *      3. Altered versions must be plainly marked as such, and must not
 *              be misrepresented as being the original software.
 *
 * Beware that some of this code is subtly aware of the way operator
 * precedence is structured in regular expressions.  Serious changes in
 * regular-expression syntax might require a total rethink.
 */
#include <stdio.h>
#include <string.h>
#include <ctype.h>
#ifdef UNIX
    #include "clibext.h"
//    #include "wndregx.h"
#endif
#include "../h/regexp.h"


/* IMPORTANT: must have ^$\\ FIRST */

char
#ifdef __WATCOMC__
near
#endif
META[] = "^$\\.[()|?+*~@";

#if !defined( REALTABS )
#define REALTABS        RealTabs
char    RealTabs = TRUE;
#endif

#if !defined( CASEIGNORE )
#define CASEIGNORE      CaseIgnore
char    CaseIgnore = TRUE;
#endif

#if !defined( MAGICFLAG )
#define MAGICFLAG       MagicFlag
char    MagicFlag = TRUE;
#endif

#if !defined( MAGICSTR )
#define MAGICSTR        MagicString
char    *MagicString = "()";
#endif

#if !defined( ALLOC )
#define ALLOC           malloc
#include <malloc.h>
#endif

#ifdef STANDALONE_RX
int     RegExpError;
#endif

/* regError - set regular expression error */
static void regError( int rc )
{
    RegExpError = rc;
}

/* StrChr - case sensitive/insensitive version of strchr */
static char *StrChr( char *s, int c )
{
    char        ch;

    ch = c;
    if( CASEIGNORE ) {
        while( ( tolower( *s ) != tolower( ch ) ) && *s != 0 ) {
            s++;
        }
    } else {
        while( *s != ch && *s != 0 ) {
            s++;
        }
    }
    if( !( *s ) ) {
        return( NULL );
    }
    return( s );

}

/*
 * The "internal use only" fields in regexp.h are present to pass info from
 * compile to execute that permits the execute phase to run lots faster on
 * simple cases.  They are:
 *
 * regstart     char that must begin a match; '\0' if none obvious
 * reganch      is the match anchored (at beginning-of-line only)?
 * regmust      string (pointer into program) that match must include, or NULL
 * regmlen      length of regmust string
 *
 * Regstart and reganch permit very fast decisions on suitable starting points
 * for a match, cutting down the work a lot.  Regmust permits fast rejection
 * of lines that cannot possibly match.  The regmust tests are costly enough
 * that RegComp() supplies a regmust only if the r.e. contains something
 * potentially expensive (at present, the only such thing detected is * or +
 * at the start of the r.e., which can involve a lot of backup).  Regmlen is
 * supplied because the test in RegExec() needs it and RegComp() is computing
 * it anyway.
 */

/*
 * Structure for regexp "program".  This is essentially a linear encoding
 * of a nondeterministic finite-state machine (aka syntax charts or
 * "railroad normal form" in parsing technology).  Each node is an opcode
 * plus a "next" pointer, possibly plus an operand.  "Next" pointers of
 * all nodes except BRANCH implement concatenation; a "next" pointer with
 * a BRANCH on both ends of it is connecting two alternatives.  (Here we
 * have one of the subtle syntax dependencies:  an individual BRANCH (as
 * opposed to a collection of them) is never concatenated with anything
 * because of operator precedence.)  The operand of some types of node is
 * a literal string; for others, it is a node leading into a sub-FSM.  In
 * particular, the operand of a BRANCH node is the first node of the branch.
 * (NB this is *not* a tree structure:  the tail of the branch connects
 * to the thing following the set of BRANCHes.)  The opcodes are:
 */

/* definition   number  opnd?   meaning */
#define END     0       /* no   End of program. */
#define BOL     1       /* no   Match "" at beginning of line. */
#define EOL     2       /* no   Match "" at end of line. */
#define ANY     3       /* no   Match any one character. */
#define ANYOF   4       /* str  Match any character in this string. */
#define ANYBUT  5       /* str  Match any character not in this string. */
#define BRANCH  6       /* node Match this alternative, or the next... */
#define BACK    7       /* no   Match "", "next" ptr points backward. */
#define EXACTLY 8       /* str  Match this string. */
#define NOTHING 9       /* no   Match empty string. */
#define STAR    10      /* node Match this (simple) thing 0 or more times. */
#define PLUS    11      /* node Match this (simple) thing 1 or more times. */
#define CASEI   12      /* no   Set CASEIGNORE to TRUE */
#define NOCASEI 13      /* no   Set CASEIGNORE to FALSE */
#define OPEN    20      /* no   Mark this point in input as start of #n. */
/*      OPEN+1 is number 1, etc. */
#define CLOSE   40      /* no   Analogous to OPEN. */

/*
 * Opcode notes:
 *
 * BRANCH       The set of branches constituting a single choice are hooked
 *              together with their "next" pointers, since precedence prevents
 *              anything being concatenated to any individual branch.  The
 *              "next" pointer of the last BRANCH in a choice points to the
 *              thing following the whole choice.  This is also where the
 *              final "next" pointer of each individual branch points; each
 *              branch starts with the operand node of a BRANCH node.
 *
 * BACK         Normal "next" pointers all implicitly point forward; BACK
 *              exists to make loop structures possible.
 *
 * STAR,PLUS    '?', and complex '*' and '+', are implemented as circular
 *              BRANCH structures using BACK.  Simple cases (one character
 *              per match) are implemented with STAR and PLUS for speed
 *              and to minimize recursive plunges.
 *
 * OPEN,CLOSE   ...are numbered at compile time.
 */

/*
 * A node is one char of opcode followed by two chars of "next" pointer.
 * "Next" pointers are stored as two 8-bit pieces, high order first.  The
 * value is a positive offset from the opcode of the node containing it.
 * An operand, if any, simply follows the node.  (Note that much of the
 * code generation knows about this implicit relationship.)
 *
 * Using two bytes for the "next" pointer is vast overkill for most things,
 * but allows patterns to get big without disasters.
 */
#define OP(p)   (*(p) )
#define NEXT(p) ( ( (*( (p)+1)&0377)<<8) + (*( (p)+2)&0377) )
#define OPERAND(p)      ( (p) + 3)

/* See regmagic.h for one further detail of program structure.  */

/* Utility definitions.  */
#ifndef CHARBITS
#define UCHARAT(p)      ( (int)*(unsigned char *)(p) )
#else
#define UCHARAT(p)      ( (int)*(p)&CHARBITS)
#endif

#define FAIL(m) { regError(m); return(NULL); }
#define ISMULT(c)       ( (c) == '*' || (c) == '+' || (c) == '?')

/* Flags to be passed up and down.  */
#define HASWIDTH        01      /* Known never to match null string. */
#define SIMPLE          02      /* Simple enough to be STAR/PLUS operand. */
#define SPSTART         04      /* Starts with * or +. */
#define WORST           0       /* Worst case. */

/* Global work variables for RegComp().  */
static char *regparse;          /* Input-scan pointer. */
static int regnpar;             /* () count. */
static char regdummy;
static char *regcode;           /* Code-emit pointer; &regdummy = don't. */
static long regsize;            /* Code size. */

/* Forward declarations for RegComp()'s friends.  */
static char *reg( int paren, int *flagp );
static char *regbranch( int *flagp );
static char *regpiece( int *flagp );
static char *regatom( int *flagp );
static char *regnode( char op );
static char *regnext( char *p );
static void regc( char b );
static void reginsert( char op, char * opnd );
static void regtail( char *p, char *val );
static void regoptail( char *p, char *val );

/*
 - RegComp - compile a regular expression into internal code
 *
 * We can't allocate space until we know how big the compiled form will be,
 * but we can't compile it (and thus know how big it is) until we've got a
 * place to put the code.  So we cheat:  we compile it twice, once with code
 * generation turned off and size counting turned on, and once "for real".
 * This also means that we don't allocate space until we are sure that the
 * thing really will compile successfully, and we never have to move the
 * code and thus invalidate pointers into it.  (Note that it has to be in
 * one piece because free() must be able to free it all.)
 *
 * Beware that the optimization-preparation code in here knows about some
 * of the structure of the compiled regexp.
 */
regexp *RegComp( char *instr )
{
    regexp      *r;
    char        *scan, *longest, *exp;
    char        buff[MAX_STR*2];
    int         len, flags, i, j, ignmag = FALSE, k;

#ifdef WANT_EXCLAMATION
    if( instr[0] == '!' ) {
        instr++;
        ignmag = TRUE;
    }
#endif

    /*
     * flip roles of magic chars
     */
    if( !ignmag && ( !MAGICFLAG && MAGICSTR != NULL ) ) {
        j = 0;
        k = strlen( instr );
        for( i = 0; i < k; i++ ) {
            if( instr[i] == '\\' ) {
                if( strchr( MAGICSTR, instr[i + 1] ) == NULL ) {
                    buff[j++] = '\\';
                }
                i++;
            } else {
                if( strchr( MAGICSTR, instr[i] ) != NULL ) {
                    buff[j++] = '\\';
                }
            }
            buff[j++] = instr[i];

        }
        buff[j] = 0;
        exp = buff;
    } else {
        exp = instr;
    }

    regError( ERR_NO_ERR );
    if( exp == NULL ) {
        FAIL( ERR_RE_NULL_ARGUMENT );
    }

    /* First pass: determine size, legality. */
    regparse = exp;
    regnpar = 1;
    regsize = 0L;
    regcode = &regdummy;
    regc( MAGIC );
    if( reg( 0, &flags ) == NULL ) {
        return( NULL );
    }

    /* Allocate space. */
    r = ALLOC( sizeof( regexp ) + ( unsigned ) regsize );

    /* Second pass: emit code. */
    regparse = exp;
    regnpar = 1;
    regcode = r->program;
    regc( MAGIC );
    if( reg( 0, &flags ) == NULL ) {
        return( NULL );
    }

    /* Dig out information for optimizations. */
    r->regstart = '\0';     /* Worst-case defaults. */
    r->reganch = 0;
    r->regmust = NULL;
    r->regmlen = 0;
    scan = r->program + 1;                    /* First BRANCH. */
    if( OP( regnext( scan ) ) == END ) { /* Only one top-level choice. */
        scan = OPERAND( scan );

        /* Starting-point info. */
        if( OP( scan ) == EXACTLY ) {
            r->regstart = *OPERAND( scan );
        } else if( OP( scan ) == BOL ) {
            r->reganch++;
        }

        /*
         * If there's something expensive in the r.e., find the
         * longest literal string that must appear and make it the
         * regmust.  Resolve ties in favor of later strings, since
         * the regstart check works with the beginning of the r.e.
         * and avoiding duplication strengthens checking.  Not a
         * strong reason, but sufficient in the absence of others.
         */
        if( flags & SPSTART ) {
            longest = NULL;
            len = 0;
            for( ; scan != NULL; scan = regnext( scan ) ) {
                if( OP( scan ) == EXACTLY &&
                                          ( int ) strlen( OPERAND( scan ) ) >= len ) {
                    longest = OPERAND( scan );
                    len = strlen( OPERAND( scan ) );
                }
            }
            r->regmust = longest;
            r->regmlen = len;
        }
    }

    return( r );
}

/*
 * reg - regular expression, i.e. main body or parenthesized thing
 *
 * Caller must absorb opening parenthesis.
 *
 * Combining parenthesis handling with the base level of regular expression
 * is a trifle forced, but the need to tie the tails of the branches to what
 * follows makes it hard to avoid.
 */
static char *reg( int paren, int *flagp )
{
    char        *ret, *br, *ender;
    int         parno, flags;

    *flagp = HASWIDTH;      /* Tentatively. */

    /* Make an OPEN node, if parenthesized. */
    if( paren ) {
        if( regnpar >= NSUBEXP ) {
            FAIL( ERR_RE_TOO_MANY_ROUND_BRACKETS );
        }
        parno = regnpar;
        regnpar++;
        ret = regnode( OPEN + parno );
    } else {
        ret = NULL;
    }

    /* Pick up the branches, linking them together. */
    br = regbranch( &flags );
    if( br == NULL ) {
        return( NULL );
    }
    if( ret != NULL ) {
        regtail( ret, br );       /* OPEN -> first. */
    } else {
        ret = br;
    }
    if( !( flags & HASWIDTH ) ) {
        *flagp &= ~HASWIDTH;
    }
    *flagp |= flags & SPSTART;
    while( *regparse == '|' ) {
        regparse++;
        br = regbranch( &flags );
        if( br == NULL ) {
            return( NULL );

⌨️ 快捷键说明

复制代码Ctrl + C
搜索代码Ctrl + F
全屏模式F11
增大字号Ctrl + =
减小字号Ctrl + -
显示快捷键?