cs_regexp.cpp

来自「c-smile 一个语法类似与JS 又有点像C++的编译器」· C++ 代码 · 共 1,354 行 · 第 1/3 页
CPP
1,354 行
/*
*
* cs_regexp.cpp
*
* Copyright (c) 2001, 2002
* Andrew Fedoniouk - andrew@terra-informatica.org
* Portions: Serge Kuznetsov -  kuznetsov@deeptown.org
*
* See the file "COPYING" for information on usage 
* and redistribution of this file
*
*/

/*
* regcomp and regexec -- regsub and FAIL are elsewhere
* @(#)regexp.c	1.3 of 18 April 87
*
*	Copyright (c) 1986 by University of Toronto.
*	Written by Henry Spencer.  Not derived from licensed software.
*
*	Permission is granted to anyone to use this software for any
*	purpose on any computer system, and to redistribute it freely,
*	subject to the following restrictions:
*
*	1. The author is not responsible for the consequences of use of
*		this software, no matter how awful, even if they arise
*		from defects in it.
*
*	2. The origin of this software must not be misrepresented, either
*		by explicit claim or by omission.
*
*	3. Altered versions must be plainly marked as such, and must not
*		be misrepresented as being the original software.
*
* Beware that some of this code is subtly aware of the way operator
* precedence is structured in regular expressions.  Serious changes in
* regular-expression syntax might require a total rethink.
*/
#include <stdio.h>
#include <malloc.h>
#include <string.h>

#include "cs_regexp.h"

namespace tool
{
  /*
  * The "internal use only" fields in regexp.h are present to pass info from
  * compile to execute that permits the execute phase to run lots faster on
  * simple cases.  They are:
  *
  * regstart	char that must begin a match; '\0' if none obvious
  * reganch	is the match anchored (at beginning-of-line only)?
  * regmust	string (pointer into program) that match must include, or NULL
  * regmlen	length of regmust string
  *
  * Regstart and reganch permit very fast decisions on suitable starting points
  * for a match, cutting down the work a lot.  Regmust permits fast rejection
  * of lines that cannot possibly match.  The regmust tests are costly enough
  * that regcomp() supplies a regmust only if the r.e. contains something
  * potentially expensive (at present, the only such thing detected is * or +
  * at the start of the r.e., which can involve a lot of backup).  Regmlen is
  * supplied because the test in regexec() needs it and regcomp() is computing
  * it anyway.
  */

  /*
  * Structure for regexp "program".  This is essentially a linear encoding
  * of a nondeterministic finite-state machine (aka syntax charts or
  * "railroad normal form" in parsing technology).  Each node is an opcode
  * plus a "next" pointer, possibly plus an operand.  "Next" pointers of
  * all nodes except BRANCH implement concatenation; a "next" pointer with
  * a BRANCH on both ends of it is connecting two alternatives.  (Here we
  * have one of the subtle syntax dependencies:  an individual BRANCH (as
  * opposed to a collection of them) is never concatenated with anything
  * because of operator precedence.)  The operand of some types of node is
  * a literal string; for others, it is a node leading into a sub-FSM.  In
  * particular, the operand of a BRANCH node is the first node of the branch.
  * (NB this is *not* a tree structure:  the tail of the branch connects
  * to the thing following the set of BRANCHes.)  The opcodes are:
  */

#if !defined (_WIN32)
#define MAGIC	char(0234)
#else
#define	MAGIC	char(0234)
#endif

  /* definition	number	opnd?	meaning */
#define	END	0	/* no	End of program. */
#define	BOL	1	/* no	Match "" at beginning of line. */
#define	EOL	2	/* no	Match "" at end of line. */
#define	ANY	3	/* no	Match any one character. */
#define	ANYOF	4	/* str	Match any character in this string. */
#define	ANYBUT	5	/* str	Match any character not in this string. */
#define	BRANCH	6	/* node	Match this alternative, or the next... */
#define	BACK	7	/* no	Match "", "next" ptr points backward. */
#define	EXACTLY	8	/* str	Match this string. */
#define	NOTHING	9	/* no	Match empty string. */
#define	STAR	10	/* node	Match this (simple) thing 0 or more times. */
#define	PLUS	11	/* node	Match this (simple) thing 1 or more times. */
#define	OPEN	20	/* no	Mark this point in input as start of #n. */
  /*	OPEN+1 is number 1, etc. */
#define	CLOSE	30	/* no	Analogous to OPEN. */

  /*
  * Opcode notes:
  *
  * BRANCH	The set of branches constituting a single choice are hooked
  *		together with their "next" pointers, since precedence prevents
  *		anything being concatenated to any individual branch.  The
  *		"next" pointer of the last BRANCH in a choice points to the
  *		thing following the whole choice.  This is also where the
  *		final "next" pointer of each individual branch points; each
  *		branch starts with the operand node of a BRANCH node.
  *
  * BACK		Normal "next" pointers all implicitly point forward; BACK
  *		exists to make loop structures possible.
  *
  * STAR,PLUS	'?', and complex '*' and '+', are implemented as circular
  *		BRANCH structures using BACK.  Simple cases (one character
  *		per match) are implemented with STAR and PLUS for speed
  *		and to minimize recursive plunges.
  *
  * OPEN,CLOSE	...are numbered at compile time.
  */

  /*
  * A node is one char of opcode followed by two chars of "next" pointer.
  * "Next" pointers are stored as two 8-bit pieces, high order first.  The
  * value is a positive offset from the opcode of the node containing it.
  * An operand, if any, simply follows the node.  (Note that much of the
  * code generation knows about this implicit relationship.)
  *
  * Using two bytes for the "next" pointer is vast overkill for most things,
  * but allows patterns to get big without disasters.
  */
#define	OP(p)	(*(p))
#define	NEXT(p)	(((*((p)+1)&0377)<<8) + (*((p)+2)&0377))
#define	OPERAND(p)	((p) + 3)

  /*
  * See regmagic.h for one further detail of program structure.
  */

  /*
  * Utility definitions.
  */
#ifndef CHARBITS
#define	UCHARAT(p)	((int)*(unsigned char *)(p))
#else
#define	UCHARAT(p)	((int)*(p)&CHARBITS)
#endif

#define	FAIL(m)	throw regexp::error(m);
#define	ISMULT(c)	((c) == '*' || (c) == '+' || (c) == '?')
#define	META	"^$.[()|?+*\\"

  /*
  * Flags to be passed up and down.
  */
#define	HASWIDTH	01	/* Known never to match null string. */
#define	SIMPLE		02	/* Simple enough to be STAR/PLUS operand. */
#define	SPSTART		04	/* Starts with * or +. */
#define	WORST		0	/* Worst case. */

  /*
  - compile - compile a regular expression into internal code
  *
  * We can't allocate space until we know how big the compiled form will be,
  * but we can't compile it (and thus know how big it is) until we've got a
  * place to put the code.  So we cheat:  we compile it twice, once with code
  * generation turned off and size counting turned on, and once "for real".
  * This also means that we don't allocate space until we are sure that the
  * thing really will compile successfully, and we never have to move the
  * code and thus invalidate pointers into it.  (Note that it has to be in
  * one piece because free() must be able to free it all.)
  *
  * Beware that the optimization-preparation code in here knows about some
  * of the structure of the compiled regexp.
  */
  bool
    regexp::compile ( const char* exp )
  {
    clean();

    char *scan;
    char *longest;
    int len;
    int flags;

    if ( exp == NULL )
      FAIL ( "NULL argument" );

    compiler_vars cvars;

    // First pass: determine size, legality.
    cvars.regparse = const_cast<char *> ( exp );
    cvars.regnpar = 1;
    cvars.regsize = 0L;
    cvars.regcode = &regdummy;
    regc ( cvars, MAGIC );
    if ( reg ( cvars, 0, &flags ) == NULL )
      return false;

    // Small enough for pointer-storage convention?
    if ( cvars.regsize >= 32767L )		/* Probably could be 65535L. */
      FAIL ( "regexp too big" );

    // Allocate space.
    program = new char [ cvars.regsize ];
    if ( program == NULL )
      FAIL ( "out of space" );

    // Second pass: emit code.
    cvars.regparse = const_cast<char *> ( exp );
    cvars.regnpar = 1;
    cvars.regcode = program;
    regc ( cvars, MAGIC );
    if ( reg ( cvars, 0, &flags ) == NULL )
      return false;

    // Dig out information for optimizations.
    regstart = '\0';	    // Worst-case defaults.
    reganch = 0;
    regmust = NULL;
    regmlen = 0;
    scan = program + 1;			// First BRANCH.
    if ( OP ( regnext ( scan ) ) == END )
    {
      // Only one top-level choice.
      scan = OPERAND ( scan );

      //* Starting-point info. */
      if ( OP ( scan ) == EXACTLY )
        regstart = *OPERAND ( scan );
      else if ( OP ( scan ) == BOL )
        reganch++;

      /*
      * If there's something expensive in the r.e., find the
      * longest literal string that must appear and make it the
      * regmust.  Resolve ties in favor of later strings, since
      * the regstart check works with the beginning of the r.e.
      * and avoiding duplication strengthens checking.  Not a
      * strong reason, but sufficient in the absence of others.
      */

      if ( flags & SPSTART )
      {
        longest = NULL;
        len = 0;
        for ( ; scan != NULL; scan = regnext ( scan ) )
          if ( OP ( scan ) == EXACTLY
               && int ( strlen ( OPERAND ( scan ) ) ) >= len )
          {
            longest = OPERAND ( scan );
            len = strlen ( OPERAND ( scan ) );
          }
        regmust = const_cast<char *> ( longest );
        regmlen = len;
      }
    }

    return true;
  }


  /*
  - reg - regular expression, i.e. main body or parenthesized thing
  *
  * Caller must absorb opening parenthesis.
  *
  * Combining parenthesis handling with the base level of regular expression
  * is a trifle forced, but the need to tie the tails of the branches to what
  * follows makes it hard to avoid.
  */
  char *
    regexp::reg ( compiler_vars& cvars,
                  int paren, int* flagp ) /* paren - Parenthesized? */
  {
    char *ret;
    char *br;
    char *ender;
    int parno;
    int flags;

    *flagp = HASWIDTH;	// Tentatively.

    // Make an OPEN node, if parenthesized.
    if ( paren )
    {
      if ( cvars.regnpar >= NSUBEXP )
        FAIL ( "too many ()" );
      parno = cvars.regnpar;
      cvars.regnpar++;
      ret = regnode ( cvars, OPEN + parno );
    }
    else
      ret = NULL;

    // Pick up the branches, linking them together.
    br = regbranch ( cvars, &flags );

    if ( br == NULL )
      return ( NULL );

    if ( ret != NULL )
      regtail ( cvars, ret, br ); // OPEN -> first.
    else
      ret = br;

    if ( !( flags & HASWIDTH ) )
      *flagp &= ~HASWIDTH;
    *flagp |= flags & SPSTART;
    while ( *cvars.regparse == '|' )
    {
      cvars.regparse++;
      br = regbranch ( cvars, &flags );
      if ( br == NULL )
        return 0;
      regtail ( cvars, ret, br );   // BRANCH -> BRANCH.
      if ( !( flags & HASWIDTH ) )
        *flagp &= ~HASWIDTH;
      *flagp |= flags & SPSTART;
    }

    // Make a closing node, and hook it on the end.
    ender = regnode ( cvars, ( paren ) ? CLOSE + parno : END );
    regtail ( cvars, ret, ender );

    // Hook the tails of the branches to the closing node.
    for ( br = ret; br != NULL; br = regnext ( br ) )
      regoptail ( cvars, br, ender );

    // Check for proper termination.
    if ( paren && *cvars.regparse++ != ')' )
    {
      FAIL ( "unmatched ()" );
    }
    else if ( !paren && *cvars.regparse != '\0' )
    {
      if ( *cvars.regparse == ')' )
      {
        FAIL ( "unmatched ()" );
      }
      else
        FAIL ( "junk on end" );   // "Can't happen"
      // NOTREACHED
    }
    return ( ret );
  }

  /*
  - regbranch - one alternative of an | operator
  *
  * Implements the concatenation operator.
  */
  char *
    regexp::regbranch ( compiler_vars& cvars, int* flagp )
  {
    register char *ret;
    register char *chain;
    register char *latest;
    int flags;

    *flagp = WORST;		// Tentatively.

    ret = regnode ( cvars, BRANCH );
    chain = NULL;
    while ( *cvars.regparse != '\0' && *cvars.regparse != '|' && *cvars.regparse != ')' )
    {
      latest = regpiece ( cvars, &flags );
      if ( latest == NULL )
        return ( NULL );
      *flagp |= flags & HASWIDTH;
      if ( chain == NULL )	// First piece.
        *flagp |= flags & SPSTART;
      else
        regtail ( cvars, chain, latest );
      chain = latest;
    }
    if ( chain == NULL )  // Loop ran zero times.
      (void) regnode ( cvars, NOTHING );

    return ( ret );
  }


  /*
  - regpiece - something followed by possible [*+?]
  *
  * Note that the branching code sequences used for ? and the general cases
  * of * and + are somewhat optimized:  they use the same NOTHING node as
  * both the endmarker for their branch list and the body of the last branch.
  * It might seem that this node could be dispensed with entirely, but the
  * endmarker role is not redundant.
  */
  char *
    regexp::regpiece ( compiler_vars& cvars, int* flagp )
  {
    register char *ret;
    register char op;
    register char *next;
    int flags;

    ret = regatom ( cvars, &flags );
    if ( ret == NULL )
      return ( NULL );

    op = *cvars.regparse;
    if ( !ISMULT ( op ) )
    {
      *flagp = flags;
      return ( ret );
    }

    if ( !( flags & HASWIDTH ) && op != '?' )
      FAIL ( "*+ operand could be empty" );

    *flagp = ( op != '+' ) ? ( WORST | SPSTART ) : ( WORST | HASWIDTH );

    if ( op == '*' && ( flags & SIMPLE ) )
      reginsert ( cvars, STAR, ret );
    else if ( op == '*' )
    {
      /* Emit x* as (x&|), where & means "self". */
      reginsert ( cvars, BRANCH, ret );                     /* Either x */
      regoptail ( cvars, ret, regnode ( cvars, BACK ) );    /* and loop */
      regoptail ( cvars, ret, ret );                        /* back     */
      regtail   ( cvars, ret, regnode ( cvars, BRANCH ) );  /* or       */
      regtail   ( cvars, ret, regnode ( cvars, NOTHING) );  /* null.    */
    }
    else if ( op == '+' && ( flags & SIMPLE ) )
      reginsert ( cvars, PLUS, ret );
    else if ( op == '+' )
    {
      /* Emit x+ as x(&|), where & means "self". */
      next = regnode ( cvars, BRANCH );                   /* Either */
      regtail ( cvars, ret, next );
      regtail ( cvars, regnode ( cvars,BACK ), ret );     /* loop back */
      regtail ( cvars, next, regnode ( cvars,BRANCH ) );  /* or */
      regtail ( cvars, ret, regnode ( cvars,NOTHING ) );  /* null. */
    }
    else if ( op == '?' )
    {
      /* Emit x? as (x|) */
      reginsert ( cvars, BRANCH, ret );                     /* Either x */
      regtail   ( cvars, ret, regnode ( cvars, BRANCH ) );  /* or */
      next = regnode ( cvars, NOTHING );                    /* null. */
      regtail   ( cvars, ret, next );
      regoptail ( cvars, ret, next );
cs_regexp.cpp - 源码说明

本页面展示了「c-smile 一个语法类似与JS 又有点像C++的编译器」中的 cs_regexp.cpp 源码文件，采用 C++ 编程语言编写，共 1,354 行代码。您可以在线阅读完整代码内容，也可以返回资源详情页下载完整源码包进行本地学习和开发。
虫虫下载站收录了大量与c-smile相关的技术资源，包括源代码、技术文档、电路图等，是电子工程师和嵌入式开发者的专业学习平台。
⌨️ 快捷键说明

复制代码Ctrl + C
搜索代码Ctrl + F
全屏模式F11
增大字号Ctrl + =
减小字号Ctrl + -
显示快捷键?