⭐ 欢迎来到虫虫下载站! | 📦 资源下载 📁 资源专辑 ℹ️ 关于我们
⭐ 虫虫下载站

📄 cs_regexp.cpp

📁 c-smile 一个语法类似于JS 又有点像C++的 编译器
💻 CPP
📖 第 1 页 / 共 3 页
字号:
    }
    cvars.regparse++;
    if ( ISMULT ( *cvars.regparse ) )
      FAIL ( "nested *?+" );

    return ( ret );
  }


  /*
  - regatom - the lowest level
  *
  * Optimization:  gobbles an entire sequence of ordinary characters so that
  * it can turn them into a single node, which is smaller to store and
  * faster to run.  Backslashed characters are exceptions, each becoming a
  * separate node; the code is simpler that way and it's not worth fixing.
  */
  char *
    regexp::regatom ( compiler_vars& cvars, int* flagp )
  {
    /*
     * Consumes one atom starting at cvars.regparse and emits its node(s).
     *
     * cvars - compile state; regparse is advanced past the atom and code is
     *         emitted through regnode() / regc().
     * flagp - out: reset to WORST, then OR'ed with HASWIDTH, SIMPLE and/or
     *         (for a parenthesised group) SPSTART, depending on the atom.
     *
     * Returns the node produced for the atom, or NULL when the nested reg()
     * call for a parenthesised group returns NULL.
     *
     * NOTE(review): FAIL is defined outside this chunk.  ret starts out as
     * NULL so the value handed back is well defined even if FAIL does not
     * leave the function.
     */
    char *ret = NULL;
    int flags;

    *flagp = WORST;		/* Tentatively. */

    switch ( *cvars.regparse++ )
    {
    case '^':
      ret = regnode ( cvars, BOL );
      break;
    case '$':
      ret = regnode ( cvars, EOL );
      break;
    case '.':
      ret = regnode ( cvars, ANY );
      *flagp |= HASWIDTH | SIMPLE;
      break;
    case '[':
      {
        int rclass;
        int classend;

        if ( *cvars.regparse == '^' )
        {
          /* Complement of range. */
          ret = regnode ( cvars, ANYBUT );
          cvars.regparse++;
        }
        else
          ret = regnode ( cvars, ANYOF );
        /* A leading ']' or '-' is taken literally. */
        if ( *cvars.regparse == ']' || *cvars.regparse == '-' )
          regc ( cvars, *cvars.regparse++ );
        while ( *cvars.regparse != '\0' && *cvars.regparse != ']' )
        {
          if ( *cvars.regparse == '-' )
          {
            cvars.regparse++;
            /* A '-' right before the end of the class is a literal '-'. */
            if ( *cvars.regparse == ']' || *cvars.regparse == '\0' )
              regc ( cvars, '-' );
            else
            {
              /*
               * The character before the '-' has already been emitted, so
               * the range starts one past it and runs through the character
               * after the '-'.
               */
              rclass   = UCHARAT ( cvars.regparse - 2 ) + 1;
              classend = UCHARAT ( cvars.regparse );
              if ( rclass > classend + 1 )
                FAIL ( "invalid [] range" );
              for ( ; rclass <= classend; rclass++ )
                regc ( cvars, rclass );
              cvars.regparse++;
            }
          }
          else
            regc ( cvars, *cvars.regparse++ );
        }
        /* Terminate the emitted character list. */
        regc ( cvars, '\0' );
        if ( *cvars.regparse != ']' )
          FAIL ( "unmatched []" );
        cvars.regparse++;
        *flagp |= HASWIDTH | SIMPLE;
      }
      break;
    case '(':
      ret = reg ( cvars, 1, &flags );
      if ( ret == NULL )
        return ( NULL );
      /* Only the width and SPSTART bits of the group are passed upwards. */
      *flagp |= flags & ( HASWIDTH | SPSTART );
      break;
    case '\0':
    case '|':
    case ')':
      FAIL ( "internal urp" );  /* Supposed to be caught earlier. */
      break;
    case '?':
    case '+':
    case '*':
      FAIL ( "?+* follows nothing" );
      break;
    case '\\':
      if ( *cvars.regparse == '\0' )
        FAIL ( "trailing \\" );
      /* The escaped character becomes a one-character EXACTLY node. */
      ret = regnode ( cvars, EXACTLY );
      regc ( cvars, *cvars.regparse++ );
      regc ( cvars, '\0' );
      *flagp |= HASWIDTH | SIMPLE;
      break;
    default:
      {
        int len;
        char ender;

        /* Step back onto the character the switch consumed. */
        cvars.regparse--;
        /* Length of the run of characters that are not in META. */
        len = strcspn ( cvars.regparse, META );
        if ( len <= 0 )
          FAIL ( "internal disaster" );
        ender = *( cvars.regparse + len );
        if ( len > 1 && ISMULT ( ender ) )
          len--;		/* Back off clear of ?+* operand. */
        *flagp |= HASWIDTH;
        if ( len == 1 )
          *flagp |= SIMPLE;
        ret = regnode ( cvars, EXACTLY );
        while ( len > 0 )
        {
          regc ( cvars, *cvars.regparse++ );
          len--;
        }
        regc ( cvars, '\0' );
      }
      break;
    }

    return ( ret );
  }


  /*
  - regnode - emit a node
  */
  char *	/* Location. */
    regexp::regnode ( compiler_vars& cvars, char op )
  {
    /*
     * Emits a three-byte node: the opcode followed by two zero bytes that
     * form a null "next" pointer.
     *
     * Returns the value cvars.regcode had on entry.  When that value is
     * &regdummy nothing is written; cvars.regsize is increased by 3 instead.
     */
    char *ret;
    char *ptr;

    ret = cvars.regcode;
    if ( ret == &regdummy )
    {
      cvars.regsize += 3;
      return ( ret );
    }

    ptr = ret;
    *ptr++ = op;
    *ptr++ = '\0';		/* Null "next" pointer. */
    *ptr++ = '\0';
    cvars.regcode = ptr;

    return ( ret );
  }


  /*
  - regc - emit (if appropriate) a byte of code
  */
  void
    regexp::regc ( compiler_vars& cvars, char b )
  {
    /*
     * While cvars.regcode points at regdummy no byte is stored; only the
     * size counter grows by one.
     */
    if ( cvars.regcode == &regdummy )
    {
      cvars.regsize++;
      return;
    }

    /* Otherwise store the byte and advance the output position. */
    *cvars.regcode++ = b;
  }


  /*
  - reginsert - insert an operator in front of already-emitted operand
  *
  * Means relocating the operand.
  */
  void
    regexp::reginsert ( compiler_vars& cvars, char op, char* opnd )
  {
    /*
     * Opens a three-byte gap at opnd by shifting everything between opnd and
     * cvars.regcode up by three bytes, then writes an operator node (op plus
     * a null "next" pointer) into the gap.  cvars.regcode advances by 3.
     *
     * When cvars.regcode is &regdummy nothing is moved; cvars.regsize is
     * increased by 3 instead.
     */
    char *src;
    char *dst;
    char *place;

    if ( cvars.regcode == &regdummy )
    {
      cvars.regsize += 3;
      return;
    }

    src = cvars.regcode;
    cvars.regcode += 3;
    dst = cvars.regcode;
    /* Copy backwards so the overlapping source bytes are read before being overwritten. */
    while ( src > opnd )
      *--dst = *--src;

    place    = opnd;		/* Op node, where operand used to be. */
    *place++ = op;
    *place++ = '\0';
    *place++ = '\0';
  }


  /*
  - regtail - set the next-pointer at the end of a node chain
  */
  void
    regexp::regtail ( compiler_vars& cvars, char* p, char* val )
  {
    /*
     * Follows regnext() from p until it returns NULL, then stores the
     * distance between that last node and val in the node's two "next"
     * bytes, high byte first.
     *
     * p   - first node of the chain; nothing happens when it is &regdummy.
     * val - node the chain should point to.
     *
     * cvars is not used in this function.
     */
    char *scan;
    char *temp;
    int offset;

    if ( p == &regdummy )
      return;

    /* Find last node. */
    scan = p;
    for (;;)
    {
      temp = regnext ( scan );
      if ( temp == NULL )
        break;
      scan = temp;
    }

    /* For a BACK node the operands are swapped, so the stored offset is scan - val. */
    if ( OP ( scan ) == BACK )
      offset = scan - val;
    else
      offset = val - scan;
    *( scan + 1 ) = ( offset >> 8 ) & 0377;
    *( scan + 2 ) = offset & 0377;
  }


  /*
  - regoptail - regtail on operand of first argument; nop if operandless
  */
  void
    regexp::regoptail ( compiler_vars& cvars, char* p,  char* val )
  {
    /*
     * Only a real BRANCH node is treated as having an operand; a NULL
     * pointer, the regdummy placeholder and every other opcode are ignored.
     */
    const bool is_branch =
      ( p != NULL && p != &regdummy && OP ( p ) == BRANCH );

    if ( is_branch )
      regtail ( cvars, OPERAND ( p ), val );
  }


  /*
  * regexec and friends
  */

#ifdef DEBUG
  int   regnarrate = 0;
  void  regdump ( regexp* r );
  char* regprop ( char* op  );
#endif


  /*
  - regexec - match a regexp against a string
  */
  bool
    regexp::exec ( const char* string )
  {
    // Tries to match the compiled program against `string`.
    // Returns true as soon as regtry() succeeds at some start position,
    // false otherwise.

    // Reject a missing subject string.
    if ( string == NULL )
    {
      FAIL ( "NULL parameter" );
      return false;
    }

    exec_vars evars;

    // The first program byte has to be the magic marker.
    if ( UCHARAT ( program ) != static_cast<unsigned char> ( MAGIC ) )
    {
      FAIL ( "corrupted program" );
      return false;
    }

    // Cheap rejection: if a "must appear" string is known, it has to occur
    // somewhere in the subject.
    if ( regmust != NULL )
    {
      const char *hit = strchr ( string, regmust [ 0 ] );
      while ( hit != NULL && strncmp ( hit, regmust, regmlen ) != 0 )
        hit = strchr ( hit + 1, regmust [ 0 ] );
      if ( hit == NULL )
        return false;	// Not present.
    }

    // Remember where the subject starts, for ^ .
    evars.regbol = const_cast<char *> ( string );

    // An anchored pattern is tried at the start only.
    if ( reganch )
      return ( regtry ( evars, const_cast<char *> ( string ) ) != 0 );

    // Unanchored pattern: try successive start positions.
    char *cursor = const_cast<char *> ( string );
    if ( regstart != '\0' )
    {
      // The first character is known, so only its occurrences are tried.
      for ( cursor = strchr ( cursor, regstart );
            cursor != NULL;
            cursor = strchr ( cursor + 1, regstart ) )
      {
        if ( regtry ( evars, cursor ) )
          return true;
      }
    }
    else
    {
      // General case: every position, including the terminating NUL.
      for ( ;; )
      {
        if ( regtry ( evars, cursor ) )
          return true;
        if ( *cursor == '\0' )
          break;
        cursor++;
      }
    }

    // No start position matched.
    return false;
  }

  /*
  - regtry - try match at specific point
  */
  int			/* 0 failure, 1 success */
    regexp::regtry ( exec_vars& evars, char* string )
  {
    /*
     * Attempts a match that starts exactly at `string`.  On success slot 0
     * of startp/endp receives the bounds of the whole match.
     */
    int slot;

    evars.reginput  = string;
    evars.regstartp = startp;
    evars.regendp   = endp;

    /* Clear every sub-expression slot before matching. */
    for ( slot = 0; slot < NSUBEXP; slot++ )
    {
      startp [ slot ] = NULL;
      endp   [ slot ] = NULL;
    }

    /* The program proper begins one byte past the start of `program`. */
    if ( ! regmatch ( evars, program + 1 ) )
      return ( 0 );

    startp [ 0 ] = string;
    endp   [ 0 ] = evars.reginput;
    return ( 1 );
  }


  /*
  - regmatch - main matching routine
  *
  * Conceptually the strategy is simple:  check to see whether the current
  * node matches, call self recursively to see whether the rest matches,
  * and then act accordingly.  In practice we make some effort to avoid
  * recursion, in particular by going through "ordinary" nodes (that don't
  * need to know whether the rest of the match failed) by a loop instead of
  * by recursion.
  */
  int			/* 0 failure, 1 success */
    regexp::regmatch ( exec_vars& evars, char* prog )
  {
    char *scan;	/* Current node. */
    char *next; /* Next node. */

    scan = prog;
#ifdef DEBUG
    if ( scan != NULL && regnarrate )
      fprintf ( stderr, "%s(\n", regprop ( scan ) );
#endif
    while ( scan != NULL )
    {
#ifdef DEBUG
      if ( regnarrate )
        fprintf ( stderr, "%s...\n", regprop ( scan ) );
#endif
      next = regnext ( scan );

      switch ( OP ( scan ) )
      {
      case BOL:
        if ( evars.reginput != evars.regbol )
          return ( 0 );
        break;
      case EOL:
        if ( *evars.reginput != '\0' )
          return ( 0 );
        break;
      case ANY:
        if ( *evars.reginput == '\0' )
          return ( 0 );
        evars.reginput++;
        break;
      case EXACTLY:
        {
          int len;
          const char *opnd;

          opnd = OPERAND ( scan );
          /* Inline the first character, for speed. */
          if ( *opnd != *evars.reginput )
            return ( 0 );
          len = strlen ( opnd );
          if ( len > 1 && strncmp ( opnd, evars.reginput, len ) != 0 )
            return ( 0 );
          evars.reginput += len;
        }
        break;
      case ANYOF:
        if ( *evars.reginput == '\0' ||
             strchr ( OPERAND ( scan ), *evars.reginput ) == NULL )
          return ( 0 );
        evars.reginput++;
        break;
      case ANYBUT:
        if ( *evars.reginput == '\0' ||
             strchr ( OPERAND ( scan ), *evars.reginput ) != NULL )
          return ( 0 );
        evars.reginput++;
        break;
      case NOTHING:
        break;
      case BACK:
        break;
      case OPEN + 1:
      case OPEN + 2:
      case OPEN + 3:
      case OPEN + 4:
      case OPEN + 5:
      case OPEN + 6:
      case OPEN + 7:
      case OPEN + 8:
      case OPEN + 9:
        {
          register int no;
          register char *save;

          no = OP ( scan ) - OPEN;
          save = evars.reginput;

          if ( regmatch ( evars, next ) )

⌨️ 快捷键说明

复制代码 Ctrl + C
搜索代码 Ctrl + F
全屏模式 F11
切换主题 Ctrl + Shift + D
显示快捷键 ?
增大字号 Ctrl + =
减小字号 Ctrl + -