basic_regex_parser.hpp

来自「support vector clustering for vc++」· HPP 代码 · 共 2,007 行 · 第 1/5 页

HPP
2,007
字号
   // we either have a ')' or we have run out of characters prematurely:
   //
   if(m_position == m_end)
   {
      this->fail(regex_constants::error_paren, ::boost::re_detail::distance(m_base, m_end));
      return false;
   }
   BOOST_ASSERT(this->m_traits.syntax_type(*m_position) == regex_constants::syntax_close_mark);
   ++m_position;
   //
   // append closing parenthesis state:
   //
   pb = static_cast<re_brace*>(this->append_state(syntax_element_endmark, sizeof(re_brace)));
   pb->index = markid;
   this->m_paren_start = last_paren_start;
   //
   // restore the alternate insertion point:
   //
   this->m_alt_insert_point = last_alt_point;
   //
   // allow backrefs to this mark:
   //
   if((markid > 0) && (markid < sizeof(unsigned) * CHAR_BIT))
      this->m_backrefs |= 1u << (markid - 1);

   return true;
}

template <class charT, class traits>
bool basic_regex_parser<charT, traits>::parse_basic_escape()
{
   ++m_position;
   bool result = true;
   switch(this->m_traits.escape_syntax_type(*m_position))
   {
   case regex_constants::syntax_open_mark:
      return parse_open_paren();
   case regex_constants::syntax_close_mark:
      return false;
   case regex_constants::syntax_plus:
      if(this->flags() & regex_constants::bk_plus_qm)
      {
         ++m_position;
         return parse_repeat(1);
      }
      else
         return parse_literal();
   case regex_constants::syntax_question:
      if(this->flags() & regex_constants::bk_plus_qm)
      {
         ++m_position;
         return parse_repeat(0, 1);
      }
      else
         return parse_literal();
   case regex_constants::syntax_open_brace:
      if(this->flags() & regbase::no_intervals)
         return parse_literal();
      ++m_position;
      return parse_repeat_range(true);
   case regex_constants::syntax_close_brace:
      if(this->flags() & regbase::no_intervals)
         return parse_literal();
      fail(regex_constants::error_brace, this->m_position - this->m_base);
      return false;
   case regex_constants::syntax_or:
      if(this->flags() & regbase::bk_vbar)
         return parse_alt();
      else
         result = parse_literal();
      break;
   case regex_constants::syntax_digit:
      return parse_backref();
   case regex_constants::escape_type_start_buffer:
      if(this->flags() & regbase::emacs_ex)
      {
         ++m_position;
         this->append_state(syntax_element_buffer_start);
      }
      else
         result = parse_literal();
      break;
   case regex_constants::escape_type_end_buffer:
      if(this->flags() & regbase::emacs_ex)
      {
         ++m_position;
         this->append_state(syntax_element_buffer_end);
      }
      else
         result = parse_literal();
      break;
   case regex_constants::escape_type_word_assert:
      if(this->flags() & regbase::emacs_ex)
      {
         ++m_position;
         this->append_state(syntax_element_word_boundary);
      }
      else
         result = parse_literal();
      break;
   case regex_constants::escape_type_not_word_assert:
      if(this->flags() & regbase::emacs_ex)
      {
         ++m_position;
         this->append_state(syntax_element_within_word);
      }
      else
         result = parse_literal();
      break;
   case regex_constants::escape_type_left_word:
      if(this->flags() & regbase::emacs_ex)
      {
         ++m_position;
         this->append_state(syntax_element_word_start);
      }
      else
         result = parse_literal();
      break;
   case regex_constants::escape_type_right_word:
      if(this->flags() & regbase::emacs_ex)
      {
         ++m_position;
         this->append_state(syntax_element_word_end);
      }
      else
         result = parse_literal();
      break;
   default:
      if(this->flags() & regbase::emacs_ex)
      {
         bool negate = true;
         switch(*m_position)
         {
         case 'w':
            negate = false;
            // fall through:
         case 'W':
            {
            basic_char_set<charT, traits> char_set;
            if(negate)
               char_set.negate();
            char_set.add_class(this->m_word_mask);
            if(0 == this->append_set(char_set))
            {
               fail(regex_constants::error_ctype, m_position - m_base);
               return false;
            }
            ++m_position;
            return true;
            }
         case 's':
            negate = false;
            // fall through:
         case 'S':
            return add_emacs_code(negate);
         case 'c':
         case 'C':
            // not supported yet:
            fail(regex_constants::error_escape, m_position - m_base);
            return false;
         default:
            break;
         }
      }
      result = parse_literal();
      break;
   }
   return result;
}

template <class charT, class traits>
bool basic_regex_parser<charT, traits>::parse_extended_escape()
{
   ++m_position;
   bool negate = false; // in case this is a character class escape: \w \d etc
   switch(this->m_traits.escape_syntax_type(*m_position))
   {
   case regex_constants::escape_type_not_class:
      negate = true;
      // fall through:
   case regex_constants::escape_type_class:
      {
         typedef typename traits::char_class_type mask_type;
         mask_type m = this->m_traits.lookup_classname(m_position, m_position+1);
         if(m != 0)
         {
            basic_char_set<charT, traits> char_set;
            if(negate)
               char_set.negate();
            char_set.add_class(m);
            if(0 == this->append_set(char_set))
            {
               fail(regex_constants::error_ctype, m_position - m_base);
               return false;
            }
            ++m_position;
            return true;
         }
         //
         // not a class, just a regular unknown escape:
         //
         this->append_literal(unescape_character());
         break;
      }
   case regex_constants::syntax_digit:
      return parse_backref();
   case regex_constants::escape_type_left_word:
      ++m_position;
      this->append_state(syntax_element_word_start);
      break;
   case regex_constants::escape_type_right_word:
      ++m_position;
      this->append_state(syntax_element_word_end);
      break;
   case regex_constants::escape_type_start_buffer:
      ++m_position;
      this->append_state(syntax_element_buffer_start);
      break;
   case regex_constants::escape_type_end_buffer:
      ++m_position;
      this->append_state(syntax_element_buffer_end);
      break;
   case regex_constants::escape_type_word_assert:
      ++m_position;
      this->append_state(syntax_element_word_boundary);
      break;
   case regex_constants::escape_type_not_word_assert:
      ++m_position;
      this->append_state(syntax_element_within_word);
      break;
   case regex_constants::escape_type_Z:
      ++m_position;
      this->append_state(syntax_element_soft_buffer_end);
      break;
   case regex_constants::escape_type_Q:
      return parse_QE();
   case regex_constants::escape_type_C:
      return parse_match_any();
   case regex_constants::escape_type_X:
      ++m_position;
      this->append_state(syntax_element_combining);
      break;
   case regex_constants::escape_type_G:
      ++m_position;
      this->append_state(syntax_element_restart_continue);
      break;
   case regex_constants::escape_type_not_property:
      negate = true;
      // fall through:
   case regex_constants::escape_type_property:
      {
         ++m_position;
         char_class_type m;
         if(m_position == m_end)
         {
            fail(regex_constants::error_escape, m_position - m_base);
            return false;
         }
         // maybe have \p{ddd}
         if(this->m_traits.syntax_type(*m_position) == regex_constants::syntax_open_brace)
         {
            const charT* base = m_position;
            // skip forward until we find enclosing brace:
            while((m_position != m_end) && (this->m_traits.syntax_type(*m_position) != regex_constants::syntax_close_brace))
               ++m_position;
            if(m_position == m_end)
            {
               fail(regex_constants::error_escape, m_position - m_base);
               return false;
            }
            m = this->m_traits.lookup_classname(++base, m_position++);
         }
         else
         {
            m = this->m_traits.lookup_classname(m_position, m_position+1);
            ++m_position;
         }
         if(m != 0)
         {
            basic_char_set<charT, traits> char_set;
            if(negate)
               char_set.negate();
            char_set.add_class(m);
            if(0 == this->append_set(char_set))
            {
               fail(regex_constants::error_ctype, m_position - m_base);
               return false;
            }
            return true;
         }
         fail(regex_constants::error_ctype, m_position - m_base);
      }
   default:
      this->append_literal(unescape_character());
      break;
   }
   return true;
}

template <class charT, class traits>
bool basic_regex_parser<charT, traits>::parse_match_any()
{
   //
   // we have a '.' that can match any character:
   //
   ++m_position;
   static_cast<re_dot*>(
      this->append_state(syntax_element_wild, sizeof(re_dot))
      )->mask = static_cast<unsigned char>(this->flags() & regbase::no_mod_s 
      ? re_detail::force_not_newline 
         : this->flags() & regbase::mod_s ?
            re_detail::force_newline : re_detail::dont_care);
   return true;
}

template <class charT, class traits>
bool basic_regex_parser<charT, traits>::parse_repeat(std::size_t low, std::size_t high)
{
   bool greedy = true;
   std::size_t insert_point;
   // 
   // when we get to here we may have a non-greedy ? mark still to come:
   //
   if((m_position != m_end) 
      && (
            (0 == (this->flags() & (regbase::main_option_type | regbase::no_perl_ex)))
            || ((regbase::basic_syntax_group|regbase::emacs_ex) == (this->flags() & (regbase::main_option_type | regbase::emacs_ex)))
         )
      )
   {
      // OK we have a perl regex, check for a '?':
      if(this->m_traits.syntax_type(*m_position) == regex_constants::syntax_question)
      {
         greedy = false;
         ++m_position;
      }
   }
   if(0 == this->m_last_state)
   {
      fail(regex_constants::error_badrepeat, ::boost::re_detail::distance(m_base, m_position));
      return false;
   }
   if(this->m_last_state->type == syntax_element_endmark)
   {
      // insert a repeat before the '(' matching the last ')':
      insert_point = this->m_paren_start;
   }
   else if((this->m_last_state->type == syntax_element_literal) && (static_cast<re_literal*>(this->m_last_state)->length > 1))
   {
      // the last state was a literal with more than one character, split it in two:
      re_literal* lit = static_cast<re_literal*>(this->m_last_state);
      charT c = (static_cast<charT*>(static_cast<void*>(lit+1)))[lit->length - 1];
      --(lit->length);
      // now append new state:
      lit = static_cast<re_literal*>(this->append_state(syntax_element_literal, sizeof(re_literal) + sizeof(charT)));
      lit->length = 1;
      (static_cast<charT*>(static_cast<void*>(lit+1)))[0] = c;
      insert_point = this->getoffset(this->m_last_state);
   }
   else
   {
      // repeat the last state whatever it was, need to add some error checking here:
      switch(this->m_last_state->type)
      {
      case syntax_element_start_line:
      case syntax_element_end_line:
      case syntax_element_word_boundary:
      case syntax_element_within_word:
      case syntax_element_word_start:
      case syntax_element_word_end:
      case syntax_element_buffer_start:
      case syntax_element_buffer_end:
      case syntax_element_alt:
      case syntax_element_soft_buffer_end:
      case syntax_element_restart_continue:
      case syntax_element_jump:
      case syntax_element_startmark:
         // can't legally repeat any of the above:
         fail(regex_constants::error_badrepeat, m_position - m_base);
         return false;
      default:
         // do nothing...
         break;
      }
      insert_point = this->getoffset(this->m_last_state);
   }
   //
   // OK we now know what to repeat, so insert the repeat around it:
   //
   re_repeat* rep = static_cast<re_repeat*>(this->insert_state(insert_point, syntax_element_rep, re_repeater_size));
   rep->min = low;
   rep->max = high;
   rep->greedy = greedy;
   rep->leading = false;
   // store our repeater position for later:
   std::ptrdiff_t rep_off = this->getoffset(rep);
   // and append a back jump to the repeat:
   re_jump* jmp = static_cast<re_jump*>(this->append_state(syntax_element_jump, sizeof(re_jump)));
   jmp->alt.i = rep_off - this->getoffset(jmp);
   this->m_pdata->m_data.align();
   // now fill in the alt jump for the repeat:
   rep = static_cast<re_repeat*>(this->getaddress(rep_off));

⌨️ 快捷键说明

复制代码Ctrl + C
搜索代码Ctrl + F
全屏模式F11
增大字号Ctrl + =
减小字号Ctrl + -
显示快捷键?