📄 parser.hpp
字号:
// parser.hpp// Copyright (c) 2007 Ben Hanson (http://www.benhanson.net/)//// Distributed under the Boost Software License, Version 1.0. (See accompanying// file licence_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt)#ifndef BOOST_LEXER_PARSER_HPP#define BOOST_LEXER_PARSER_HPP#include <assert.h>#include "tree/end_node.hpp"#include "tree/iteration_node.hpp"#include "tree/leaf_node.hpp"#include "../runtime_error.hpp"#include "tree/selection_node.hpp"#include "tree/sequence_node.hpp"#include "../size_t.hpp"#include "tokeniser/re_tokeniser.hpp"namespace boost{namespace lexer{namespace detail{template<typename CharT>class basic_parser{public: typedef basic_re_tokeniser<CharT> tokeniser; typedef typename tokeniser::string string; typedef std::map<string, const node *> macro_map; typedef node::node_ptr_vector node_ptr_vector; typedef typename tokeniser::num_token token;/* General principles of regex parsing: - Every regex is a sequence of sub-regexes. - Regexes consist of operands and operators - All operators decompose to sequence, selection ('|') and iteration ('*') - Regex tokens are stored on the stack. - When a complete sequence of regex tokens is on the stack it is processed.Grammar:<REGEX> -> <OREXP><OREXP> -> <SEQUENCE> | <OREXP>'|'<SEQUENCE><SEQUENCE> -> <SUB><SUB> -> <EXPRESSION> | <SUB><EXPRESSION><EXPRESSION> -> <REPEAT><REPEAT> -> charset | macro | '('<REGEX>')' | <REPEAT><DUPLICATE><DUPLICATE> -> '?' | '*' | '+' | '{n[,[m]]}'*/ static node *parse (const CharT *start_, const CharT * const end_, const std::size_t id_, const std::size_t dfa_state_, const bool case_sensitive_, const bool dot_not_newline_, const std::locale &locale_, node_ptr_vector &node_ptr_vector_, const macro_map ¯omap_, typename tokeniser::token_map &map_, bool &seen_BOL_assertion_, bool &seen_EOL_assertion_) { node *root_ = 0; state state_ (start_, end_, case_sensitive_, locale_, dot_not_newline_); token lhs_token_; token rhs_token_; token_stack token_stack_; tree_node_stack tree_node_stack_; char action_ = 0; token_stack_.push (rhs_token_); tokeniser::next (state_, map_, rhs_token_); do { lhs_token_ = token_stack_.top (); action_ = lhs_token_.precedence (rhs_token_._type); switch (action_) { case '<': case '=': token_stack_.push (rhs_token_); tokeniser::next (state_, map_, rhs_token_); break; case '>': reduce (token_stack_, macromap_, node_ptr_vector_, tree_node_stack_); break; default: std::ostringstream ss_; ss_ << "A syntax error occurred: '" << lhs_token_.precedence_string () << "' against '" << rhs_token_.precedence_string () << "' at index " << state_._index << "."; throw runtime_error (ss_.str ().c_str ()); break; } } while (!token_stack_.empty ()); if (tree_node_stack_.empty ()) { throw runtime_error ("Empty rules are not allowed."); } assert (tree_node_stack_.size () == 1); node *lhs_node_ = tree_node_stack_.top (); tree_node_stack_.pop (); if (id_ == 0) { // Macros have no end state... root_ = lhs_node_; } else { node_ptr_vector_->push_back (0); node *rhs_node_ = new end_node (id_, dfa_state_); node_ptr_vector_->back () = rhs_node_; node_ptr_vector_->push_back (0); node_ptr_vector_->back () = new sequence_node (lhs_node_, rhs_node_); root_ = node_ptr_vector_->back (); } // Done this way as bug in VC++ 6 prevents |= operator working // properly! if (state_._seen_BOL_assertion) seen_BOL_assertion_ = true; if (state_._seen_EOL_assertion) seen_EOL_assertion_ = true; return root_; }private: typedef typename tokeniser::state state; typedef std::stack<token> token_stack; typedef node::node_stack tree_node_stack; static void reduce (token_stack &token_stack_, const macro_map ¯omap_, node_ptr_vector &node_vector_ptr_, tree_node_stack &tree_node_stack_) { typename tokeniser::num_token lhs_; typename tokeniser::num_token rhs_; token_stack handle_; char action_ = 0; do { rhs_ = token_stack_.top (); token_stack_.pop (); handle_.push (rhs_); if (!token_stack_.empty ()) { lhs_ = token_stack_.top (); action_ = lhs_.precedence (rhs_._type); } } while (!token_stack_.empty () && action_ == '='); assert (token_stack_.empty () || action_ == '<'); switch (rhs_._type) { case token::BEGIN: // finished processing so exit break; case token::REGEX: // finished parsing, nothing to do break; case token::OREXP: orexp (handle_, token_stack_, node_vector_ptr_, tree_node_stack_); break; case token::SEQUENCE: token_stack_.push (token::OREXP); break; case token::SUB: sub (handle_, token_stack_, node_vector_ptr_, tree_node_stack_); break; case token::EXPRESSION: token_stack_.push (token::SUB); break; case token::REPEAT: repeat (handle_, token_stack_); break; case token::CHARSET: charset (handle_, token_stack_, node_vector_ptr_, tree_node_stack_); break; case token::MACRO: macro (handle_, token_stack_, macromap_, node_vector_ptr_, tree_node_stack_); break; case token::OPENPAREN: openparen (handle_, token_stack_); break; case token::OPT: case token::AOPT: optional (rhs_._type == token::OPT, node_vector_ptr_, tree_node_stack_); token_stack_.push (token::DUP); break; case token::ZEROORMORE: case token::AZEROORMORE: zero_or_more (rhs_._type == token::ZEROORMORE, node_vector_ptr_, tree_node_stack_); token_stack_.push (token::DUP); break; case token::ONEORMORE: case token::AONEORMORE: one_or_more (rhs_._type == token::ONEORMORE, node_vector_ptr_, tree_node_stack_); token_stack_.push (token::DUP); break; case token::REPEATN: case token::AREPEATN: repeatn (rhs_._type == token::REPEATN, handle_.top (), node_vector_ptr_, tree_node_stack_); token_stack_.push (token::DUP); break; default: throw runtime_error ("Internal error regex_parser::reduce"); break; } } static void orexp (token_stack &handle_, token_stack &token_stack_, node_ptr_vector &node_ptr_vector_, tree_node_stack &tree_node_stack_) { assert (handle_.top ()._type == token::OREXP && (handle_.size () == 1 || handle_.size () == 3)); if (handle_.size () == 1) { token_stack_.push (token::REGEX); } else { handle_.pop (); assert (handle_.top ()._type == token::OR); handle_.pop (); assert (handle_.top ()._type == token::SEQUENCE); perform_or (node_ptr_vector_, tree_node_stack_); token_stack_.push (token::OREXP); } } static void sub (token_stack &handle_, token_stack &token_stack_, node_ptr_vector &node_ptr_vector_, tree_node_stack &tree_node_stack_) { assert (handle_.top ()._type == token::SUB && handle_.size () == 1 || handle_.size () == 2); if (handle_.size () == 1) { token_stack_.push (token::SEQUENCE); } else
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -