📄 generator.hpp
字号:
// generator.hpp// Copyright (c) 2007 Ben Hanson (http://www.benhanson.net/)//// Distributed under the Boost Software License, Version 1.0. (See accompanying// file licence_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt)#ifndef BOOST_LEXER_GENERATOR_HPP#define BOOST_LEXER_GENERATOR_HPP#include "char_traits.hpp"// memcmp()#include <cstring>#include "partition/charset.hpp"#include "partition/equivset.hpp"#include <memory>#include "parser/tree/node.hpp"#include "parser/parser.hpp"#include "containers/ptr_list.hpp"#include "rules.hpp"#include "state_machine.hpp"namespace boost{namespace lexer{template<typename CharT, typename Traits = char_traits<CharT> >class basic_generator{public: typedef typename basic_state_machine<CharT>::size_t_vector size_t_vector; typedef basic_rules<CharT> rules; static void build (const rules &rules_, basic_state_machine<CharT> &state_machine_) { std::size_t index_ = 0; std::size_t size_ = rules_.statemap ().size (); node_ptr_vector node_ptr_vector_; state_machine_.clear (); for (; index_ < size_; ++index_) { state_machine_._lookup->push_back (0); state_machine_._lookup->back () = new size_t_vector; state_machine_._dfa_alphabet.push_back (0); state_machine_._dfa->push_back (0); state_machine_._dfa->back () = new size_t_vector; } for (index_ = 0, size_ = state_machine_._lookup->size (); index_ < size_; ++index_) { state_machine_._lookup[index_]->resize (sizeof (CharT) == 1 ? 
num_chars : num_wchar_ts, dead_state_index);

            // Only build a tree/DFA for states that actually have rules.
            if (!rules_.regexes ()[index_].empty ())
            {
                // vector mapping token indexes to partitioned token index sets
                index_set_vector set_mapping_;
                // syntax tree
                detail::node *root_ = build_tree (rules_, index_,
                    node_ptr_vector_, state_machine_._lookup[index_],
                    set_mapping_, state_machine_._dfa_alphabet[index_],
                    state_machine_._seen_BOL_assertion,
                    state_machine_._seen_EOL_assertion);

                build_dfa (root_, set_mapping_,
                    state_machine_._dfa_alphabet[index_],
                    *state_machine_._dfa[index_]);
            }
        }
    }

    // Minimise every DFA in state_machine_: repeatedly run one pass of
    // minimise_dfa until a pass no longer shrinks the DFA (fixpoint on
    // the vector's size).
    static void minimise (basic_state_machine<CharT> &state_machine_)
    {
        const std::size_t machines_ = state_machine_._dfa->size ();

        for (std::size_t i_ = 0; i_ < machines_; ++i_)
        {
            const std::size_t dfa_alphabet_ = state_machine_._dfa_alphabet[i_];
            size_t_vector *dfa_ = state_machine_._dfa[i_];

            // dfa_alphabet_ == 0 means no DFA was built for this state.
            if (dfa_alphabet_ != 0)
            {
                std::size_t size_ = 0;

                do
                {
                    size_ = dfa_->size ();
                    minimise_dfa (dfa_alphabet_, *dfa_, size_);
                } while (dfa_->size () != size_);
            }
        }
    }

protected:
    typedef detail::basic_charset<CharT> charset;
    typedef detail::ptr_list<charset> charset_list;
    // NOTE(review): std::auto_ptr is deprecated in C++11 and removed in
    // C++17; kept because this header targets C++03.
    typedef std::auto_ptr<charset> charset_ptr;
    typedef detail::equivset equivset;
    typedef detail::ptr_list<equivset> equivset_list;
    typedef std::auto_ptr<equivset> equivset_ptr;
    typedef typename charset::index_set index_set;
    typedef std::vector<index_set> index_set_vector;
    typedef detail::basic_parser<CharT> parser;
    typedef typename parser::node_ptr_vector node_ptr_vector;
    typedef std::set<const detail::node *> node_set;
    typedef detail::ptr_vector<node_set> node_set_vector;
    typedef std::vector<const detail::node *> node_vector;
    typedef detail::ptr_vector<node_vector> node_vector_vector;
    typedef typename parser::string string;
    typedef std::pair<string, string> string_pair;
    typedef typename parser::tokeniser::string_token string_token;
    typedef std::deque<string_pair> macro_deque;
    typedef std::pair<string, const detail::node *> macro_pair;
    typedef typename parser::macro_map::iterator macro_iter;
    typedef
std::pair<macro_iter, bool> macro_iter_pair; typedef typename parser::tokeniser::token_map token_map; static detail::node *build_tree (const rules &rules_, const std::size_t state_, node_ptr_vector &node_ptr_vector_, size_t_vector *lookup_, index_set_vector &set_mapping_, std::size_t &dfa_alphabet_, bool &seen_BOL_assertion_, bool &seen_EOL_assertion_) { const typename rules::string_deque_deque ®exes_ = rules_.regexes (); const typename rules::id_vector_deque &ids_ = rules_.ids (); const typename rules::id_vector_deque &states_ = rules_.states (); typename rules::string_deque::const_iterator regex_iter_ = regexes_[state_].begin (); typename rules::string_deque::const_iterator regex_iter_end_ = regexes_[state_].end (); typename rules::id_vector::const_iterator ids_iter_ = ids_[state_].begin (); typename rules::id_vector::const_iterator states_iter_ = states_[state_].begin (); const typename rules::string ®ex_ = *regex_iter_; // map of regex charset tokens (strings) to index token_map token_map_; const typename rules::string_pair_deque ¯odeque_ = rules_.macrodeque (); typename parser::macro_map macromap_; typename detail::node::node_vector tree_vector_; build_macros (token_map_, macrodeque_, macromap_, rules_.case_sensitive (), rules_.locale (), node_ptr_vector_, rules_.dot_not_newline (), seen_BOL_assertion_, seen_EOL_assertion_); detail::node *root_ = parser::parse (regex_.c_str (), regex_.c_str () + regex_.size (), *ids_iter_, *states_iter_, rules_.case_sensitive (), rules_.dot_not_newline (), rules_.locale (), node_ptr_vector_, macromap_, token_map_, seen_BOL_assertion_, seen_EOL_assertion_); ++regex_iter_; ++ids_iter_; ++states_iter_; tree_vector_.push_back (root_); // build syntax trees while (regex_iter_ != regex_iter_end_) { // re-declare var, otherwise we perform an assignment..! 
const typename rules::string ®ex_ = *regex_iter_; root_ = parser::parse (regex_.c_str (), regex_.c_str () + regex_.size (), *ids_iter_, *states_iter_, rules_.case_sensitive (), rules_.dot_not_newline (), rules_.locale (), node_ptr_vector_, macromap_, token_map_, seen_BOL_assertion_, seen_EOL_assertion_); tree_vector_.push_back (root_); ++regex_iter_; ++ids_iter_; ++states_iter_; } if (seen_BOL_assertion_) { // Fixup BOLs typename detail::node::node_vector::iterator iter_ = tree_vector_.begin (); typename detail::node::node_vector::iterator end_ = tree_vector_.end (); for (; iter_ != end_; ++iter_) { fixup_bol (*iter_, node_ptr_vector_); } } // join trees { typename detail::node::node_vector::iterator iter_ = tree_vector_.begin (); typename detail::node::node_vector::iterator end_ = tree_vector_.end (); if (iter_ != end_) { root_ = *iter_; ++iter_; } for (; iter_ != end_; ++iter_) { node_ptr_vector_->push_back (0); node_ptr_vector_->back () = new detail::selection_node (root_, *iter_); root_ = node_ptr_vector_->back (); } } // partitioned token list charset_list token_list_; set_mapping_.resize (token_map_.size ()); partition_tokens (token_map_, token_list_); typename charset_list::list::const_iterator iter_ = token_list_->begin (); typename charset_list::list::const_iterator end_ = token_list_->end (); std::size_t index_ = 0; for (; iter_ != end_; ++iter_, ++index_) { const charset *cs_ = *iter_; typename charset::index_set::const_iterator set_iter_ = cs_->_index_set.begin (); typename charset::index_set::const_iterator set_end_ = cs_->_index_set.end (); fill_lookup (cs_->_token, lookup_, index_); for (; set_iter_ != set_end_; ++set_iter_) { set_mapping_[*set_iter_].insert (index_); } } dfa_alphabet_ = token_list_->size () + dfa_offset; return root_; } static void build_macros (token_map &token_map_, const macro_deque ¯odeque_, typename parser::macro_map ¯omap_, const bool case_sensitive_, const std::locale &locale_, node_ptr_vector &node_ptr_vector_, const bool 
not_dot_newline_, bool &seen_BOL_assertion_, bool &seen_EOL_assertion_) { for (typename macro_deque::const_iterator iter_ = macrodeque_.begin (), end_ = macrodeque_.end (); iter_ != end_; ++iter_) { const typename rules::string &name_ = iter_->first; const typename rules::string ®ex_ = iter_->second; detail::node *node_ = parser::parse (regex_.c_str (), regex_.c_str () + regex_.size (), 0, 0, case_sensitive_, not_dot_newline_, locale_, node_ptr_vector_, macromap_, token_map_, seen_BOL_assertion_, seen_EOL_assertion_); macro_iter_pair map_iter_ = macromap_. insert (macro_pair (name_, 0)); map_iter_.first->second = node_; } } static void build_dfa (detail::node *root_, const index_set_vector &set_mapping_, const std::size_t dfa_alphabet_, size_t_vector &dfa_) { typename detail::node::node_vector *followpos_ = &root_->firstpos (); node_set_vector seen_sets_; node_vector_vector seen_vectors_; size_t_vector hash_vector_; // 'jam' state dfa_.resize (dfa_alphabet_, 0); closure (followpos_, seen_sets_, seen_vectors_, hash_vector_, dfa_alphabet_, dfa_); std::size_t *ptr_ = 0; for (std::size_t index_ = 0; index_ < seen_vectors_->size (); ++index_) { equivset_list equiv_list_; build_equiv_list (seen_vectors_[index_], set_mapping_, equiv_list_); for (typename equivset_list::list::const_iterator iter_ = equiv_list_->begin (), end_ = equiv_list_->end (); iter_ != end_; ++iter_) { equivset *equivset_ = *iter_; const std::size_t transition_ = closure (&equivset_->_followpos, seen_sets_, seen_vectors_, hash_vector_, dfa_alphabet_, dfa_); if (transition_ != npos) { ptr_ = &dfa_.front () + ((index_ + 1) * dfa_alphabet_); // Prune abstemious transitions from end states. 
if (*ptr_ && !equivset_->_greedy) continue;

                    // Write the transition into every alphabet column this
                    // equivalence set covers; BOL/EOL assertion tokens go
                    // into their reserved columns instead.
                    for (typename detail::equivset::index_vector::
                        const_iterator equiv_iter_ =
                        equivset_->_index_vector.begin (),
                        equiv_end_ = equivset_->_index_vector.end ();
                        equiv_iter_ != equiv_end_; ++equiv_iter_)
                    {
                        const std::size_t index_ = *equiv_iter_;

                        if (index_ == bol_token)
                        {
                            if (ptr_[eol_index] == 0)
                            {
                                ptr_[bol_index] = transition_;
                            }
                        }
                        else if (index_ == eol_token)
                        {
                            if (ptr_[bol_index] == 0)
                            {
                                ptr_[eol_index] = transition_;
                            }
                        }
                        else
                        {
                            ptr_[index_ + dfa_offset] = transition_;
                        }
                    }
                }
            }
        }
    }

    // Compute the epsilon-closure of followpos_ (via closure_ex), dedup it
    // against previously seen position sets (hash first, then full set
    // compare) and append a new DFA row when the set is new.  Returns the
    // 1-based DFA state index for the set, or npos if followpos_ is empty.
    static std::size_t closure (typename detail::node::node_vector
        *followpos_, node_set_vector &seen_sets_,
        node_vector_vector &seen_vectors_, size_t_vector &hash_vector_,
        const std::size_t size_, size_t_vector &dfa_)
    {
        bool end_state_ = false;
        std::size_t id_ = 0;
        std::size_t state_ = 0;
        std::size_t hash_ = 0;

        if (followpos_->empty ()) return npos;

        std::size_t index_ = 0;
        // NOTE(review): std::auto_ptr (C++03) used for exception safety
        // until ownership is release ()d into the seen_* containers.
        std::auto_ptr<node_set> set_ptr_ (new node_set);
        std::auto_ptr<node_vector> vector_ptr_ (new node_vector);

        for (typename detail::node::node_vector::const_iterator iter_ =
            followpos_->begin (), end_ = followpos_->end ();
            iter_ != end_; ++iter_)
        {
            closure_ex (*iter_, end_state_, id_, state_, set_ptr_.get (),
                vector_ptr_.get (), hash_);
        }

        bool found_ = false;

        // Stop VC++ 2005 crashing...
        if (!hash_vector_.empty ())
        {
            const std::size_t *hash_iter_ = &hash_vector_.front ();
            const std::size_t *hash_end_ = hash_iter_ + hash_vector_.size ();
            node_set **set_iter_ = &seen_sets_->front ();

            // index_ is incremented before the found_ check, making the
            // returned index 1-based (row 0 is the jam state).
            for (; hash_iter_ != hash_end_; ++hash_iter_, ++set_iter_)
            {
                found_ = *hash_iter_ == hash_ && *(*set_iter_) == *set_ptr_;
                ++index_;

                if (found_) break;
            }
        }

        if (!found_)
        {
            // New state: take ownership of the set/vector and record its
            // hash for the fast path above.
            seen_sets_->push_back (0);
            seen_sets_->back () = set_ptr_.release ();
            seen_vectors_->push_back (0);
            seen_vectors_->back () = vector_ptr_.release ();
            hash_vector_.push_back (hash_);
            // State 0 is the jam state...
index_ = seen_sets_->size ();

            const std::size_t old_size_ = dfa_.size ();

            // Append a zero-filled row of size_ columns for the new state.
            dfa_.resize (old_size_ + size_, 0);

            if (end_state_)
            {
                // First columns of an accepting row carry the end-state
                // flag, the matched rule id and the next lexer state.
                dfa_[old_size_] |= end_state;
                dfa_[old_size_ + id_index] = id_;
                dfa_[old_size_ + state_index] = state_;
            }
        }

        return index_;
    }

    // Recursive helper for closure (): walks a syntax-tree node, adding
    // unseen nodes to set_ptr_/vector_ptr_, folding them into hash_, and
    // capturing end-state info (id/state) when an end node is reached.
    // NOTE(review): definition continues beyond this chunk of the file.
    static void closure_ex (detail::node *node_, bool &end_state_,
        std::size_t &id_, std::size_t &state_, node_set *set_ptr_,
        node_vector *vector_ptr_, std::size_t &hash_)
    {
        const bool temp_end_state_ = node_->end_state ();

        if (temp_end_state_)
        {
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -