lexer.hpp
字号:
+( (anychar_p - chset<>("\"\\")) | escseq ) ; typedef uint_parser<char_t, 8, 1, std::numeric_limits<char_t>::digits / 3 + 1 > oct_parser_t; typedef uint_parser<char_t, 16, 1, std::numeric_limits<char_t>::digits / 4 + 1 > hex_parser_t; escseq = ch_p('\\') >> ( oct_parser_t() | as_lower_d['x'] >> hex_parser_t() | (anychar_p - chset<>('\n')) ) ;#define BOOST_SLEX_DEBUG (BOOST_SPIRIT_DEBUG_FLAGS & BOOST_SPIRIT_DEBUG_FLAGS_SLEX) BOOST_SPIRIT_DEBUG_TRACE_RULE(regex, BOOST_SLEX_DEBUG); BOOST_SPIRIT_DEBUG_TRACE_RULE(re, BOOST_SLEX_DEBUG); BOOST_SPIRIT_DEBUG_TRACE_RULE(series, BOOST_SLEX_DEBUG); BOOST_SPIRIT_DEBUG_TRACE_RULE(singleton, BOOST_SLEX_DEBUG); BOOST_SPIRIT_DEBUG_TRACE_RULE(singleton2, BOOST_SLEX_DEBUG); BOOST_SPIRIT_DEBUG_TRACE_RULE(fullccl, BOOST_SLEX_DEBUG); BOOST_SPIRIT_DEBUG_TRACE_RULE(ccl, BOOST_SLEX_DEBUG); BOOST_SPIRIT_DEBUG_TRACE_RULE(string, BOOST_SLEX_DEBUG); BOOST_SPIRIT_DEBUG_TRACE_RULE(escseq, BOOST_SLEX_DEBUG); BOOST_SPIRIT_DEBUG_TRACE_RULE(ccl_char, BOOST_SLEX_DEBUG); #undef BOOST_SLEX_DEBUG } rule<ScannerT> const& start() const { return regex; } }; std::stack<node*> &node_stack;}; // class lexer_grammartemplate <typename StringT>inline node *parse(lexer_grammar& g, StringT const& str){ typedef scanner<typename StringT::const_iterator, scanner_policies<> > scanner_t; typedef typename rule<scanner_t>::template result<scanner_t>::type result_t; typename StringT::const_iterator first = str.begin(); typename StringT::const_iterator last = str.end(); scanner_t scan(first, last);// typename rule<scanner_t>::result_t hit = g.parse(scan); result_t hit = g.parse(scan); if (!hit || !scan.at_end()) { while (g.node_stack.size()) { delete g.node_stack.top(); g.node_stack.pop(); } return 0; } BOOST_ASSERT(g.node_stack.size() == 1); node* rval = g.node_stack.top(); g.node_stack.pop(); node* an_eof_node = new eof_node(); rval = new cat_node(rval, an_eof_node); return rval;}inlinevoid make_case_insensitive(state_match_t& state_match){ // TODO: Fix this. // This doesn't take into account foreign languages, figure out how to // do that. Also this approach is broken for this implementation of // wide chars. for (state_match_t::iterator iter = state_match.begin(); iter != state_match.end(); ++iter) { int i, j; for (i = 'A', j = 'a'; i <= 'Z'; ++i, ++j) { // if either is set, turn them both on (*iter)[i] = (*iter)[j] = uchar((*iter)[i] | (*iter)[j]); } }}template<bool wide_char>struct regex_match_helper;template<>struct regex_match_helper<false> // single byte char{ template <typename DfaT, typename IteratorT> static bool do_match(DfaT const& dfa, IteratorT &first, IteratorT const& last, int& regex_index, std::basic_string< typename BOOST_SPIRIT_IT_NS::iterator_traits<IteratorT>::value_type > *token) { typedef std::basic_string< typename BOOST_SPIRIT_IT_NS::iterator_traits<IteratorT>::value_type > string_type; typedef typename string_type::size_type size_type; node_id_t s = 0; node_id_t last_accepting_index = invalid_node; IteratorT p = first; IteratorT last_accepting_cpos = first; while (p != last) { s = dfa.transition_table[s][(uchar)*p]; if (s == invalid_node) break; if (token) token->append((size_type)1, *p); ++p; if (dfa.acceptance_index[s] != invalid_node) { last_accepting_index = s; last_accepting_cpos = p; } } if (last_accepting_index != invalid_node) {#if defined(BOOST_SPIRIT_DEBUG) && (BOOST_SPIRIT_DEBUG_FLAGS & BOOST_SPIRIT_DEBUG_FLAGS_SLEX) std::cout << "dfa.acceptance_index[" << last_accepting_index << "] = " << dfa.acceptance_index[last_accepting_index] << '\n';#endif first = last_accepting_cpos; regex_index = dfa.acceptance_index[last_accepting_index]; return true; } else return false; }};template<>struct regex_match_helper<true> // wide char{ template <typename DfaT, typename IteratorT> static bool do_match(DfaT const& dfa, IteratorT &first, IteratorT const& last, int& regex_index, std::basic_string< typename BOOST_SPIRIT_IT_NS::iterator_traits<IteratorT>::value_type > *token) { typedef typename BOOST_SPIRIT_IT_NS::iterator_traits<IteratorT>::value_type char_t; typedef std::basic_string<char_t> string_type; typedef typename string_type::size_type size_type; node_id_t s = 0; node_id_t last_accepting_index = invalid_node; IteratorT wp = first; IteratorT last_accepting_cpos = first; while (wp != last) { for (unsigned int i = 0; i < sizeof(char_t); ++i) { s = dfa.transition_table[s][get_byte(*wp,i)]; if (s == invalid_node) { goto break_while; } } if (token) token->append((size_type)1, *wp); ++wp; if (dfa.acceptance_index[s] != invalid_node) { last_accepting_index = s; last_accepting_cpos = wp; } } break_while: if (last_accepting_index != invalid_node) {#if defined(BOOST_SPIRIT_DEBUG) && (BOOST_SPIRIT_DEBUG_FLAGS & BOOST_SPIRIT_DEBUG_FLAGS_SLEX) std::cout << "dfa.acceptance_index[" << last_accepting_index << "] = " << dfa.acceptance_index[last_accepting_index] << '\n';#endif first = last_accepting_cpos; regex_index = dfa.acceptance_index[last_accepting_index]; return true; } else return false; }};template <typename DfaT, typename IteratorT, bool wide_char>struct regex_match{ static bool do_match(DfaT const& dfa, IteratorT &first, IteratorT const& last, int& regex_index, std::basic_string< typename BOOST_SPIRIT_IT_NS::iterator_traits<IteratorT>::value_type > *token) { return regex_match_helper<wide_char>::do_match( dfa, first, last, regex_index, token); }};} // namespace lexerimpl/////////////////////////////////////////////////////////////////////////////////template <typename IteratorT = char const*, typename TokenT = int, typename CallbackT = void(*)(IteratorT const &, IteratorT &, IteratorT const&, TokenT const&, lexer_control<TokenT> &)>class lexer{public: typedef CallbackT callback_t; typedef typename BOOST_SPIRIT_IT_NS::iterator_traits<IteratorT>::value_type char_t; struct regex_info { std::basic_string<char_t> str; TokenT token; CallbackT callback; regex_info(const std::basic_string<char_t>& _str, const TokenT& _token, const CallbackT& _callback) : str(_str) , token(_token) , callback(_callback) {} }; struct dfa_table { std::vector<std::vector<node_id_t> > transition_table; std::vector<node_id_t> acceptance_index; }; typedef std::vector<node_id_t> node_table_t; typedef std::vector<node_table_t> transition_table_t; typedef std::vector<dfa_table> dfa_t; lexer(unsigned int states = 1); void register_regex(const std::basic_string<char_t>& regex, const TokenT& id, const CallbackT& cb = CallbackT(), unsigned int state = 0); TokenT next_token(IteratorT &first, IteratorT const &last, std::basic_string<char_t> *token = 0); void create_dfa(); bool has_compiled_dfa() { return m_compiled_dfa; } void set_case_insensitive(bool insensitive);#if defined(BOOST_SPIRIT_DEBUG) && (BOOST_SPIRIT_DEBUG_FLAGS & BOOST_SPIRIT_DEBUG_FLAGS_SLEX) void dump(std::ostream& out);#endif typedef std::vector<std::vector<regex_info> > regex_list_t; bool load (std::ifstream &in, long unique_id = 0); bool save (std::ofstream &out, long unique_id = 0); enum { SLEX_SIGNATURE = 0x58454C53, // "SLEX" SLEX_VERSION_100 = 0x0100, // persistance version SLEX_LAST_KNOWN_VERSION = SLEX_VERSION_100, SLEX_MINOR_VERSION_MASK = 0xFF };private: void create_dfa_for_state(int state); static bool regex_match(const dfa_t& dfa, IteratorT& first, IteratorT& last, int& regex_index); mutable std::stack<lexerimpl::node*> node_stack; lexerimpl::lexer_grammar g; mutable bool m_compiled_dfa; mutable dfa_t m_dfa; regex_list_t m_regex_list; bool m_case_insensitive; unsigned int m_state; std::stack<unsigned int> m_state_stack; unsigned int m_num_states;};template <typename IteratorT, typename TokenT, typename CallbackT>inlinelexer<IteratorT, TokenT, CallbackT>::lexer(unsigned int states) : g(node_stack) , m_compiled_dfa(false) , m_regex_list(states) , m_case_insensitive(false) , m_state(0) , m_state_stack() , m_num_states(states){ BOOST_SPIRIT_DEBUG_TRACE_NODE_NAME(g, "slex::lexer", BOOST_SPIRIT_DEBUG_FLAGS & BOOST_SPIRIT_DEBUG_FLAGS_SLEX);}template <typename IteratorT, typename TokenT, typename CallbackT>inline voidlexer<IteratorT, TokenT, CallbackT>::register_regex( const std::basic_string<char_t>& regex, const TokenT& id, const CallbackT& callback, unsigned int state){ if (state > m_num_states) { m_regex_list.resize(state); m_num_states = state; } m_regex_list[state].push_back(regex_info(regex, id, callback));}template <typename IteratorT, typename TokenT, typename CallbackT>inline TokenTlexer<IteratorT, TokenT, CallbackT>::next_token( IteratorT &first, IteratorT const& last, std::basic_string< typename BOOST_SPIRIT_IT_NS::iterator_traits<IteratorT>::value_type > *token){ if (!m_compiled_dfa) { create_dfa(); } IteratorT saved = first; int regex_index; if (!lexerimpl::regex_match<dfa_table, IteratorT, (sizeof(char_t) > 1)>:: do_match(m_dfa[m_state], first, last, regex_index, token)) return -1; // TODO: can't return -1, need to return some invalid token. // how to figure this out? We can use traits I guess. else { regex_info regex = m_regex_list[m_state][regex_index]; TokenT rval = regex.token; if (regex.callback) { // execute corresponding callback lexer_control<TokenT> controller(rval, m_state, m_state_stack); regex.callback(saved, first, last, regex.token, controller); if (controller.ignore_current_token_set()) { if (token) token->erase(); return next_token(first, last, token); } } return rval; }}namespace lexerimpl{inlinebool find_acceptance_state(const node_set& eof_node_ids, const node_set& current_set, node_id_t& acceptance_node_id){ for(node_set::const_iterator nsi = eof_node_ids.begin(); nsi != eof_node_ids.end(); ++nsi) { node_id_t eof_node_id =*nsi; if (current_set.end() != current_set.find(eof_node_id)) { // store the first one we come to as the // matched pattern acceptance_node_id = eof_node_id; // don't bother searching for more return true; } } return false;}template <typename RegexListT, typename GrammarT>inline std::auto_ptr<node>parse_regexes(const RegexListT& regex_list, GrammarT& g){ // parse the expressions into a tree if (regex_list.begin() == regex_list.end()) boost::throw_exception(bad_regex()); typename RegexListT::const_iterator ri = regex_list.begin(); std::auto_ptr<node> tree(lexerimpl::parse(g, (*ri).str)); if (tree.get() == 0) boost::throw_exception(bad_regex()); ++ri; for (/**/; ri != regex_list.end(); ++ri) { std::auto_ptr<node> next_tree(lexerimpl::parse(g, (*ri).str)); if (next_tree.get() == 0) boost::throw_exception(bad_regex()); std::auto_ptr<node> newnode(new or_node(tree.release(), next_tree.release())); tree = newnode; } return tree;}} //namespace lexerimpltemplate <typename IteratorT, typename TokenT, typename CallbackT>inline voidlexer<IteratorT, TokenT, CallbackT>::create_dfa(){ m_dfa.resize(m_num_states); for (unsigned int i = 0; i < m_num_states; ++i) create_dfa_for_state(i);}// Algorithm from Compilers: Principles, Techniques, and Tools p. 141template <typename IteratorT, typename TokenT, typename CallbackT>inline voidlexer<IteratorT, TokenT, CallbackT>::create_dfa_for_state(int state){ using lexerimpl::node; std::auto_ptr<node> tree = lexerimpl::parse_regexes(m_regex_list[state], g); node_id_t dummy = 0; tree->assign_node_ids(dummy);#if defined(BOOST_SPIRIT_DEBUG) && (BOOST_SPIRIT_DEBUG_FLAGS & BOOST_SPIRIT_DEBUG_FLAGS_SLEX) tree->dump(std::cout);#endif // compute followpos(root) followpos_t followpos; tree->compute_followpos(followpos); // the dfa states <-> nfa state groups std::map<node_set, node_id_t> dstates1; std::map<node_id_t, node_set> dstates2; // the dfa transitions m_dfa[state].transition_table.push_back( std::vector<node_id_t>(256, invalid_node)); m_dfa[state].acceptance_index.push_back(invalid_node); // whether the dfa state has been processed yet std::vector<node_id_t> marked; // used to give a unique id to each dfa state node_id_t num_states = 0; // initially, the only unmarked state in Dstates is firstpos(root). marked.push_back(0); node_set fpr = tree->firstpos(); dstates1[fpr] = 0; dstates2[0] = fpr; state_match_t state_match; tree->compute_state_match(state_match); if (m_case_insensitive) lexerimpl::make_case_insensitive(state_match); node_set eof_node_ids; tree->get_eof_ids(eof_node_ids); // translate the eof_node_ids into a 0-based index std::map<node_id_t, node_id_t> eof_node_id_map; unsigned int x = 0; for (node_set::iterator node_id_it = eof_node_ids.begin(); node_id_it != eof_node_ids.
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -