lexer.hpp

Boost provides free peer-reviewed portable C++ source libraries. We emphasize libraries that work
HPP
第 1 页 / 共 5 页
字号:
上一页 1 2 3 45
                    +( (anychar_p - chset<>("\"\\")) | escseq )                ;            typedef                uint_parser<char_t, 8, 1,                    std::numeric_limits<char_t>::digits / 3 + 1                > oct_parser_t;            typedef                uint_parser<char_t, 16, 1,                    std::numeric_limits<char_t>::digits / 4 + 1                > hex_parser_t;            escseq =                    ch_p('\\')                    >>  (                            oct_parser_t()                        |   as_lower_d['x'] >> hex_parser_t()                        |   (anychar_p - chset<>('\n'))                        )                ;#define BOOST_SLEX_DEBUG (BOOST_SPIRIT_DEBUG_FLAGS & BOOST_SPIRIT_DEBUG_FLAGS_SLEX)            BOOST_SPIRIT_DEBUG_TRACE_RULE(regex, BOOST_SLEX_DEBUG);            BOOST_SPIRIT_DEBUG_TRACE_RULE(re, BOOST_SLEX_DEBUG);            BOOST_SPIRIT_DEBUG_TRACE_RULE(series, BOOST_SLEX_DEBUG);            BOOST_SPIRIT_DEBUG_TRACE_RULE(singleton, BOOST_SLEX_DEBUG);            BOOST_SPIRIT_DEBUG_TRACE_RULE(singleton2, BOOST_SLEX_DEBUG);            BOOST_SPIRIT_DEBUG_TRACE_RULE(fullccl, BOOST_SLEX_DEBUG);            BOOST_SPIRIT_DEBUG_TRACE_RULE(ccl, BOOST_SLEX_DEBUG);            BOOST_SPIRIT_DEBUG_TRACE_RULE(string, BOOST_SLEX_DEBUG);            BOOST_SPIRIT_DEBUG_TRACE_RULE(escseq, BOOST_SLEX_DEBUG);            BOOST_SPIRIT_DEBUG_TRACE_RULE(ccl_char, BOOST_SLEX_DEBUG);            #undef BOOST_SLEX_DEBUG        }        rule<ScannerT> const&        start() const { return regex; }    };    std::stack<node*> &node_stack;}; // class lexer_grammartemplate <typename StringT>inline node *parse(lexer_grammar& g, StringT const& str){    typedef        scanner<typename StringT::const_iterator, scanner_policies<> >        scanner_t;    typedef typename rule<scanner_t>::template result<scanner_t>::type        result_t;            typename StringT::const_iterator first = str.begin();    typename StringT::const_iterator last = str.end();    scanner_t scan(first, last);//    typename rule<scanner_t>::result_t hit = g.parse(scan);    result_t hit = g.parse(scan);    if (!hit || !scan.at_end())    {        while (g.node_stack.size())        {            delete g.node_stack.top();            g.node_stack.pop();        }        return 0;    }    BOOST_ASSERT(g.node_stack.size() == 1);    node* rval = g.node_stack.top();    g.node_stack.pop();    node* an_eof_node = new eof_node();    rval = new cat_node(rval, an_eof_node);    return rval;}inlinevoid make_case_insensitive(state_match_t& state_match){    // TODO: Fix this.    // This doesn't take into account foreign languages, figure out how to    // do that. Also this approach is broken for this implementation of    // wide chars.    for (state_match_t::iterator iter = state_match.begin();            iter != state_match.end(); ++iter)    {        int i, j;        for (i = 'A', j = 'a'; i <= 'Z'; ++i, ++j)        {            // if either is set, turn them both on            (*iter)[i] = (*iter)[j] = uchar((*iter)[i] | (*iter)[j]);        }    }}template<bool wide_char>struct regex_match_helper;template<>struct regex_match_helper<false> // single byte char{    template <typename DfaT, typename IteratorT>    static bool    do_match(DfaT const& dfa, IteratorT &first, IteratorT const& last,        int& regex_index,             std::basic_string<            typename BOOST_SPIRIT_IT_NS::iterator_traits<IteratorT>::value_type        > *token)    {        typedef std::basic_string<            typename BOOST_SPIRIT_IT_NS::iterator_traits<IteratorT>::value_type        > string_type;        typedef typename string_type::size_type size_type;                node_id_t s = 0;        node_id_t last_accepting_index = invalid_node;        IteratorT p = first;        IteratorT last_accepting_cpos = first;        while (p != last)        {            s = dfa.transition_table[s][(uchar)*p];            if (s == invalid_node)                break;            if (token) token->append((size_type)1, *p);            ++p;            if (dfa.acceptance_index[s] != invalid_node)            {                last_accepting_index = s;                last_accepting_cpos = p;            }        }        if (last_accepting_index != invalid_node)        {#if defined(BOOST_SPIRIT_DEBUG) && (BOOST_SPIRIT_DEBUG_FLAGS & BOOST_SPIRIT_DEBUG_FLAGS_SLEX)            std::cout << "dfa.acceptance_index[" << last_accepting_index << "] = " <<                dfa.acceptance_index[last_accepting_index] << '\n';#endif            first = last_accepting_cpos;            regex_index = dfa.acceptance_index[last_accepting_index];            return true;        }        else            return false;    }};template<>struct regex_match_helper<true> // wide char{    template <typename DfaT, typename IteratorT>    static bool    do_match(DfaT const& dfa, IteratorT &first, IteratorT const& last,        int& regex_index,        std::basic_string<            typename BOOST_SPIRIT_IT_NS::iterator_traits<IteratorT>::value_type        > *token)    {        typedef            typename BOOST_SPIRIT_IT_NS::iterator_traits<IteratorT>::value_type            char_t;        typedef std::basic_string<char_t> string_type;        typedef typename string_type::size_type size_type;        node_id_t s = 0;        node_id_t last_accepting_index = invalid_node;        IteratorT wp = first;        IteratorT last_accepting_cpos = first;        while (wp != last)        {            for (unsigned int i = 0;  i < sizeof(char_t); ++i)            {                s = dfa.transition_table[s][get_byte(*wp,i)];                if (s == invalid_node)                {                    goto break_while;                }            }            if (token) token->append((size_type)1, *wp);            ++wp;            if (dfa.acceptance_index[s] != invalid_node)            {                last_accepting_index = s;                last_accepting_cpos = wp;            }        }    break_while:        if (last_accepting_index != invalid_node)        {#if defined(BOOST_SPIRIT_DEBUG) && (BOOST_SPIRIT_DEBUG_FLAGS & BOOST_SPIRIT_DEBUG_FLAGS_SLEX)            std::cout << "dfa.acceptance_index[" << last_accepting_index << "] = " <<                dfa.acceptance_index[last_accepting_index] << '\n';#endif            first = last_accepting_cpos;            regex_index = dfa.acceptance_index[last_accepting_index];            return true;        }        else            return false;    }};template <typename DfaT, typename IteratorT, bool wide_char>struct regex_match{    static bool    do_match(DfaT const& dfa, IteratorT &first, IteratorT const& last,        int& regex_index,        std::basic_string<            typename BOOST_SPIRIT_IT_NS::iterator_traits<IteratorT>::value_type        > *token)    {        return regex_match_helper<wide_char>::do_match(            dfa, first, last, regex_index, token);    }};} // namespace lexerimpl/////////////////////////////////////////////////////////////////////////////////template <typename IteratorT = char const*, typename TokenT = int,          typename CallbackT = void(*)(IteratorT const &,                                       IteratorT &,                                       IteratorT const&,                                       TokenT const&,                                       lexer_control<TokenT> &)>class lexer{public:    typedef CallbackT callback_t;    typedef        typename BOOST_SPIRIT_IT_NS::iterator_traits<IteratorT>::value_type        char_t;    struct regex_info    {        std::basic_string<char_t>   str;        TokenT                      token;        CallbackT                   callback;        regex_info(const std::basic_string<char_t>& _str,                const TokenT& _token,                const CallbackT& _callback)            : str(_str)            , token(_token)            , callback(_callback)            {}    };    struct dfa_table    {        std::vector<std::vector<node_id_t> >    transition_table;        std::vector<node_id_t>                  acceptance_index;    };    typedef std::vector<node_id_t> node_table_t;    typedef std::vector<node_table_t> transition_table_t;    typedef std::vector<dfa_table> dfa_t;    lexer(unsigned int states = 1);    void register_regex(const std::basic_string<char_t>& regex,            const TokenT& id, const CallbackT& cb = CallbackT(),            unsigned int state = 0);    TokenT next_token(IteratorT &first, IteratorT const &last,        std::basic_string<char_t> *token = 0);    void create_dfa();    bool has_compiled_dfa() { return m_compiled_dfa; }        void set_case_insensitive(bool insensitive);#if defined(BOOST_SPIRIT_DEBUG) && (BOOST_SPIRIT_DEBUG_FLAGS & BOOST_SPIRIT_DEBUG_FLAGS_SLEX)    void dump(std::ostream& out);#endif    typedef std::vector<std::vector<regex_info> > regex_list_t;    bool load (std::ifstream &in, long unique_id = 0);    bool save (std::ofstream &out, long unique_id = 0);    enum {        SLEX_SIGNATURE = 0x58454C53,    // "SLEX"        SLEX_VERSION_100 = 0x0100,      // persistance version        SLEX_LAST_KNOWN_VERSION = SLEX_VERSION_100,        SLEX_MINOR_VERSION_MASK = 0xFF    };private:    void create_dfa_for_state(int state);    static bool regex_match(const dfa_t& dfa, IteratorT& first,        IteratorT& last, int& regex_index);    mutable std::stack<lexerimpl::node*> node_stack;    lexerimpl::lexer_grammar g;    mutable bool m_compiled_dfa;    mutable dfa_t m_dfa;    regex_list_t m_regex_list;    bool m_case_insensitive;    unsigned int m_state;    std::stack<unsigned int> m_state_stack;    unsigned int m_num_states;};template <typename IteratorT, typename TokenT, typename CallbackT>inlinelexer<IteratorT, TokenT, CallbackT>::lexer(unsigned int states)    : g(node_stack)    , m_compiled_dfa(false)    , m_regex_list(states)    , m_case_insensitive(false)    , m_state(0)    , m_state_stack()    , m_num_states(states){    BOOST_SPIRIT_DEBUG_TRACE_NODE_NAME(g, "slex::lexer",         BOOST_SPIRIT_DEBUG_FLAGS & BOOST_SPIRIT_DEBUG_FLAGS_SLEX);}template <typename IteratorT, typename TokenT, typename CallbackT>inline voidlexer<IteratorT, TokenT, CallbackT>::register_regex(        const std::basic_string<char_t>& regex, const TokenT& id,        const CallbackT& callback, unsigned int state){    if (state > m_num_states) {        m_regex_list.resize(state);        m_num_states = state;    }    m_regex_list[state].push_back(regex_info(regex, id, callback));}template <typename IteratorT, typename TokenT, typename CallbackT>inline TokenTlexer<IteratorT, TokenT, CallbackT>::next_token(    IteratorT &first, IteratorT const& last,    std::basic_string<        typename BOOST_SPIRIT_IT_NS::iterator_traits<IteratorT>::value_type    > *token){    if (!m_compiled_dfa)    {        create_dfa();    }    IteratorT saved = first;    int regex_index;    if (!lexerimpl::regex_match<dfa_table, IteratorT, (sizeof(char_t) > 1)>::            do_match(m_dfa[m_state], first, last, regex_index, token))        return -1;  // TODO: can't return -1, need to return some invalid token.    // how to figure this out?  We can use traits I guess.    else    {        regex_info regex = m_regex_list[m_state][regex_index];        TokenT rval = regex.token;        if (regex.callback)        {            // execute corresponding callback            lexer_control<TokenT> controller(rval, m_state, m_state_stack);            regex.callback(saved, first, last, regex.token, controller);            if (controller.ignore_current_token_set()) {                if (token)                    token->erase();                return next_token(first, last, token);            }        }        return rval;    }}namespace lexerimpl{inlinebool find_acceptance_state(const node_set& eof_node_ids,        const node_set& current_set,        node_id_t& acceptance_node_id){    for(node_set::const_iterator nsi = eof_node_ids.begin();            nsi != eof_node_ids.end(); ++nsi)    {        node_id_t eof_node_id =*nsi;        if (current_set.end() != current_set.find(eof_node_id))        {            // store the first one we come to as the            // matched pattern            acceptance_node_id = eof_node_id;            // don't bother searching for more            return true;        }    }    return false;}template <typename RegexListT, typename GrammarT>inline std::auto_ptr<node>parse_regexes(const RegexListT& regex_list, GrammarT& g){    // parse the expressions into a tree    if (regex_list.begin() == regex_list.end())        boost::throw_exception(bad_regex());    typename RegexListT::const_iterator ri = regex_list.begin();    std::auto_ptr<node> tree(lexerimpl::parse(g, (*ri).str));    if (tree.get() == 0)        boost::throw_exception(bad_regex());    ++ri;    for (/**/; ri != regex_list.end(); ++ri)    {        std::auto_ptr<node> next_tree(lexerimpl::parse(g, (*ri).str));        if (next_tree.get() == 0)            boost::throw_exception(bad_regex());        std::auto_ptr<node> newnode(new or_node(tree.release(), next_tree.release()));        tree = newnode;    }    return tree;}} //namespace lexerimpltemplate <typename IteratorT, typename TokenT, typename CallbackT>inline voidlexer<IteratorT, TokenT, CallbackT>::create_dfa(){    m_dfa.resize(m_num_states);    for (unsigned int i = 0; i < m_num_states; ++i)        create_dfa_for_state(i);}// Algorithm from Compilers: Principles, Techniques, and Tools p. 141template <typename IteratorT, typename TokenT, typename CallbackT>inline voidlexer<IteratorT, TokenT, CallbackT>::create_dfa_for_state(int state){    using lexerimpl::node;    std::auto_ptr<node> tree = lexerimpl::parse_regexes(m_regex_list[state], g);    node_id_t dummy = 0;    tree->assign_node_ids(dummy);#if defined(BOOST_SPIRIT_DEBUG) && (BOOST_SPIRIT_DEBUG_FLAGS & BOOST_SPIRIT_DEBUG_FLAGS_SLEX)    tree->dump(std::cout);#endif    // compute followpos(root)    followpos_t followpos;    tree->compute_followpos(followpos);    // the dfa states <-> nfa state groups    std::map<node_set, node_id_t> dstates1;    std::map<node_id_t, node_set> dstates2;    // the dfa transitions    m_dfa[state].transition_table.push_back(            std::vector<node_id_t>(256, invalid_node));    m_dfa[state].acceptance_index.push_back(invalid_node);    // whether the dfa state has been processed yet    std::vector<node_id_t> marked;    // used to give a unique id to each dfa state    node_id_t num_states = 0;    // initially, the only unmarked state in Dstates is firstpos(root).    marked.push_back(0);    node_set fpr = tree->firstpos();    dstates1[fpr] = 0;    dstates2[0] = fpr;    state_match_t state_match;    tree->compute_state_match(state_match);    if (m_case_insensitive)        lexerimpl::make_case_insensitive(state_match);    node_set eof_node_ids;    tree->get_eof_ids(eof_node_ids);    // translate the eof_node_ids into a 0-based index    std::map<node_id_t, node_id_t> eof_node_id_map;    unsigned int x = 0;    for (node_set::iterator node_id_it = eof_node_ids.begin();            node_id_it != eof_node_ids.
上一页 1 2 3 45
⌨️ 快捷键说明

复制代码 Ctrl + C
搜索代码 Ctrl + F
全屏模式 F11
切换主题 Ctrl + Shift + D
显示快捷键 ?
增大字号 Ctrl + =
减小字号 Ctrl + -