// lexer.hpp
// Remainder of the regex grammar (the enclosing definition<> constructor
// begins before this excerpt).  Each semantic action pops operand node(s)
// off self.node_stack and pushes the resulting AST node.
//
// singleton2: optional postfix repetition operators after a singleton:
//   *, +, ?, {n}, {n,}, {n,m} -- or nothing (epsilon_p).
singleton2 =
ch_p('*')[make_star<ScannerT>(self.node_stack)]
>> singleton2
| ch_p('+')[make_plus<ScannerT>(self.node_stack)]
>> singleton2
| ch_p('?')[make_optional<ScannerT>(self.node_stack)]
>> singleton2
| ('{' >> uint_p >> '}')
[
make_rep1<ScannerT>(self.node_stack)
]
>> singleton2
| ('{' >> uint_p >> ',' >> '}')
[
make_rep2<ScannerT>(self.node_stack)
]
>> singleton2
| ('{' >> uint_p >> ',' >> uint_p >> '}')
[
make_rep3<ScannerT>(self.node_stack)
]
>> singleton2
// no repetition operator present
| epsilon_p
;
// Bracketed character class, optionally negated: [abc] / [^abc].
fullccl =
('[' >> !ch_p('^') >> ccl >> ']')
[
make_ccl<ScannerT>(self.node_stack)
]
;
// Class body: POSIX named expressions, or single chars / ranges (a-c).
ccl =
*(ccl_expr | (ccl_char >> !('-' >> ccl_char)))
;
// Any character legal inside a class except backslash, newline and ']',
// or an escape sequence.
ccl_char =
( (anychar_p - chset<>("\\\n]")) | escseq )
;
// NOTE(review): the comma-separated list below is the documented
// initializer syntax only if ccl_expr is a Spirit symbols<> table
// (its declaration is outside this excerpt; it is also absent from the
// debug-trace list further down, which is consistent with symbols<>).
// If ccl_expr were a plain rule<>, C++ comma-operator semantics would
// make "[:alnum:]" the entire rule body and silently discard the rest.
// Confirm against the declaration.
ccl_expr =
"[:alnum:]",
"[:alpha:]",
"[:blank:]",
"[:cntrl:]",
"[:digit:]",
"[:graph:]",
"[:lower:]",
"[:print:]",
"[:punct:]",
"[:space:]",
"[:upper:]",
"[:xdigit:]"
;
// Quoted-string contents: anything except quote/backslash, or escapes.
string =
+( (anychar_p - chset<>("\"\\")) | escseq )
;
// Octal escape digits: at most as many as fit into one char_t.
typedef
uint_parser<char_t, 8, 1,
std::numeric_limits<char_t>::digits / 3 + 1
> oct_parser_t;
// Hex escape digits: at most as many as fit into one char_t.
typedef
uint_parser<char_t, 16, 1,
std::numeric_limits<char_t>::digits / 4 + 1
> hex_parser_t;
// Escape sequence: backslash followed by octal digits, x/X + hex digits,
// or any single character other than newline.
escseq =
ch_p('\\')
>> (
oct_parser_t()
| as_lower_d['x'] >> hex_parser_t()
| (anychar_p - chset<>('\n'))
)
;
// Register the rules with Spirit's debug tracing when SLEX debugging is
// enabled; the helper macro is undefined again immediately afterwards.
#define BOOST_SLEX_DEBUG (BOOST_SPIRIT_DEBUG_FLAGS & BOOST_SPIRIT_DEBUG_FLAGS_SLEX)
BOOST_SPIRIT_DEBUG_TRACE_RULE(regex, BOOST_SLEX_DEBUG);
BOOST_SPIRIT_DEBUG_TRACE_RULE(re, BOOST_SLEX_DEBUG);
BOOST_SPIRIT_DEBUG_TRACE_RULE(series, BOOST_SLEX_DEBUG);
BOOST_SPIRIT_DEBUG_TRACE_RULE(singleton, BOOST_SLEX_DEBUG);
BOOST_SPIRIT_DEBUG_TRACE_RULE(singleton2, BOOST_SLEX_DEBUG);
BOOST_SPIRIT_DEBUG_TRACE_RULE(fullccl, BOOST_SLEX_DEBUG);
BOOST_SPIRIT_DEBUG_TRACE_RULE(ccl, BOOST_SLEX_DEBUG);
BOOST_SPIRIT_DEBUG_TRACE_RULE(string, BOOST_SLEX_DEBUG);
BOOST_SPIRIT_DEBUG_TRACE_RULE(escseq, BOOST_SLEX_DEBUG);
BOOST_SPIRIT_DEBUG_TRACE_RULE(ccl_char, BOOST_SLEX_DEBUG);
#undef BOOST_SLEX_DEBUG
}
// Spirit grammar entry point: parsing starts at the `regex` rule.
rule<ScannerT> const&
start() const { return regex; }
};
// Stack on which semantic actions build the regex AST; a reference to
// storage owned elsewhere (the lexer passes its own member, see below).
std::stack<node*> &node_stack;
}; // class lexer_grammar
// Parse one regex string with the given grammar and return the resulting
// AST with an eof_node concatenated at the end (so DFA construction can
// locate accepting positions).  Returns 0 when the string is not a valid
// regex (i.e. the grammar fails or leaves trailing input); in that case
// any partially built nodes are deleted first.  The caller owns the
// returned tree.
template <typename StringT>
inline node *
parse(lexer_grammar& g, StringT const& str)
{
    typedef
        scanner<typename StringT::const_iterator, scanner_policies<> >
        scanner_t;
    typedef
        typename rule<scanner_t>::template result<scanner_t>::type
        result_t;

    typename StringT::const_iterator begin = str.begin();
    typename StringT::const_iterator end = str.end();
    scanner_t scan(begin, end);

    result_t hit = g.parse(scan);
    if (!hit || !scan.at_end())
    {
        // Failure (or leftover input): reclaim whatever the semantic
        // actions already pushed before reporting the error.
        while (!g.node_stack.empty())
        {
            delete g.node_stack.top();
            g.node_stack.pop();
        }
        return 0;
    }

    // A successful parse leaves exactly one tree on the stack.
    BOOST_ASSERT(g.node_stack.size() == 1);
    node* tree = g.node_stack.top();
    g.node_stack.pop();

    // Append the explicit end-of-input marker.
    node* eof_marker = new eof_node();
    tree = new cat_node(tree, eof_marker);
    return tree;
}
// Merge the upper- and lower-case transition bits of every state so that
// the resulting DFA treats ASCII letters case-insensitively.
//
// TODO: this only covers the ASCII range 'A'..'Z' -- it does not handle
// non-English alphabets, and the approach is broken for this
// implementation of wide chars.
inline
void make_case_insensitive(state_match_t& state_match)
{
    typedef state_match_t::iterator state_iter;
    for (state_iter state = state_match.begin();
         state != state_match.end(); ++state)
    {
        for (int upper = 'A', lower = 'a'; upper <= 'Z'; ++upper, ++lower)
        {
            // if either case is enabled, enable both
            uchar const merged = uchar((*state)[upper] | (*state)[lower]);
            (*state)[upper] = merged;
            (*state)[lower] = merged;
        }
    }
}
// Compile-time dispatch on whether the iterator's value_type is wider
// than one byte; specialized below for narrow and wide characters.
template<bool wide_char>
struct regex_match_helper;
// Narrow-character DFA driver: one transition-table lookup per input
// byte, longest-match ("maximal munch") semantics.
template<>
struct regex_match_helper<false> // single byte char
{
// Run the DFA over [first, last).  On success returns true, advances
// `first` to one past the longest match and sets `regex_index` to the
// acceptance index of the matching regex.  If `token` is non-null, every
// character consumed during the scan is appended to it.
// NOTE(review): characters scanned beyond the final accepting position
// (and all characters on a failed match) are still appended to *token --
// confirm callers expect/compensate for this.
template <typename DfaT, typename IteratorT>
static bool
do_match(DfaT const& dfa, IteratorT &first, IteratorT const& last,
int& regex_index,
std::basic_string<
typename BOOST_SPIRIT_IT_NS::iterator_traits<IteratorT>::value_type
> *token)
{
typedef std::basic_string<
typename BOOST_SPIRIT_IT_NS::iterator_traits<IteratorT>::value_type
> string_type;
typedef typename string_type::size_type size_type;
// state 0 is the DFA start state
node_id_t s = 0;
node_id_t last_accepting_index = invalid_node;
IteratorT p = first;
IteratorT last_accepting_cpos = first;
// Advance through the DFA, remembering the most recent accepting
// state so the longest match wins.
while (p != last)
{
s = dfa.transition_table[s][(uchar)*p];
if (s == invalid_node)
break;
if (token) token->append((size_type)1, *p);
++p;
if (dfa.acceptance_index[s] != invalid_node)
{
last_accepting_index = s;
last_accepting_cpos = p;
}
}
if (last_accepting_index != invalid_node)
{
#if defined(BOOST_SPIRIT_DEBUG) && (BOOST_SPIRIT_DEBUG_FLAGS & BOOST_SPIRIT_DEBUG_FLAGS_SLEX)
std::cout << "dfa.acceptance_index[" << last_accepting_index << "] = " <<
dfa.acceptance_index[last_accepting_index] << '\n';
#endif
// commit: consume input only up to the last accepting position
first = last_accepting_cpos;
regex_index = dfa.acceptance_index[last_accepting_index];
return true;
}
else
return false;
}
};
// Wide-character DFA driver: each character is fed to the byte-indexed
// transition table one byte at a time via get_byte(); otherwise the same
// longest-match strategy as the narrow version above.
template<>
struct regex_match_helper<true> // wide char
{
// Run the DFA over [first, last).  On success returns true, advances
// `first` to one past the longest match and sets `regex_index` to the
// acceptance index of the matching regex.  If `token` is non-null, every
// character consumed during the scan is appended to it (see the note on
// the narrow-char version regarding over-appended characters).
template <typename DfaT, typename IteratorT>
static bool
do_match(DfaT const& dfa, IteratorT &first, IteratorT const& last,
int& regex_index,
std::basic_string<
typename BOOST_SPIRIT_IT_NS::iterator_traits<IteratorT>::value_type
> *token)
{
typedef
typename BOOST_SPIRIT_IT_NS::iterator_traits<IteratorT>::value_type
char_t;
typedef std::basic_string<char_t> string_type;
typedef typename string_type::size_type size_type;
// state 0 is the DFA start state
node_id_t s = 0;
node_id_t last_accepting_index = invalid_node;
IteratorT wp = first;
IteratorT last_accepting_cpos = first;
while (wp != last)
{
// feed the current character one byte at a time; a dead state in
// the middle of a character aborts the whole scan (goto below
// exits both loops).
for (unsigned int i = 0; i < sizeof(char_t); ++i)
{
s = dfa.transition_table[s][get_byte(*wp,i)];
if (s == invalid_node)
{
goto break_while;
}
}
if (token) token->append((size_type)1, *wp);
++wp;
// remember the most recent accepting state (longest match wins)
if (dfa.acceptance_index[s] != invalid_node)
{
last_accepting_index = s;
last_accepting_cpos = wp;
}
}
break_while:
if (last_accepting_index != invalid_node)
{
#if defined(BOOST_SPIRIT_DEBUG) && (BOOST_SPIRIT_DEBUG_FLAGS & BOOST_SPIRIT_DEBUG_FLAGS_SLEX)
std::cout << "dfa.acceptance_index[" << last_accepting_index << "] = " <<
dfa.acceptance_index[last_accepting_index] << '\n';
#endif
// commit: consume input only up to the last accepting position
first = last_accepting_cpos;
regex_index = dfa.acceptance_index[last_accepting_index];
return true;
}
else
return false;
}
};
template <typename DfaT, typename IteratorT, bool wide_char>
struct regex_match
{
static bool
do_match(DfaT const& dfa, IteratorT &first, IteratorT const& last,
int& regex_index,
std::basic_string<
typename BOOST_SPIRIT_IT_NS::iterator_traits<IteratorT>::value_type
> *token)
{
return regex_match_helper<wide_char>::do_match(
dfa, first, last, regex_index, token);
}
};
} // namespace lexerimpl
///////////////////////////////////////////////////////////////////////////////
//
// Table-driven lexer: regexes registered per lexer state are compiled
// into one DFA per state; next_token() then scans input with the DFA of
// the current state.  CallbackT, if supplied for a regex, is invoked on
// every match and may alter the token, switch states, or request that
// the match be skipped.
template <typename IteratorT = char const*, typename TokenT = int,
typename CallbackT = void(*)(IteratorT const &,
IteratorT &,
IteratorT const&,
TokenT const&,
lexer_control<TokenT> &)>
class lexer
{
public:
typedef CallbackT callback_t;
typedef
typename BOOST_SPIRIT_IT_NS::iterator_traits<IteratorT>::value_type
char_t;
// One registered regex: pattern text, token id to return on a match,
// and the (possibly empty) match callback.
struct regex_info
{
std::basic_string<char_t> str;
TokenT token;
CallbackT callback;
regex_info(const std::basic_string<char_t>& _str,
const TokenT& _token,
const CallbackT& _callback)
: str(_str)
, token(_token)
, callback(_callback)
{}
};
// Compiled DFA for a single lexer state: byte-indexed transition table
// plus, per DFA state, the index of the regex it accepts (or
// invalid_node).
struct dfa_table
{
std::vector<std::vector<node_id_t> > transition_table;
std::vector<node_id_t> acceptance_index;
};
typedef std::vector<node_id_t> node_table_t;
typedef std::vector<node_table_t> transition_table_t;
typedef std::vector<dfa_table> dfa_t;
// `states` = number of lexer states; state 0 is the initial state.
lexer(unsigned int states = 1);
// Add a regex to a state's table; id is returned when it matches.
void register_regex(const std::basic_string<char_t>& regex,
const TokenT& id, const CallbackT& cb = CallbackT(),
unsigned int state = 0);
// Scan the next token from [first, last); advances `first` past the
// match and optionally captures the matched text into *token.
TokenT next_token(IteratorT &first, IteratorT const &last,
std::basic_string<char_t> *token = 0);
// Compile all registered regexes into per-state DFAs.
void create_dfa();
bool has_compiled_dfa() { return m_compiled_dfa; }
// When enabled, the generated DFAs ignore ASCII letter case.
void set_case_insensitive(bool insensitive);
#if defined(BOOST_SPIRIT_DEBUG) && (BOOST_SPIRIT_DEBUG_FLAGS & BOOST_SPIRIT_DEBUG_FLAGS_SLEX)
void dump(std::ostream& out);
#endif
typedef std::vector<std::vector<regex_info> > regex_list_t;
// Persistence of compiled DFA tables; unique_id guards against loading
// tables saved by a different lexer configuration.
bool load (std::ifstream &in, long unique_id = 0);
bool save (std::ofstream &out, long unique_id = 0);
enum {
SLEX_SIGNATURE = 0x58454C53, // "SLEX"
SLEX_VERSION_100 = 0x0100, // persistance version
SLEX_LAST_KNOWN_VERSION = SLEX_VERSION_100,
SLEX_MINOR_VERSION_MASK = 0xFF
};
private:
void create_dfa_for_state(int state);
static bool regex_match(const dfa_t& dfa, IteratorT& first,
IteratorT& last, int& regex_index);
// AST build stack shared with the grammar (g references it).
mutable std::stack<lexerimpl::node*> node_stack;
lexerimpl::lexer_grammar g;
// mutable: DFA compilation is deferred until the first next_token().
mutable bool m_compiled_dfa;
mutable dfa_t m_dfa;
regex_list_t m_regex_list;
bool m_case_insensitive;
// current lexer state plus the push/pop stack callbacks manipulate.
unsigned int m_state;
std::stack<unsigned int> m_state_stack;
unsigned int m_num_states;
};
// Construct a lexer with `states` lexer states (state 0 is initial).
// The grammar `g` is wired to this object's node_stack so regex ASTs
// built during parsing accumulate there; DFA compilation is deferred
// (m_compiled_dfa = false) until first use.
template <typename IteratorT, typename TokenT, typename CallbackT>
inline
lexer<IteratorT, TokenT, CallbackT>::lexer(unsigned int states)
: g(node_stack)
, m_compiled_dfa(false)
, m_regex_list(states)
, m_case_insensitive(false)
, m_state(0)
, m_state_stack()
, m_num_states(states)
{
BOOST_SPIRIT_DEBUG_TRACE_NODE_NAME(g, "slex::lexer",
BOOST_SPIRIT_DEBUG_FLAGS & BOOST_SPIRIT_DEBUG_FLAGS_SLEX);
}
// Register `regex` in the given lexer state; `id` is the token returned
// when it matches and `callback` (optional) is invoked on each match.
// Grows the per-state table on demand when `state` names a state beyond
// those allocated at construction.
template <typename IteratorT, typename TokenT, typename CallbackT>
inline void
lexer<IteratorT, TokenT, CallbackT>::register_regex(
    const std::basic_string<char_t>& regex, const TokenT& id,
    const CallbackT& callback, unsigned int state)
{
    // Valid indices are [0, m_num_states).  The original test
    // `state > m_num_states` with `resize(state)` left
    // m_regex_list[state] one past the end whenever
    // state >= m_num_states; resize to state + 1 instead.
    if (state >= m_num_states) {
        m_regex_list.resize(state + 1);
        m_num_states = state + 1;
    }
    m_regex_list[state].push_back(regex_info(regex, id, callback));
}
// Scan the next token from [first, last) with the DFA of the current
// lexer state, compiling the DFAs first if necessary.  On a match,
// `first` is advanced past the lexeme, *token (if non-null) receives the
// matched text, the regex's callback (if any) runs, and the token id is
// returned.  On no match, -1 is returned.
template <typename IteratorT, typename TokenT, typename CallbackT>
inline TokenT
lexer<IteratorT, TokenT, CallbackT>::next_token(
    IteratorT &first, IteratorT const& last,
    std::basic_string<
        typename BOOST_SPIRIT_IT_NS::iterator_traits<IteratorT>::value_type
    > *token)
{
    if (!m_compiled_dfa)
    {
        create_dfa();
    }

    // Iterate instead of tail-recursing when a callback asks to ignore
    // the current token: the original recursive call could exhaust the
    // stack on long runs of skipped input (whitespace, comments, ...).
    for (;;)
    {
        IteratorT saved = first;
        int regex_index;

        if (!lexerimpl::regex_match<dfa_table, IteratorT, (sizeof(char_t) > 1)>::
                do_match(m_dfa[m_state], first, last, regex_index, token))
        {
            return -1; // TODO: can't return -1, need to return some invalid token.
                       //       how to figure this out? We can use traits I guess.
        }

        regex_info regex = m_regex_list[m_state][regex_index];
        TokenT rval = regex.token;
        if (regex.callback)
        {
            // execute corresponding callback; it may change rval/m_state
            // or request that this match be skipped entirely
            lexer_control<TokenT> controller(rval, m_state, m_state_stack);
            regex.callback(saved, first, last, regex.token, controller);
            if (controller.ignore_current_token_set()) {
                if (token)
                    token->erase();
                continue;   // scan again from the new position
            }
        }
        return rval;
    }
}
namespace lexerimpl
{
inline
bool find_acceptance_state(const node_set& eof_node_ids,
const node_set& current_set,
node_id_t& acceptance_node_id)
{
for(node_set::const_iterator nsi = eof_node_ids.begin();
nsi != eof_node_ids.end(); ++nsi)
{
node_id_t eof_node_id =*nsi;
if (current_set.end() != current_set.find(eof_node_id))
{
// store the first one we come to as the
// matched pattern
acceptance_node_id = eof_node_id;
// don't bother searching for more
return true;
}
}
return false;
}
// Parse every registered regex of one state into a single AST; throws
// bad_regex when the list is empty or any pattern fails to parse.  The
// loop that combines the remaining trees is truncated in this copy of
// the file (its body is missing below).
// NOTE(review): std::auto_ptr is deprecated and removed in C++17;
// migrate to std::unique_ptr when the toolchain baseline allows.
template <typename RegexListT, typename GrammarT>
inline std::auto_ptr<node>
parse_regexes(const RegexListT& regex_list, GrammarT& g)
{
// parse the expressions into a tree
if (regex_list.begin() == regex_list.end())
throw bad_regex();
// the first regex seeds the tree
typename RegexListT::const_iterator ri = regex_list.begin();
std::auto_ptr<node> tree(lexerimpl::parse(g, (*ri).str));
if (tree.get() == 0)
throw bad_regex();
++ri;
for (/**/; ri != regex_list.end(); ++ri)
// NOTE(review): the remainder of parse_regexes() -- the loop body that
// combines the per-regex trees -- was lost in this copy of the file and
// replaced by unrelated web-page text (a keyboard-shortcut help panel).
// Restore the missing code from the upstream Boost Spirit SLEX source.