// lexer.hpp
// Remainder of the regex grammar (the enclosing definition<> constructor
// begins before this excerpt).  Each semantic action pops operand node(s)
// off self.node_stack and pushes the resulting AST node.
//
// singleton2: optional postfix repetition operators after a singleton:
//   *, +, ?, {n}, {n,}, {n,m} -- or nothing (epsilon_p).
singleton2 =
ch_p('*')[make_star<ScannerT>(self.node_stack)]
>> singleton2
| ch_p('+')[make_plus<ScannerT>(self.node_stack)]
>> singleton2
| ch_p('?')[make_optional<ScannerT>(self.node_stack)]
>> singleton2
| ('{' >> uint_p >> '}')
[
make_rep1<ScannerT>(self.node_stack)
]
>> singleton2
| ('{' >> uint_p >> ',' >> '}')
[
make_rep2<ScannerT>(self.node_stack)
]
>> singleton2
| ('{' >> uint_p >> ',' >> uint_p >> '}')
[
make_rep3<ScannerT>(self.node_stack)
]
>> singleton2
// no repetition operator present
| epsilon_p
;
// Bracketed character class, optionally negated: [abc] / [^abc].
fullccl =
('[' >> !ch_p('^') >> ccl >> ']')
[
make_ccl<ScannerT>(self.node_stack)
]
;
// Class body: POSIX named expressions, or single chars / ranges (a-c).
ccl =
*(ccl_expr | (ccl_char >> !('-' >> ccl_char)))
;
// Any character legal inside a class except backslash, newline and ']',
// or an escape sequence.
ccl_char =
( (anychar_p - chset<>("\\\n]")) | escseq )
;
// NOTE(review): the comma-separated list below is the documented
// initializer syntax only if ccl_expr is a Spirit symbols<> table
// (its declaration is outside this excerpt; it is also absent from the
// debug-trace list further down, which is consistent with symbols<>).
// If ccl_expr were a plain rule<>, C++ comma-operator semantics would
// make "[:alnum:]" the entire rule body and silently discard the rest.
// Confirm against the declaration.
ccl_expr =
"[:alnum:]",
"[:alpha:]",
"[:blank:]",
"[:cntrl:]",
"[:digit:]",
"[:graph:]",
"[:lower:]",
"[:print:]",
"[:punct:]",
"[:space:]",
"[:upper:]",
"[:xdigit:]"
;
// Quoted-string contents: anything except quote/backslash, or escapes.
string =
+( (anychar_p - chset<>("\"\\")) | escseq )
;
// Octal escape digits: at most as many as fit into one char_t.
typedef
uint_parser<char_t, 8, 1,
std::numeric_limits<char_t>::digits / 3 + 1
> oct_parser_t;
// Hex escape digits: at most as many as fit into one char_t.
typedef
uint_parser<char_t, 16, 1,
std::numeric_limits<char_t>::digits / 4 + 1
> hex_parser_t;
// Escape sequence: backslash followed by octal digits, x/X + hex digits,
// or any single character other than newline.
escseq =
ch_p('\\')
>> (
oct_parser_t()
| as_lower_d['x'] >> hex_parser_t()
| (anychar_p - chset<>('\n'))
)
;
// Register the rules with Spirit's debug tracing when SLEX debugging is
// enabled; the helper macro is undefined again immediately afterwards.
#define BOOST_SLEX_DEBUG (BOOST_SPIRIT_DEBUG_FLAGS & BOOST_SPIRIT_DEBUG_FLAGS_SLEX)
BOOST_SPIRIT_DEBUG_TRACE_RULE(regex, BOOST_SLEX_DEBUG);
BOOST_SPIRIT_DEBUG_TRACE_RULE(re, BOOST_SLEX_DEBUG);
BOOST_SPIRIT_DEBUG_TRACE_RULE(series, BOOST_SLEX_DEBUG);
BOOST_SPIRIT_DEBUG_TRACE_RULE(singleton, BOOST_SLEX_DEBUG);
BOOST_SPIRIT_DEBUG_TRACE_RULE(singleton2, BOOST_SLEX_DEBUG);
BOOST_SPIRIT_DEBUG_TRACE_RULE(fullccl, BOOST_SLEX_DEBUG);
BOOST_SPIRIT_DEBUG_TRACE_RULE(ccl, BOOST_SLEX_DEBUG);
BOOST_SPIRIT_DEBUG_TRACE_RULE(string, BOOST_SLEX_DEBUG);
BOOST_SPIRIT_DEBUG_TRACE_RULE(escseq, BOOST_SLEX_DEBUG);
BOOST_SPIRIT_DEBUG_TRACE_RULE(ccl_char, BOOST_SLEX_DEBUG);
#undef BOOST_SLEX_DEBUG
}
// Spirit grammar entry point: parsing starts at the `regex` rule.
rule<ScannerT> const&
start() const { return regex; }
};
// Stack on which semantic actions build the regex AST; a reference to
// storage owned elsewhere (the lexer passes its own member, see below).
std::stack<node*> &node_stack;
}; // class lexer_grammar
// Parse one regex string with the given grammar and return the resulting
// AST with an eof_node concatenated at the end (so DFA construction can
// locate accepting positions).  Returns 0 when the string is not a valid
// regex (i.e. the grammar fails or leaves trailing input); in that case
// any partially built nodes are deleted first.  The caller owns the
// returned tree.
template <typename StringT>
inline node *
parse(lexer_grammar& g, StringT const& str)
{
    typedef
        scanner<typename StringT::const_iterator, scanner_policies<> >
        scanner_t;
    typedef
        typename rule<scanner_t>::template result<scanner_t>::type
        result_t;

    typename StringT::const_iterator begin = str.begin();
    typename StringT::const_iterator end = str.end();
    scanner_t scan(begin, end);

    result_t hit = g.parse(scan);
    if (!hit || !scan.at_end())
    {
        // Failure (or leftover input): reclaim whatever the semantic
        // actions already pushed before reporting the error.
        while (!g.node_stack.empty())
        {
            delete g.node_stack.top();
            g.node_stack.pop();
        }
        return 0;
    }

    // A successful parse leaves exactly one tree on the stack.
    BOOST_ASSERT(g.node_stack.size() == 1);
    node* tree = g.node_stack.top();
    g.node_stack.pop();

    // Append the explicit end-of-input marker.
    node* eof_marker = new eof_node();
    tree = new cat_node(tree, eof_marker);
    return tree;
}
// Merge the upper- and lower-case transition bits of every state so that
// the resulting DFA treats ASCII letters case-insensitively.
//
// TODO: this only covers the ASCII range 'A'..'Z' -- it does not handle
// non-English alphabets, and the approach is broken for this
// implementation of wide chars.
inline
void make_case_insensitive(state_match_t& state_match)
{
    typedef state_match_t::iterator state_iter;
    for (state_iter state = state_match.begin();
         state != state_match.end(); ++state)
    {
        for (int upper = 'A', lower = 'a'; upper <= 'Z'; ++upper, ++lower)
        {
            // if either case is enabled, enable both
            uchar const merged = uchar((*state)[upper] | (*state)[lower]);
            (*state)[upper] = merged;
            (*state)[lower] = merged;
        }
    }
}
// Compile-time dispatch on whether the iterator's value_type is wider
// than one byte; specialized below for narrow and wide characters.
template<bool wide_char>
struct regex_match_helper;
// Narrow-character DFA driver: one transition-table lookup per input
// byte, longest-match ("maximal munch") semantics.
template<>
struct regex_match_helper<false> // single byte char
{
// Run the DFA over [first, last).  On success returns true, advances
// `first` to one past the longest match and sets `regex_index` to the
// acceptance index of the matching regex.  If `token` is non-null, every
// character consumed during the scan is appended to it.
// NOTE(review): characters scanned beyond the final accepting position
// (and all characters on a failed match) are still appended to *token --
// confirm callers expect/compensate for this.
template <typename DfaT, typename IteratorT>
static bool
do_match(DfaT const& dfa, IteratorT &first, IteratorT const& last,
int& regex_index,
std::basic_string<
typename BOOST_SPIRIT_IT_NS::iterator_traits<IteratorT>::value_type
> *token)
{
typedef std::basic_string<
typename BOOST_SPIRIT_IT_NS::iterator_traits<IteratorT>::value_type
> string_type;
typedef typename string_type::size_type size_type;
// state 0 is the DFA start state
node_id_t s = 0;
node_id_t last_accepting_index = invalid_node;
IteratorT p = first;
IteratorT last_accepting_cpos = first;
// Advance through the DFA, remembering the most recent accepting
// state so the longest match wins.
while (p != last)
{
s = dfa.transition_table[s][(uchar)*p];
if (s == invalid_node)
break;
if (token) token->append((size_type)1, *p);
++p;
if (dfa.acceptance_index[s] != invalid_node)
{
last_accepting_index = s;
last_accepting_cpos = p;
}
}
if (last_accepting_index != invalid_node)
{
#if defined(BOOST_SPIRIT_DEBUG) && (BOOST_SPIRIT_DEBUG_FLAGS & BOOST_SPIRIT_DEBUG_FLAGS_SLEX)
std::cout << "dfa.acceptance_index[" << last_accepting_index << "] = " <<
dfa.acceptance_index[last_accepting_index] << '\n';
#endif
// commit: consume input only up to the last accepting position
first = last_accepting_cpos;
regex_index = dfa.acceptance_index[last_accepting_index];
return true;
}
else
return false;
}
};
// Wide-character DFA driver: each character is fed to the byte-indexed
// transition table one byte at a time via get_byte(); otherwise the same
// longest-match strategy as the narrow version above.
template<>
struct regex_match_helper<true> // wide char
{
// Run the DFA over [first, last).  On success returns true, advances
// `first` to one past the longest match and sets `regex_index` to the
// acceptance index of the matching regex.  If `token` is non-null, every
// character consumed during the scan is appended to it (see the note on
// the narrow-char version regarding over-appended characters).
template <typename DfaT, typename IteratorT>
static bool
do_match(DfaT const& dfa, IteratorT &first, IteratorT const& last,
int& regex_index,
std::basic_string<
typename BOOST_SPIRIT_IT_NS::iterator_traits<IteratorT>::value_type
> *token)
{
typedef
typename BOOST_SPIRIT_IT_NS::iterator_traits<IteratorT>::value_type
char_t;
typedef std::basic_string<char_t> string_type;
typedef typename string_type::size_type size_type;
// state 0 is the DFA start state
node_id_t s = 0;
node_id_t last_accepting_index = invalid_node;
IteratorT wp = first;
IteratorT last_accepting_cpos = first;
while (wp != last)
{
// feed the current character one byte at a time; a dead state in
// the middle of a character aborts the whole scan (goto below
// exits both loops).
for (unsigned int i = 0; i < sizeof(char_t); ++i)
{
s = dfa.transition_table[s][get_byte(*wp,i)];
if (s == invalid_node)
{
goto break_while;
}
}
if (token) token->append((size_type)1, *wp);
++wp;
// remember the most recent accepting state (longest match wins)
if (dfa.acceptance_index[s] != invalid_node)
{
last_accepting_index = s;
last_accepting_cpos = wp;
}
}
break_while:
if (last_accepting_index != invalid_node)
{
#if defined(BOOST_SPIRIT_DEBUG) && (BOOST_SPIRIT_DEBUG_FLAGS & BOOST_SPIRIT_DEBUG_FLAGS_SLEX)
std::cout << "dfa.acceptance_index[" << last_accepting_index << "] = " <<
dfa.acceptance_index[last_accepting_index] << '\n';
#endif
// commit: consume input only up to the last accepting position
first = last_accepting_cpos;
regex_index = dfa.acceptance_index[last_accepting_index];
return true;
}
else
return false;
}
};
template <typename DfaT, typename IteratorT, bool wide_char>
struct regex_match
{
static bool
do_match(DfaT const& dfa, IteratorT &first, IteratorT const& last,
int& regex_index,
std::basic_string<
typename BOOST_SPIRIT_IT_NS::iterator_traits<IteratorT>::value_type
> *token)
{
return regex_match_helper<wide_char>::do_match(
dfa, first, last, regex_index, token);
}
};
} // namespace lexerimpl
///////////////////////////////////////////////////////////////////////////////
//
// Table-driven lexer: regexes registered per lexer state are compiled
// into one DFA per state; next_token() then scans input with the DFA of
// the current state.  CallbackT, if supplied for a regex, is invoked on
// every match and may alter the token, switch states, or request that
// the match be skipped.
template <typename IteratorT = char const*, typename TokenT = int,
typename CallbackT = void(*)(IteratorT const &,
IteratorT &,
IteratorT const&,
TokenT const&,
lexer_control<TokenT> &)>
class lexer
{
public:
typedef CallbackT callback_t;
typedef
typename BOOST_SPIRIT_IT_NS::iterator_traits<IteratorT>::value_type
char_t;
// One registered regex: pattern text, token id to return on a match,
// and the (possibly empty) match callback.
struct regex_info
{
std::basic_string<char_t> str;
TokenT token;
CallbackT callback;
regex_info(const std::basic_string<char_t>& _str,
const TokenT& _token,
const CallbackT& _callback)
: str(_str)
, token(_token)
, callback(_callback)
{}
};
// Compiled DFA for a single lexer state: byte-indexed transition table
// plus, per DFA state, the index of the regex it accepts (or
// invalid_node).
struct dfa_table
{
std::vector<std::vector<node_id_t> > transition_table;
std::vector<node_id_t> acceptance_index;
};
typedef std::vector<node_id_t> node_table_t;
typedef std::vector<node_table_t> transition_table_t;
typedef std::vector<dfa_table> dfa_t;
// `states` = number of lexer states; state 0 is the initial state.
lexer(unsigned int states = 1);
// Add a regex to a state's table; id is returned when it matches.
void register_regex(const std::basic_string<char_t>& regex,
const TokenT& id, const CallbackT& cb = CallbackT(),
unsigned int state = 0);
// Scan the next token from [first, last); advances `first` past the
// match and optionally captures the matched text into *token.
TokenT next_token(IteratorT &first, IteratorT const &last,
std::basic_string<char_t> *token = 0);
// Compile all registered regexes into per-state DFAs.
void create_dfa();
bool has_compiled_dfa() { return m_compiled_dfa; }
// When enabled, the generated DFAs ignore ASCII letter case.
void set_case_insensitive(bool insensitive);
#if defined(BOOST_SPIRIT_DEBUG) && (BOOST_SPIRIT_DEBUG_FLAGS & BOOST_SPIRIT_DEBUG_FLAGS_SLEX)
void dump(std::ostream& out);
#endif
typedef std::vector<std::vector<regex_info> > regex_list_t;
// Persistence of compiled DFA tables; unique_id guards against loading
// tables saved by a different lexer configuration.
bool load (std::ifstream &in, long unique_id = 0);
bool save (std::ofstream &out, long unique_id = 0);
enum {
SLEX_SIGNATURE = 0x58454C53, // "SLEX"
SLEX_VERSION_100 = 0x0100, // persistance version
SLEX_LAST_KNOWN_VERSION = SLEX_VERSION_100,
SLEX_MINOR_VERSION_MASK = 0xFF
};
private:
void create_dfa_for_state(int state);
static bool regex_match(const dfa_t& dfa, IteratorT& first,
IteratorT& last, int& regex_index);
// AST build stack shared with the grammar (g references it).
mutable std::stack<lexerimpl::node*> node_stack;
lexerimpl::lexer_grammar g;
// mutable: DFA compilation is deferred until the first next_token().
mutable bool m_compiled_dfa;
mutable dfa_t m_dfa;
regex_list_t m_regex_list;
bool m_case_insensitive;
// current lexer state plus the push/pop stack callbacks manipulate.
unsigned int m_state;
std::stack<unsigned int> m_state_stack;
unsigned int m_num_states;
};
// Construct a lexer with `states` lexer states (state 0 is initial).
// The grammar `g` is wired to this object's node_stack so regex ASTs
// built during parsing accumulate there; DFA compilation is deferred
// (m_compiled_dfa = false) until first use.
template <typename IteratorT, typename TokenT, typename CallbackT>
inline
lexer<IteratorT, TokenT, CallbackT>::lexer(unsigned int states)
: g(node_stack)
, m_compiled_dfa(false)
, m_regex_list(states)
, m_case_insensitive(false)
, m_state(0)
, m_state_stack()
, m_num_states(states)
{
BOOST_SPIRIT_DEBUG_TRACE_NODE_NAME(g, "slex::lexer",
BOOST_SPIRIT_DEBUG_FLAGS & BOOST_SPIRIT_DEBUG_FLAGS_SLEX);
}
// Register `regex` in the given lexer state; `id` is the token returned
// when it matches and `callback` (optional) is invoked on each match.
// Grows the per-state table on demand when `state` names a state beyond
// those allocated at construction.
template <typename IteratorT, typename TokenT, typename CallbackT>
inline void
lexer<IteratorT, TokenT, CallbackT>::register_regex(
    const std::basic_string<char_t>& regex, const TokenT& id,
    const CallbackT& callback, unsigned int state)
{
    // Valid indices are [0, m_num_states).  The original test
    // `state > m_num_states` with `resize(state)` left
    // m_regex_list[state] one past the end whenever
    // state >= m_num_states; resize to state + 1 instead.
    if (state >= m_num_states) {
        m_regex_list.resize(state + 1);
        m_num_states = state + 1;
    }
    m_regex_list[state].push_back(regex_info(regex, id, callback));
}
// Scan the next token from [first, last) with the DFA of the current
// lexer state, compiling the DFAs first if necessary.  On a match,
// `first` is advanced past the lexeme, *token (if non-null) receives the
// matched text, the regex's callback (if any) runs, and the token id is
// returned.  On no match, -1 is returned.
template <typename IteratorT, typename TokenT, typename CallbackT>
inline TokenT
lexer<IteratorT, TokenT, CallbackT>::next_token(
    IteratorT &first, IteratorT const& last,
    std::basic_string<
        typename BOOST_SPIRIT_IT_NS::iterator_traits<IteratorT>::value_type
    > *token)
{
    if (!m_compiled_dfa)
    {
        create_dfa();
    }

    // Iterate instead of tail-recursing when a callback asks to ignore
    // the current token: the original recursive call could exhaust the
    // stack on long runs of skipped input (whitespace, comments, ...).
    for (;;)
    {
        IteratorT saved = first;
        int regex_index;

        if (!lexerimpl::regex_match<dfa_table, IteratorT, (sizeof(char_t) > 1)>::
                do_match(m_dfa[m_state], first, last, regex_index, token))
        {
            return -1; // TODO: can't return -1, need to return some invalid token.
                       //       how to figure this out? We can use traits I guess.
        }

        regex_info regex = m_regex_list[m_state][regex_index];
        TokenT rval = regex.token;
        if (regex.callback)
        {
            // execute corresponding callback; it may change rval/m_state
            // or request that this match be skipped entirely
            lexer_control<TokenT> controller(rval, m_state, m_state_stack);
            regex.callback(saved, first, last, regex.token, controller);
            if (controller.ignore_current_token_set()) {
                if (token)
                    token->erase();
                continue;   // scan again from the new position
            }
        }
        return rval;
    }
}
namespace lexerimpl
{
inline
bool find_acceptance_state(const node_set& eof_node_ids,
const node_set& current_set,
node_id_t& acceptance_node_id)
{
for(node_set::const_iterator nsi = eof_node_ids.begin();
nsi != eof_node_ids.end(); ++nsi)
{
node_id_t eof_node_id =*nsi;
if (current_set.end() != current_set.find(eof_node_id))
{
// store the first one we come to as the
// matched pattern
acceptance_node_id = eof_node_id;
// don't bother searching for more
return true;
}
}
return false;
}
// Parse every registered regex of one state into a single AST; throws
// bad_regex when the list is empty or any pattern fails to parse.  The
// loop that combines the remaining trees is truncated in this copy of
// the file (its body is missing below).
// NOTE(review): std::auto_ptr is deprecated and removed in C++17;
// migrate to std::unique_ptr when the toolchain baseline allows.
template <typename RegexListT, typename GrammarT>
inline std::auto_ptr<node>
parse_regexes(const RegexListT& regex_list, GrammarT& g)
{
// parse the expressions into a tree
if (regex_list.begin() == regex_list.end())
throw bad_regex();
// the first regex seeds the tree
typename RegexListT::const_iterator ri = regex_list.begin();
std::auto_ptr<node> tree(lexerimpl::parse(g, (*ri).str));
if (tree.get() == 0)
throw bad_regex();
++ri;
for (/**/; ri != regex_list.end(); ++ri)
// NOTE(review): the remainder of parse_regexes() -- the loop body that
// combines the per-regex trees -- was lost in this copy of the file and
// replaced by unrelated web-page text (a keyboard-shortcut help panel).
// Restore the missing code from the upstream Boost Spirit SLEX source.