📄 regexp.h
字号:
/* * ASTL - the Automaton Standard Template Library. * C++ generic components for Finite State Automata handling. * Copyright (C) 2000-2003 Vincent Le Maout (vincent.lemaout@chello.fr). * * This library is free software; you can redistribute it and/or * modify it under the terms of the GNU Lesser General Public * License as published by the Free Software Foundation; either * version 2.1 of the License, or (at your option) any later version. * * This library is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU * Lesser General Public License for more details. * * You should have received a copy of the GNU Lesser General Public * License along with this library; if not, write to the Free Software * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA * */#ifndef ASTL_REGEXP_H#define ASTL_REGEXP_H#include <astl.h>#include <tools.h>#include <concept.h>#include <vector>#include <iostream>#include <set>#include <iterator>#include <cctype> // isdigit#include <bitset>using namespace std;ASTL_BEGIN_NAMESPACE// regular expressions grammar:// exp -> term exp2// exp2 -> + term exp2 | epsilon// term -> form repeat term2// term2 -> . form repeat term2 | epsilon// repeat -> * | ? | + | epsilon// form -> ( exp ) | letter class regexp_cursor : public cursor_concept{protected: struct node; typedef set<node*> S; typedef enum { STAR, CONCAT, OR, REPEAT, EPSILON, LETTER, OPEN, CLOSE, QUESTION, PLUS, ANY, FINAL } token_type; struct node { token_type type; int n, m; // repeat operator bitset<256> letter; bool annulable; const char *pos; S premiere_pos, derniere_pos, pos_suivante; node(token_type t, const char *p = 0, int n_ = 0, int m_ = 0) : type(t), n(n_), m(m_), pos(p) { if (type == ANY) { type = LETTER; letter.set(); } } node(char t, const char *p = 0) : type(LETTER), annulable(false), pos(p) { letter.set((unsigned char) t); } node(const bitset<256> &t, const char *p = 0) : type(LETTER), letter(t), annulable(false), pos(p) { } bool final() const { return type == FINAL; } }; friend std::ostream& operator<<(std::ostream &, const regexp_cursor::node &); S q; // current state smart_ptr<vector<node> > e; // node "allocator" string errmsg; node* exp(vector<node>::iterator &first, vector<node>::iterator last); node* form(vector<node>::iterator &first, vector<node>::iterator last); node* term(vector<node>::iterator &first, vector<node>::iterator last); node* exp2(vector<node>::iterator &first, vector<node>::iterator last, node *left); node* term2(vector<node>::iterator &first, vector<node>::iterator last, node *left); node* repeat(vector<node>::iterator &first, vector<node>::iterator last, node *left); template <typename OutputIterator> int tokenize(const char *first, const char *last, OutputIterator x, bool kmp_style = false);public: int errpos; typedef regexp_cursor self; typedef S state_type; typedef plain char_traits; typedef char_traits::char_type char_type; typedef empty_tag tag_type; regexp_cursor(const char *expression = ".", bool kmp_style = false, bool verbose = false) : errpos(-1) { e->reserve(8); tokenize(expression, expression + strlen(expression), back_inserter(*e), kmp_style); if (verbose) { cout << "tokenized expression:" << endl; copy(e->begin(), e->end(), ostream_iterator<node>(cout)); cout << endl; } vector<node>::iterator first = e->begin(); node *root = exp(first, e->end()); if (root == 0) errpos = first->pos - expression; else q = root->premiere_pos; } int error() const { // possible error position (-1 if expression parsing is ok) return errpos; } const string& error_message() const { return errmsg; } self& operator=(const state_type &s) { q = s; return *this; } state_type src() const { return q; } bool forward(char letter) { state_type p; for(S::const_iterator i = q.begin(); i != q.end(); ++i) if ((*i)->letter[(unsigned char) letter] == true) p.insert((*i)->pos_suivante.begin(), (*i)->pos_suivante.end()); q.swap(p); // optimized equivalent to q = p; return !q.empty(); } bool src_final() const { for(S::const_iterator i = q.begin(); i != q.end(); ++i) if ((*i)->final()) return true; return false; } bool sink() const { return q.empty(); } tag_type src_tag() const { return tag_type(); }};std::ostream& operator<<(std::ostream &out, const regexp_cursor::node &x) { switch(x.type) { case regexp_cursor::LETTER : out << "["; if (x.letter.count() == 256) // Sigma out << "Sigma"; else for(int i = 32; i < 256; ++i) if (x.letter[i] == true) out << (char) i; out << "]"; break; case regexp_cursor::OR : out << "|"; break; case regexp_cursor::FINAL : out << "[EoE]"; break; case regexp_cursor::OPEN : out << "("; break; case regexp_cursor::CLOSE : out << ")"; break; case regexp_cursor::CONCAT : out << " "; break; case regexp_cursor::STAR : out << "*"; break; case regexp_cursor::QUESTION : out << "?"; break; case regexp_cursor::PLUS : out << "+"; break; case regexp_cursor::REPEAT : out << "{" << x.n << "," << x.m << "}"; break; default : break; } return out;}// Return error position or -1 if oktemplate <typename OutputIterator>int regexp_cursor::tokenize(const char *first, const char *last, OutputIterator x, bool kmp_style){ *x++ = node(OPEN); if (kmp_style) { // append Sigma* *x++ = node(ANY); *x++ = node(STAR); *x++ = node(CONCAT); } for(const char *begin = first; first != last; ++first, ++x) switch (*first) { case '|' : *x = node(OR, first); break; case '(' : if (first != begin && first[-1] != '(' && first[-1] != '|') *x++ = node(CONCAT, first); *x = node(OPEN, first); break; case ')' : *x = node(CLOSE, first); break; case '*' : *x = node(STAR, first); break; case '?' : *x = node(QUESTION, first); break; case '+' : *x = node(PLUS, first); break; /* not yet implemented case '{' : if (++first == last) return first - begin; { int n = 0; const char *tmp = first; while (isdigit(*first)) { n = n * 10 + *first - '0'; if (++first == last) return first - begin; } if (tmp == first) return first - begin; int m = n; if (*first == ',') { if (++first == last) return first - begin; tmp = first; for(m = 0; isdigit(*first);) { m = m * 10 + *first - '0'; if (++first == last) return first - begin; } if (tmp == first) return first - begin; } if (*first != '}') return first - begin; if (n > m) return first - begin; *x = node(REPEAT, first, n, m); } break; */ case '[' : // characters set { if (first != begin && first[-1] != '|' && first[-1] != '(') *x++ = node(CONCAT, first); bitset<256> a; bool v = true; if (++first == last) return first - begin; if (*first == '^') { a.set(); v = false; if (++first == last) return first - begin; } while (first != last && *first != ']') { char b = *first; if (++first == last) break; if (*first != '-') { a.set((unsigned char) b, v); continue; } if (++first == last || *first == ']' || *first < b) return first - begin; for(unsigned int i = (unsigned char) b; i <= (unsigned char) *first; ++i) a.set(i, v); ++first; } if (first == last) return first - begin; // first is supposed to point to ']' *x++ = node(a); break; } // Escaped character: case '\\' : if (first != begin && first[-1] != '|' && first[-1] != '(') *x++ = node(CONCAT, first); if (++first == last) return first - begin; switch (*first) { case 'n' : *x = node('\n', first); break; case 't' : *x = node('\t', first); break; case 'r' : *x = node('\r', first); break; default : *x = node(*first, first); } break; default : if (first != begin && first[-1] != '|' && first[-1] != '(') *x++ = node(CONCAT, first); if (*first == '.') *x = node(ANY, first); else *x = node(*first, first); break; } *x++ = node(CLOSE); *x++ = node(CONCAT); *x++ = node(FINAL); return 0;}// exp -> term exp2 regexp_cursor::node*regexp_cursor::exp(vector<regexp_cursor::node>::iterator &first, vector<regexp_cursor::node>::iterator last){ if (first == last) { errmsg = "empty expression"; return 0; } node *t = term(first, last); if (t == 0) return 0; return exp2(first, last, t);}// exp2 -> + term exp2 | epsilonregexp_cursor::node*regexp_cursor::exp2(vector<regexp_cursor::node>::iterator &first, vector<regexp_cursor::node>::iterator last, node *left){ if (first != last && first->type == OR) { node &root = *first; node *right = term(++first, last); if (right == 0) return 0; root.annulable = right->annulable || left->annulable; // premiere_pos(here) = premiere_pos(left) U premiere_pos(right): root.premiere_pos.insert(left->premiere_pos.begin(), left->premiere_pos.end()); root.premiere_pos.insert(right->premiere_pos.begin(), right->premiere_pos.end()); root.derniere_pos.insert(left->derniere_pos.begin(), left->derniere_pos.end()); root.derniere_pos.insert(right->derniere_pos.begin(), right->derniere_pos.end()); return exp2(first, last, &root); } return left;}// term -> form repeat term2regexp_cursor::node*regexp_cursor::term(vector<regexp_cursor::node>::iterator &first, vector<regexp_cursor::node>::iterator last){ if (first == last) return 0; node *f = form(first, last); if (f == 0) return 0; node *s = repeat(first, last, f); // return f if no repeat token is found return term2(first, last, s);}// repeat -> * | ? | + | {n,m} | epsilonregexp_cursor::node*regexp_cursor::repeat(vector<regexp_cursor::node>::iterator &first, vector<regexp_cursor::node>::iterator last, node *left){ if (first != last) switch (first->type) { case STAR : { node &root = *first; root.annulable = true; root.premiere_pos = left->premiere_pos; root.derniere_pos = left->derniere_pos; // pos_suivante(derniere_pos(here)) += premiere_pos(here): for(S::const_iterator i = root.derniere_pos.begin(); i != root.derniere_pos.end(); ++i) (*i)->pos_suivante.insert(root.premiere_pos.begin(), root.premiere_pos.end()); ++first; return &root; } case QUESTION : { node &root = *first; root.annulable = true; root.premiere_pos = left->premiere_pos; root.derniere_pos = left->derniere_pos; ++first; return &root; } case PLUS : { node &root = *first; root.annulable = false; root.premiere_pos = left->premiere_pos; root.derniere_pos = left->derniere_pos; // pos_suivante(derniere_pos(here)) += premiere_pos(here): for(S::const_iterator i = root.derniere_pos.begin(); i != root.derniere_pos.end(); ++i) (*i)->pos_suivante.insert(root.premiere_pos.begin(), root.premiere_pos.end()); ++first; return &root; } default : return left; } return left;} // term2 -> . form repeat term2 | epsilonregexp_cursor::node*regexp_cursor::term2(vector<regexp_cursor::node>::iterator &first, vector<regexp_cursor::node>::iterator last, node *left){ if (first != last && first->type == CONCAT) { node &root = *first; node *f = form(++first, last); if (f == 0) return 0; node *right = repeat(first, last, f); root.annulable = left->annulable && right->annulable; if (left->annulable) { root.premiere_pos.insert(left->premiere_pos.begin(), left->premiere_pos.end()); root.premiere_pos.insert(right->premiere_pos.begin(), right->premiere_pos.end()); } else root.premiere_pos = left->premiere_pos; if (right->annulable) { root.derniere_pos.insert(right->derniere_pos.begin(), right->derniere_pos.end()); root.derniere_pos.insert(left->derniere_pos.begin(), left->derniere_pos.end()); } else root.derniere_pos = right->derniere_pos; // pos_suivante(derniere_pos(left)) += premiere_pos(right): for(S::const_iterator i = left->derniere_pos.begin(); i != left->derniere_pos.end(); ++i) (*i)->pos_suivante.insert(right->premiere_pos.begin(), right->premiere_pos.end()); return term2(first, last, &root); } return left;}// form -> ( exp ) | letterregexp_cursor::node*regexp_cursor::form(vector<regexp_cursor::node>::iterator &first, vector<regexp_cursor::node>::iterator last){ if (first == last) return 0; if (first->type == OPEN) { node *e = exp(++first, last); if (e == 0) return 0; if (first->type == CLOSE) ++first; else { errmsg = "unbalanced parenthesis"; return 0; } return e; } first->annulable = false; first->premiere_pos.insert(&*first); first->derniere_pos.insert(&*first); ++first; return &(first[-1]);}inlineregexp_cursor regexpc(const string &exp, bool kmp_style) { return regexp_cursor(exp.c_str(), kmp_style);}inlineregexp_cursor regexpc(const string &exp) { return regexp_cursor(exp.c_str());}inlineregexp_cursor regexpc() { return regexp_cursor();}ASTL_END_NAMESPACE#endif
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -