⭐ 欢迎来到虫虫下载站! | 📦 资源下载 📁 资源专辑 ℹ️ 关于我们
⭐ 虫虫下载站

📄 regexp.h

📁 一个类似STL的自动机的源代码库
💻 H
字号:
/* * ASTL - the Automaton Standard Template Library. * C++ generic components for Finite State Automata handling. * Copyright (C) 2000-2003 Vincent Le Maout (vincent.lemaout@chello.fr). *  * This library is free software; you can redistribute it and/or * modify it under the terms of the GNU Lesser General Public * License as published by the Free Software Foundation; either * version 2.1 of the License, or (at your option) any later version. *  * This library is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU * Lesser General Public License for more details. *  * You should have received a copy of the GNU Lesser General Public * License along with this library; if not, write to the Free Software * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA * */#ifndef ASTL_REGEXP_H#define ASTL_REGEXP_H#include <astl.h>#include <tools.h>#include <concept.h>#include <vector>#include <iostream>#include <set>#include <iterator>#include <cctype>      // isdigit#include <bitset>using namespace std;ASTL_BEGIN_NAMESPACE// regular expressions grammar:// exp    -> term exp2// exp2   -> + term exp2 | epsilon// term   -> form repeat term2// term2  -> . form repeat term2 | epsilon// repeat -> * | ? | + | epsilon// form   -> ( exp ) | letter class regexp_cursor : public cursor_concept{protected:  struct node;  typedef set<node*> S;  typedef enum { STAR, CONCAT, OR, REPEAT, EPSILON, 		 LETTER, OPEN, CLOSE, QUESTION, PLUS,		 ANY, FINAL } token_type;    struct node {    token_type type;    int n, m;        // repeat operator    bitset<256> letter;    bool annulable;    const char *pos;    S premiere_pos, derniere_pos, pos_suivante;        node(token_type t, const char *p = 0, int n_ = 0, int m_ = 0)      : type(t), n(n_), m(m_), pos(p) {       if (type == ANY) {	type = LETTER;	letter.set();      }    }    node(char t, const char *p = 0)      : type(LETTER), annulable(false), pos(p) {       letter.set((unsigned char) t);    }    node(const bitset<256> &t, const char *p = 0)      : type(LETTER), letter(t), annulable(false), pos(p)     { }    bool final() const {      return type == FINAL;    }  };  friend std::ostream& operator<<(std::ostream &, const regexp_cursor::node &);  S q;  // current state  smart_ptr<vector<node> > e;  // node "allocator"  string errmsg;  node* exp(vector<node>::iterator &first, vector<node>::iterator last);  node* form(vector<node>::iterator &first, vector<node>::iterator last);  node* term(vector<node>::iterator &first, vector<node>::iterator last);  node* exp2(vector<node>::iterator &first, vector<node>::iterator last, node *left);  node* term2(vector<node>::iterator &first, vector<node>::iterator last, node *left);  node* repeat(vector<node>::iterator &first, vector<node>::iterator last, node *left);  template <typename OutputIterator>  int tokenize(const char *first, const char *last, OutputIterator x, 	       bool kmp_style = false);public:  int errpos;  typedef regexp_cursor          self;  typedef S                      state_type;  typedef plain                  char_traits;  typedef char_traits::char_type char_type;  typedef empty_tag              tag_type;  regexp_cursor(const char *expression = ".", bool kmp_style = false, bool verbose = false)    : errpos(-1)  {    e->reserve(8);    tokenize(expression, expression + strlen(expression), back_inserter(*e), kmp_style);    if (verbose) {      cout << "tokenized expression:" << endl;      copy(e->begin(), e->end(), ostream_iterator<node>(cout));      cout << endl;    }    vector<node>::iterator first = e->begin();    node *root = exp(first, e->end());    if (root == 0)       errpos = first->pos - expression;    else      q = root->premiere_pos;  }  int error() const {    // possible error position (-1 if expression parsing is ok)    return errpos;  }      const string& error_message() const {    return errmsg;  }  self& operator=(const state_type &s) {    q = s;    return *this;  }    state_type src() const {    return q;  }  bool forward(char letter) {    state_type p;    for(S::const_iterator i = q.begin(); i != q.end(); ++i)      if ((*i)->letter[(unsigned char) letter] == true)	p.insert((*i)->pos_suivante.begin(), (*i)->pos_suivante.end());    q.swap(p);   // optimized equivalent to q = p;    return !q.empty();  }      bool src_final() const {    for(S::const_iterator i = q.begin(); i != q.end(); ++i)      if ((*i)->final()) return true;    return false;  }       bool sink() const {    return q.empty();  }   tag_type src_tag() const {    return tag_type();  }};std::ostream& operator<<(std::ostream &out, const regexp_cursor::node &x) {  switch(x.type) {  case regexp_cursor::LETTER :    out << "[";    if (x.letter.count() == 256)   // Sigma      out << "Sigma";    else      for(int i = 32; i < 256; ++i)	if (x.letter[i] == true) out << (char) i;    out << "]";    break;  case regexp_cursor::OR :     out << "|";     break;  case regexp_cursor::FINAL :     out << "[EoE]";    break;  case regexp_cursor::OPEN :     out << "(";     break;  case regexp_cursor::CLOSE :     out << ")";     break;  case regexp_cursor::CONCAT :     out << " ";     break;  case regexp_cursor::STAR :     out << "*";     break;  case regexp_cursor::QUESTION :     out << "?";     break;  case regexp_cursor::PLUS :     out << "+";     break;  case regexp_cursor::REPEAT :     out << "{" << x.n << "," << x.m << "}";    break;  default :    break;  }   return out;}// Return error position or -1 if oktemplate <typename OutputIterator>int regexp_cursor::tokenize(const char *first, const char *last, 			    OutputIterator x, bool kmp_style){  *x++ = node(OPEN);  if (kmp_style) {  // append Sigma*    *x++ = node(ANY);    *x++ = node(STAR);    *x++ = node(CONCAT);   }  for(const char *begin = first; first != last; ++first, ++x)    switch (*first) {    case '|' :       *x = node(OR, first);       break;    case '(' :       if (first != begin && first[-1] != '(' && first[-1] != '|') 	*x++ = node(CONCAT, first);      *x = node(OPEN, first);       break;    case ')' :       *x = node(CLOSE, first);      break;    case '*' :      *x = node(STAR, first);      break;    case '?' :      *x = node(QUESTION, first);      break;    case '+' :      *x = node(PLUS, first);      break;      /* not yet implemented     case '{' :      if (++first == last) return first - begin;      {	int n = 0;	const char *tmp = first;	while (isdigit(*first)) {	  n = n * 10 + *first - '0';	  if (++first == last) return first - begin;	}	if (tmp == first) return first - begin;	int m = n;	if (*first == ',') {	  if (++first == last) return first - begin;	  tmp = first;	  for(m = 0; isdigit(*first);) {	    m = m * 10 + *first - '0';	    if (++first == last) return first - begin;	  }	  if (tmp == first) return first - begin;	}	if (*first != '}') return first - begin;	if (n > m) return first - begin;	*x = node(REPEAT, first, n, m);      }      break;      */    case '[' :   // characters set      {	if (first != begin && first[-1] != '|' && first[-1] != '(') 	  *x++ = node(CONCAT, first);	bitset<256> a;	bool v = true;	if (++first == last) return first - begin;	if (*first == '^') {	  a.set();	  v = false;	  if (++first == last) return first - begin;	}	while (first != last && *first != ']') {	  char b = *first;	  if (++first == last) break;	  if (*first != '-') {	    a.set((unsigned char) b, v);	    continue;	  }	  if (++first == last || *first == ']' || *first < b) return first - begin;	  for(unsigned int i = (unsigned char) b; i <= (unsigned char) *first; ++i)	    a.set(i, v);	  ++first;	}	if (first == last) return first - begin; // first is supposed to point to ']'	*x++ = node(a);	break;      }	        // Escaped character:    case '\\' :       if (first != begin && first[-1] != '|' && first[-1] != '(') 	*x++ = node(CONCAT, first);      if (++first == last) return first - begin;      switch (*first) {      case 'n' : *x = node('\n', first); break;      case 't' : *x = node('\t', first); break;      case 'r' : *x = node('\r', first); break;      default : *x = node(*first, first);       }      break;          default :      if (first != begin && first[-1] != '|' && first[-1] != '(') 	*x++ = node(CONCAT, first);      if (*first == '.') *x = node(ANY, first);      else	*x = node(*first, first);       break;    }  *x++ = node(CLOSE);  *x++ = node(CONCAT);  *x++ = node(FINAL);  return 0;}// exp -> term exp2    regexp_cursor::node*regexp_cursor::exp(vector<regexp_cursor::node>::iterator &first, 		   vector<regexp_cursor::node>::iterator last){  if (first == last) {    errmsg = "empty expression";    return 0;  }  node *t = term(first, last);  if (t == 0) return 0;  return exp2(first, last, t);}// exp2 -> + term exp2 | epsilonregexp_cursor::node*regexp_cursor::exp2(vector<regexp_cursor::node>::iterator &first, 		    vector<regexp_cursor::node>::iterator last, node *left){  if (first != last && first->type == OR) {    node &root = *first;    node *right = term(++first, last);    if (right == 0) return 0;    root.annulable = right->annulable || left->annulable;    // premiere_pos(here) = premiere_pos(left) U premiere_pos(right):    root.premiere_pos.insert(left->premiere_pos.begin(), left->premiere_pos.end());    root.premiere_pos.insert(right->premiere_pos.begin(), right->premiere_pos.end());    root.derniere_pos.insert(left->derniere_pos.begin(), left->derniere_pos.end());    root.derniere_pos.insert(right->derniere_pos.begin(), right->derniere_pos.end());    return exp2(first, last, &root);  }  return left;}// term -> form repeat term2regexp_cursor::node*regexp_cursor::term(vector<regexp_cursor::node>::iterator &first, 		    vector<regexp_cursor::node>::iterator last){  if (first == last) return 0;  node *f = form(first, last);  if (f == 0) return 0;  node *s = repeat(first, last, f);  // return f if no repeat token is found  return term2(first, last, s);}// repeat -> * | ? | + | {n,m} | epsilonregexp_cursor::node*regexp_cursor::repeat(vector<regexp_cursor::node>::iterator &first, 		      vector<regexp_cursor::node>::iterator last, node *left){  if (first != last)    switch (first->type) {    case STAR :      {	node &root = *first;	root.annulable = true;	root.premiere_pos = left->premiere_pos;	root.derniere_pos = left->derniere_pos;		// pos_suivante(derniere_pos(here)) += premiere_pos(here):	for(S::const_iterator i = root.derniere_pos.begin(); i != root.derniere_pos.end(); ++i)	  (*i)->pos_suivante.insert(root.premiere_pos.begin(), root.premiere_pos.end());	++first;	return &root;      }    case QUESTION :       {	node &root = *first;	root.annulable = true;	root.premiere_pos = left->premiere_pos;	root.derniere_pos = left->derniere_pos;	++first;	return &root;      }    case PLUS :      {	node &root = *first;	root.annulable = false;	root.premiere_pos = left->premiere_pos;	root.derniere_pos = left->derniere_pos;		// pos_suivante(derniere_pos(here)) += premiere_pos(here):	for(S::const_iterator i = root.derniere_pos.begin(); i != root.derniere_pos.end(); ++i)	  (*i)->pos_suivante.insert(root.premiere_pos.begin(), root.premiere_pos.end());	++first;	return &root;      }    default :       return left;    }  return left;}    // term2 -> . form repeat term2 | epsilonregexp_cursor::node*regexp_cursor::term2(vector<regexp_cursor::node>::iterator &first, 		     vector<regexp_cursor::node>::iterator last, node *left){  if (first != last && first->type == CONCAT) {    node &root = *first;    node *f = form(++first, last);    if (f == 0) return 0;    node *right = repeat(first, last, f);    root.annulable = left->annulable && right->annulable;    if (left->annulable) {      root.premiere_pos.insert(left->premiere_pos.begin(), left->premiere_pos.end());      root.premiere_pos.insert(right->premiere_pos.begin(), right->premiere_pos.end());    }    else      root.premiere_pos = left->premiere_pos;        if (right->annulable) {      root.derniere_pos.insert(right->derniere_pos.begin(), right->derniere_pos.end());      root.derniere_pos.insert(left->derniere_pos.begin(), left->derniere_pos.end());    }    else      root.derniere_pos = right->derniere_pos;    // pos_suivante(derniere_pos(left)) += premiere_pos(right):    for(S::const_iterator i = left->derniere_pos.begin(); i != left->derniere_pos.end(); ++i)      (*i)->pos_suivante.insert(right->premiere_pos.begin(), right->premiere_pos.end());    return term2(first, last, &root);  }  return left;}// form -> ( exp ) | letterregexp_cursor::node*regexp_cursor::form(vector<regexp_cursor::node>::iterator &first, 		    vector<regexp_cursor::node>::iterator last){  if (first == last) return 0;  if (first->type == OPEN) {    node *e = exp(++first, last);    if (e == 0) return 0;    if (first->type == CLOSE) ++first;    else {      errmsg = "unbalanced parenthesis";      return 0;    }    return e;  }  first->annulable = false;  first->premiere_pos.insert(&*first);  first->derniere_pos.insert(&*first);  ++first;  return &(first[-1]);}inlineregexp_cursor regexpc(const string &exp, bool kmp_style) {  return regexp_cursor(exp.c_str(), kmp_style);}inlineregexp_cursor regexpc(const string &exp) {  return regexp_cursor(exp.c_str());}inlineregexp_cursor regexpc() {  return regexp_cursor();}ASTL_END_NAMESPACE#endif 

⌨️ 快捷键说明

复制代码 Ctrl + C
搜索代码 Ctrl + F
全屏模式 F11
切换主题 Ctrl + Shift + D
显示快捷键 ?
增大字号 Ctrl + =
减小字号 Ctrl + -