⭐ 欢迎来到虫虫下载站! | 📦 资源下载 📁 资源专辑 ℹ️ 关于我们
⭐ 虫虫下载站

📄 aho_corasick.h

📁 一个类似STL的自动机的源代码库
💻 H
字号:
/*
 * ASTL - the Automaton Standard Template Library.
 * C++ generic components for Finite State Automata handling.
 * Copyright (C) 2000-2003 Vincent Le Maout (vincent.lemaout@chello.fr).
 * 
 * This library is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 * 
 * This library is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * Lesser General Public License for more details.
 * 
 * You should have received a copy of the GNU Lesser General Public
 * License along with this library; if not, write to the Free Software
 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
 *
 */


#ifndef ASTL_AHO_CORASICK_H
#define ASTL_AHO_CORASICK_H

#include <astl.h> 
#include <cursor.h>
#include <dfa_matrix.h>
#include <tools.h>
#include <tag.h>
#include <astl_tree.h>
//#include <queue>
//#include <set>
//#include <dfa_compact.h>
#include <dot.h>

using namespace std;

ASTL_BEGIN_NAMESPACE

template <typename StateType>
class aho_corasick_tag
{
protected:
  size_t length_;
  StateType subst;

public:
  typedef aho_corasick_tag self;

  aho_corasick_tag(size_t len = 0)
    : length_(len)
  { }

  size_t length() const {
    return length_;
  }

  self& length(size_t i) {
    length_ = i;
    return *this;
  }

  StateType substitute() const {
    return subst;
  }

  self& substitute(const StateType &q) {
    subst = q;
    return *this;
  }
};
  
// A cursor adapter used by aho&corasick search algorithm
template <typename CharTraits>
class aho_corasick_cursor 
  : public cursor<DFA_matrix<CharTraits, 
			     aho_corasick_tag<typename DFA_matrix<CharTraits, 
								  empty_tag>::state_type> > >
{
protected:
  typedef DFA_matrix<CharTraits, 
		     aho_corasick_tag<typename DFA_matrix<CharTraits, 
							  empty_tag>::state_type> > DFA;
  smart_ptr<DFA> p;

public:
  typedef cursor<DFA>                 super;
  typedef aho_corasick_cursor         self;
  typedef typename super::char_type   char_type;
  typedef typename super::state_type  state_type;
  typedef typename super::tag_type    tag_type;
  typedef typename super::char_traits char_traits;
  
  aho_corasick_cursor() { 
    dfa = p;
  }

  // built from a list of word (a sequence of containers)
  ///  template <typename InputIterator>
  aho_corasick_cursor(istream &in)   /// aho_corasick_cursor(InputIterator begin, InputIterator end) 
  {
    string s;
    while (!in.eof()) { ///for(; begin != end; ++begin) {
      ///      typename iterator_traits<InputIterator>::value_type::const_iterator 
      ///	start = begin->begin(), finish = end->end();
      in >> s;
      if (in.eof()) break;
      cerr << "adding '" << s << "'" << endl;
      tree_build(*p, s.begin(), s.end(), aho_corasick_tag<typename DFA_matrix<CharTraits,
		 ///      tree_build(*p, start, finish, aho_corasick_tag<typename DFA_matrix<CharTraits, 
		 empty_tag>::state_type>(s.size()));		
      ///							  empty_tag>::state_type>(distance(start, finish)));
      cerr << "added '" << s << "'" << endl;
    }
        dot(cout, dfirstc(*p));
    typename DFA::const_iterator start, finish;
    for(start = p->begin(), finish = p->end(); start != finish; ++start)
      if (*start != p->initial())
	p->tag(*start).substitute(p->initial());
      else
	p->tag(*start).substitute(p->null_state);
    
    cerr << "initiated" << endl;
    bfirst_cursor<queue_cursor<forward_cursor<DFA> > > first = bfirstc(*dfa), last;
    cursor<DFA> c(*dfa);

    while(first != last)
      do {
	// find the first substitute having a transition labelled with
	// current letter
	c = first.aim();
	for(c = c.src_tag().substitute(); 
	    c.src() != p->initial() && !c.exists(first.letter()); 
	    c = c.src_tag().substitute());
	if (c.forward(first.letter())) {
	  p->tag(first.aim()).substitute(c.src());
	  if (c.src_final()) {
	    p->final(first.aim()) = true;
	    if (c.src_tag().length() > first.aim_tag().length())
	      p->tag(first.aim()).length(c.src_tag().length());  
	  }
	}
	else p->tag(first.aim()).substitute(p->initial());
	
      } while (first.next_transition()); 
    cerr << "ok" << endl;

    dot(cout, dfirstc(*dfa));
#if 0
#endif

  }
  
  bool forward(const char_type& a) 
  {
    typename DFA::state_type s = p->delta1(q, a);
    if (s != DFA::null_state) {
      q = s;
      return true;
    }
    if (p->tag(q).substitute() != DFA::null_state) {
      *this = p->tag(q).substitute();
      return forward(a);
    }
    return true;
  }

  self& operator=(const state_type &p) {
    super::operator=(p);
    return *this;
  }
};

 #if 0
template <typename InputIterator>
inline
aho_corasick_cursor<>
aho_corasickc()
{
  return aho_corasick_cursor<cursor<DFA> >(cursor<DFA>(A, A.initial()), d);
}


template <typename DFA>
void aho_corasick(DFA &A, const typename DFA::char_type &def)
{
  typename DFA::const_iterator start, finish;
  for(start = A.begin(), finish = A.end(); start != finish; ++start)
    if (*start != A.initial())
      set_trans(*start, default_letter, A.initial());

  bfirst_cursor<queue_cursor<forward_cursor<DFA> > > first = bfirstc(A), last;
  cursor<DFA> c(A);
  while(first != last)
    do {
      if (first.letter() != def) {
	// find the first substitute having a transition labelled with
	// current letter
	c = first.aim();
	for(c.forward(def); c != A.initial() && !c.exists(first.letter()); c.forward(def));
	if (c.forward(first.letter())) {
	  A.change_trans(first.aim(), def, c.src());
	  if (c.src_final())
	    A.final(first.aim()) = true;
	  if (c.src_tag() > first.aim_tag())
	    A.tag(first.aim()) = c.src_tag();  
	}
	else A.change_trans(first.aim(), def, A.initial());
	
    } while (first.next_transition()); 
}
  

template <class DFA>
class DFA_aho_corasick : public DFA_concept
{
private:
  DFA dfa;
  typename DFA::Tag default_contructed_tag;

public:
  typedef typename DFA::Sigma          Sigma;
  typedef typename DFA::Alphabet       Alphabet;
  typedef typename DFA::State          State;
  typedef typename DFA::Tag            Tag;
  typedef typename DFA::Edges          Edges;
  typedef typename DFA::const_iterator const_iterator;
  
  State&   null_state;
  Alphabet default_letter;   

private:
  State current_state;

public:
  DFA_aho_corasick(const Alphabet &def_letter, unsigned long n = 0)
    : dfa(n), default_contructed_tag(),
      null_state(dfa.null_state), default_letter(def_letter)
    { }

  ~DFA_aho_corasick()
    { }

  State new_state() {
    return (dfa.new_state());
  }

  template <class OutputIterator>
  OutputIterator new_state(unsigned long how_many, OutputIterator x) {
    return (dfa.new_state(how_many, x));
  }

  void del_state(State s) {
    dfa.del_state(s);
  }

  void set_trans(State from, const Alphabet &a, State to) {
    dfa.set_trans(from, a, to);
  }

  void del_trans(State s, const Alphabet &a) {
    dfa.del_trans(s, a);
  }

  void change_trans(State s, const Alphabet &a, State new_aim) {
    dfa.change_trans(s, a, new_aim);
  }

  void copy_state(State from, State to) {
    dfa.copy_state(from, to);
  }

  State duplicate_state(State s) {
    return (dfa.duplicate_state(s));
  }

  Tag&           tag(State s) {
    return (dfa.tag(s));
  }

  const Tag& tag(State s) const {
    return (dfa.tag(s));
  }

  State delta1(State s, const Alphabet &a) const {
    State aim;
    return (((aim = dfa.delta1(s, a)) == null_state) ? 
	    dfa.delta1(s, default_letter) : aim);
  }

  // No use of default transitions:
  State delta0(State s, const Alphabet &a) const {
    return (dfa.delta1(s, a));
  }

  Edges delta2(State s) const {
    return (dfa.delta2(s));
  }

  State initial() const {
    return (dfa.initial());
  }
  
  void initial(State s) {
    dfa.initial(s);
  }

  unsigned long  state_count() const {
    return (dfa.state_count());
  }

  unsigned long  trans_count() const {
    return (dfa.trans_count());
  }

  const_iterator begin() const {
    return (dfa.begin());
  }

  const_iterator end() const {
    return (dfa.end());
  }

  // Encapsulating DFA::bool_reference for final() return value
  class bool_reference
  {
    DFA &dfa;
    State s;

  public:
    bool_reference(DFA &_dfa, State q)
      : dfa(_dfa), s(q)
      { }
    
    operator bool() const {
      return (dfa.final(s));
    }

    template <class T>
    bool_reference operator = (T t) {
      dfa.final(s) = t;
      return (*this);
    }

    template <class T>
    bool operator == (T t) const { 
      return (dfa.final(s) == t);
    }
  };
		   
  class const_bool_reference
  {
    const DFA &dfa;
    State s;

  public:
    const_bool_reference(const DFA &_dfa, State q)
      : dfa(_dfa), s(q)
      { }
    
    operator bool() const {
      return (dfa.final(s));
    }

    template <class T>       // ok if T is castable to bool
    bool operator == (T t) const { 
      return (dfa.final(s) == t);
    }
  };

  const_bool_reference final(State s) const {
    return (const_bool_reference(dfa, s));
  }
	
  bool_reference final(State s) {
    return (bool_reference(dfa, s));
  }
	
  void build()
    {
      Edges edges; 
      typename Edges::const_iterator trans, trans_end;
		
      // Initialize subsitutes to null_state for i and to i for others
      const_iterator start, finish;
      for(start = begin(), finish = end(); start != finish; ++start)
	if (*start != initial())
	  set_trans(*start, default_letter, initial());
	else
	  set_trans(*start, default_letter, null_state);
			
      // Queue-in the initial state transitions aims:
      State p_aim, p, q;
      queue<State> path;
      set<State> visited;
      edges = delta2(initial());
      for(trans = edges.begin(), trans_end = edges.end(); trans != trans_end; ++trans)
	if ((*trans).first != default_letter)
	  path.push((*trans).second);
				
      for(; !path.empty();)
	{
	  q = path.front();
	  path.pop();
	  edges = delta2(q);
	  for(trans = edges.begin(), trans_end = edges.end(); trans != trans_end; ++trans)
	    {
	      if ((*trans).first != default_letter)
		{
		  for(p = delta0(q, default_letter);  // p is q's subsitute
		      (p_aim = delta0(p, (*trans).first)) == null_state && p != initial();
		      p = delta0(p, default_letter));

		  if (p_aim != null_state)
		    {
		      change_trans((*trans).second, default_letter, p_aim);
		      if (final(p_aim))
			{
			  final((*trans).second) = true;
			  if (tag(p_aim) > tag((*trans).second))
			    tag((*trans).second) = tag(p_aim);  
			}
		    }
		  else
		    change_trans((*trans).second, default_letter, initial());

		  if (visited.find((*trans).second) == visited.end())   // not already visited ?
		    {
		      path.push((*trans).second);
		      visited.insert((*trans).second);
		    }
		}
	    }
	}
      reset();
    }
	
  State ac_delta1(State p, const Alphabet &a)
    {
      State q, tmp = p;
      while(1)
	{
	  q = delta0(p, a);
	  if (q != null_state)
	    {
	      if (tmp != p)
		set_trans(tmp, a, q);      // lazy optimization
	      return (q);
	    }
	  if (p == initial())
	    {
	      set_trans(tmp, a, p);        // lazy optimization
	      return (p);
	    }
	  p = delta0(p, default_letter);
	}
    }

  template <class InputIterator>
  Tag next_match(InputIterator &start, InputIterator &finish)
    {
      while(start != finish)
	{
	  current_state = ac_delta1(current_state, *start);
	  ++start; // ++start; aho-corasick hacking: read one char out of two
	  if (final(current_state)) return (tag(current_state));
	}
      return (default_contructed_tag);
    }

  void reset(State q) {
    current_state = q;
  }

  void reset() {
    current_state = initial();
  }
};  

template <class DFA>
class DFA_aho_corasick_compact : public DFA_compact<DFA>
{
private:
  typedef DFA_compact<DFA> super;
  State current_state;
  Alphabet default_letter;
  Tag default_constructed;

public:
  DFA_aho_corasick_compact(istream &in, const Alphabet &def_letter)
    : super(in), default_letter(def_letter), default_constructed()
    { }

  DFA_aho_corasick_compact(const Alphabet &def_letter)
    : default_letter(def_letter)
    { }

  // Delta1 const method: no lazy optimization
  State ac_delta1(State p, const Alphabet &a) const
    {
      State q;
      for(q = delta1(p, a); q == null_state && p != initial(); p = delta1(p, default_letter), q = delta1(p, a));
      return (q == null_state ? initial() : q);
    }

  template <class InputIterator>
  Tag next_match(InputIterator &start, InputIterator &finish)
    {
      while(start != finish)
	{
	  current_state = ac_delta1(current_state, *start);
	  ++start; 
	  //// TEMPORAIRE:
	  ////++start;
	  if (final(current_state)) 
	    return (tag(current_state));
	}
      return (default_constructed);
    }

  void reset(State q) {
    current_state = q;
  }

  void reset() {
    current_state = initial();
  }
};  

#endif

ASTL_END_NAMESPACE

#endif // ASTL_AHO_CORASICK_H






⌨️ 快捷键说明

复制代码 Ctrl + C
搜索代码 Ctrl + F
全屏模式 F11
切换主题 Ctrl + Shift + D
显示快捷键 ?
增大字号 Ctrl + =
减小字号 Ctrl + -