📄 aho_corasick.h
字号:
/*
* ASTL - the Automaton Standard Template Library.
* C++ generic components for Finite State Automata handling.
* Copyright (C) 2000-2003 Vincent Le Maout (vincent.lemaout@chello.fr).
*
* This library is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
* This library is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
* License along with this library; if not, write to the Free Software
* Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
*
*/
#ifndef ASTL_AHO_CORASICK_H
#define ASTL_AHO_CORASICK_H
#include <astl.h>
#include <cursor.h>
#include <dfa_matrix.h>
#include <tools.h>
#include <tag.h>
#include <astl_tree.h>
//#include <queue>
//#include <set>
//#include <dfa_compact.h>
#include <dot.h>
using namespace std;
ASTL_BEGIN_NAMESPACE
template <typename StateType>
class aho_corasick_tag
{
protected:
size_t length_;
StateType subst;
public:
typedef aho_corasick_tag self;
aho_corasick_tag(size_t len = 0)
: length_(len)
{ }
size_t length() const {
return length_;
}
self& length(size_t i) {
length_ = i;
return *this;
}
StateType substitute() const {
return subst;
}
self& substitute(const StateType &q) {
subst = q;
return *this;
}
};
// A cursor adapter used by aho&corasick search algorithm
template <typename CharTraits>
class aho_corasick_cursor
: public cursor<DFA_matrix<CharTraits,
aho_corasick_tag<typename DFA_matrix<CharTraits,
empty_tag>::state_type> > >
{
protected:
typedef DFA_matrix<CharTraits,
aho_corasick_tag<typename DFA_matrix<CharTraits,
empty_tag>::state_type> > DFA;
smart_ptr<DFA> p;
public:
typedef cursor<DFA> super;
typedef aho_corasick_cursor self;
typedef typename super::char_type char_type;
typedef typename super::state_type state_type;
typedef typename super::tag_type tag_type;
typedef typename super::char_traits char_traits;
aho_corasick_cursor() {
dfa = p;
}
// built from a list of word (a sequence of containers)
/// template <typename InputIterator>
aho_corasick_cursor(istream &in) /// aho_corasick_cursor(InputIterator begin, InputIterator end)
{
string s;
while (!in.eof()) { ///for(; begin != end; ++begin) {
/// typename iterator_traits<InputIterator>::value_type::const_iterator
/// start = begin->begin(), finish = end->end();
in >> s;
if (in.eof()) break;
cerr << "adding '" << s << "'" << endl;
tree_build(*p, s.begin(), s.end(), aho_corasick_tag<typename DFA_matrix<CharTraits,
/// tree_build(*p, start, finish, aho_corasick_tag<typename DFA_matrix<CharTraits,
empty_tag>::state_type>(s.size()));
/// empty_tag>::state_type>(distance(start, finish)));
cerr << "added '" << s << "'" << endl;
}
dot(cout, dfirstc(*p));
typename DFA::const_iterator start, finish;
for(start = p->begin(), finish = p->end(); start != finish; ++start)
if (*start != p->initial())
p->tag(*start).substitute(p->initial());
else
p->tag(*start).substitute(p->null_state);
cerr << "initiated" << endl;
bfirst_cursor<queue_cursor<forward_cursor<DFA> > > first = bfirstc(*dfa), last;
cursor<DFA> c(*dfa);
while(first != last)
do {
// find the first substitute having a transition labelled with
// current letter
c = first.aim();
for(c = c.src_tag().substitute();
c.src() != p->initial() && !c.exists(first.letter());
c = c.src_tag().substitute());
if (c.forward(first.letter())) {
p->tag(first.aim()).substitute(c.src());
if (c.src_final()) {
p->final(first.aim()) = true;
if (c.src_tag().length() > first.aim_tag().length())
p->tag(first.aim()).length(c.src_tag().length());
}
}
else p->tag(first.aim()).substitute(p->initial());
} while (first.next_transition());
cerr << "ok" << endl;
dot(cout, dfirstc(*dfa));
#if 0
#endif
}
bool forward(const char_type& a)
{
typename DFA::state_type s = p->delta1(q, a);
if (s != DFA::null_state) {
q = s;
return true;
}
if (p->tag(q).substitute() != DFA::null_state) {
*this = p->tag(q).substitute();
return forward(a);
}
return true;
}
self& operator=(const state_type &p) {
super::operator=(p);
return *this;
}
};
#if 0
template <typename InputIterator>
inline
aho_corasick_cursor<>
aho_corasickc()
{
return aho_corasick_cursor<cursor<DFA> >(cursor<DFA>(A, A.initial()), d);
}
template <typename DFA>
void aho_corasick(DFA &A, const typename DFA::char_type &def)
{
typename DFA::const_iterator start, finish;
for(start = A.begin(), finish = A.end(); start != finish; ++start)
if (*start != A.initial())
set_trans(*start, default_letter, A.initial());
bfirst_cursor<queue_cursor<forward_cursor<DFA> > > first = bfirstc(A), last;
cursor<DFA> c(A);
while(first != last)
do {
if (first.letter() != def) {
// find the first substitute having a transition labelled with
// current letter
c = first.aim();
for(c.forward(def); c != A.initial() && !c.exists(first.letter()); c.forward(def));
if (c.forward(first.letter())) {
A.change_trans(first.aim(), def, c.src());
if (c.src_final())
A.final(first.aim()) = true;
if (c.src_tag() > first.aim_tag())
A.tag(first.aim()) = c.src_tag();
}
else A.change_trans(first.aim(), def, A.initial());
} while (first.next_transition());
}
template <class DFA>
class DFA_aho_corasick : public DFA_concept
{
private:
DFA dfa;
typename DFA::Tag default_contructed_tag;
public:
typedef typename DFA::Sigma Sigma;
typedef typename DFA::Alphabet Alphabet;
typedef typename DFA::State State;
typedef typename DFA::Tag Tag;
typedef typename DFA::Edges Edges;
typedef typename DFA::const_iterator const_iterator;
State& null_state;
Alphabet default_letter;
private:
State current_state;
public:
DFA_aho_corasick(const Alphabet &def_letter, unsigned long n = 0)
: dfa(n), default_contructed_tag(),
null_state(dfa.null_state), default_letter(def_letter)
{ }
~DFA_aho_corasick()
{ }
State new_state() {
return (dfa.new_state());
}
template <class OutputIterator>
OutputIterator new_state(unsigned long how_many, OutputIterator x) {
return (dfa.new_state(how_many, x));
}
void del_state(State s) {
dfa.del_state(s);
}
void set_trans(State from, const Alphabet &a, State to) {
dfa.set_trans(from, a, to);
}
void del_trans(State s, const Alphabet &a) {
dfa.del_trans(s, a);
}
void change_trans(State s, const Alphabet &a, State new_aim) {
dfa.change_trans(s, a, new_aim);
}
void copy_state(State from, State to) {
dfa.copy_state(from, to);
}
State duplicate_state(State s) {
return (dfa.duplicate_state(s));
}
Tag& tag(State s) {
return (dfa.tag(s));
}
const Tag& tag(State s) const {
return (dfa.tag(s));
}
State delta1(State s, const Alphabet &a) const {
State aim;
return (((aim = dfa.delta1(s, a)) == null_state) ?
dfa.delta1(s, default_letter) : aim);
}
// No use of default transitions:
State delta0(State s, const Alphabet &a) const {
return (dfa.delta1(s, a));
}
Edges delta2(State s) const {
return (dfa.delta2(s));
}
State initial() const {
return (dfa.initial());
}
void initial(State s) {
dfa.initial(s);
}
unsigned long state_count() const {
return (dfa.state_count());
}
unsigned long trans_count() const {
return (dfa.trans_count());
}
const_iterator begin() const {
return (dfa.begin());
}
const_iterator end() const {
return (dfa.end());
}
// Encapsulating DFA::bool_reference for final() return value
class bool_reference
{
DFA &dfa;
State s;
public:
bool_reference(DFA &_dfa, State q)
: dfa(_dfa), s(q)
{ }
operator bool() const {
return (dfa.final(s));
}
template <class T>
bool_reference operator = (T t) {
dfa.final(s) = t;
return (*this);
}
template <class T>
bool operator == (T t) const {
return (dfa.final(s) == t);
}
};
class const_bool_reference
{
const DFA &dfa;
State s;
public:
const_bool_reference(const DFA &_dfa, State q)
: dfa(_dfa), s(q)
{ }
operator bool() const {
return (dfa.final(s));
}
template <class T> // ok if T is castable to bool
bool operator == (T t) const {
return (dfa.final(s) == t);
}
};
const_bool_reference final(State s) const {
return (const_bool_reference(dfa, s));
}
bool_reference final(State s) {
return (bool_reference(dfa, s));
}
void build()
{
Edges edges;
typename Edges::const_iterator trans, trans_end;
// Initialize subsitutes to null_state for i and to i for others
const_iterator start, finish;
for(start = begin(), finish = end(); start != finish; ++start)
if (*start != initial())
set_trans(*start, default_letter, initial());
else
set_trans(*start, default_letter, null_state);
// Queue-in the initial state transitions aims:
State p_aim, p, q;
queue<State> path;
set<State> visited;
edges = delta2(initial());
for(trans = edges.begin(), trans_end = edges.end(); trans != trans_end; ++trans)
if ((*trans).first != default_letter)
path.push((*trans).second);
for(; !path.empty();)
{
q = path.front();
path.pop();
edges = delta2(q);
for(trans = edges.begin(), trans_end = edges.end(); trans != trans_end; ++trans)
{
if ((*trans).first != default_letter)
{
for(p = delta0(q, default_letter); // p is q's subsitute
(p_aim = delta0(p, (*trans).first)) == null_state && p != initial();
p = delta0(p, default_letter));
if (p_aim != null_state)
{
change_trans((*trans).second, default_letter, p_aim);
if (final(p_aim))
{
final((*trans).second) = true;
if (tag(p_aim) > tag((*trans).second))
tag((*trans).second) = tag(p_aim);
}
}
else
change_trans((*trans).second, default_letter, initial());
if (visited.find((*trans).second) == visited.end()) // not already visited ?
{
path.push((*trans).second);
visited.insert((*trans).second);
}
}
}
}
reset();
}
State ac_delta1(State p, const Alphabet &a)
{
State q, tmp = p;
while(1)
{
q = delta0(p, a);
if (q != null_state)
{
if (tmp != p)
set_trans(tmp, a, q); // lazy optimization
return (q);
}
if (p == initial())
{
set_trans(tmp, a, p); // lazy optimization
return (p);
}
p = delta0(p, default_letter);
}
}
template <class InputIterator>
Tag next_match(InputIterator &start, InputIterator &finish)
{
while(start != finish)
{
current_state = ac_delta1(current_state, *start);
++start; // ++start; aho-corasick hacking: read one char out of two
if (final(current_state)) return (tag(current_state));
}
return (default_contructed_tag);
}
void reset(State q) {
current_state = q;
}
void reset() {
current_state = initial();
}
};
template <class DFA>
class DFA_aho_corasick_compact : public DFA_compact<DFA>
{
private:
typedef DFA_compact<DFA> super;
State current_state;
Alphabet default_letter;
Tag default_constructed;
public:
DFA_aho_corasick_compact(istream &in, const Alphabet &def_letter)
: super(in), default_letter(def_letter), default_constructed()
{ }
DFA_aho_corasick_compact(const Alphabet &def_letter)
: default_letter(def_letter)
{ }
// Delta1 const method: no lazy optimization
State ac_delta1(State p, const Alphabet &a) const
{
State q;
for(q = delta1(p, a); q == null_state && p != initial(); p = delta1(p, default_letter), q = delta1(p, a));
return (q == null_state ? initial() : q);
}
template <class InputIterator>
Tag next_match(InputIterator &start, InputIterator &finish)
{
while(start != finish)
{
current_state = ac_delta1(current_state, *start);
++start;
//// TEMPORAIRE:
////++start;
if (final(current_state))
return (tag(current_state));
}
return (default_constructed);
}
void reset(State q) {
current_state = q;
}
void reset() {
current_state = initial();
}
};
#endif
ASTL_END_NAMESPACE
#endif // ASTL_AHO_CORASICK_H
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -