📄 ag_regex.h
字号:
/*! \file AG_RegEx.h
\brief Regular Expression Parser
\author Amer Gerzic
Use it as following: \n
\n
string strPattern; \n
int nPos = 0; \n
\n
if(SetRegEx(".....")) \n
{ \n
if(FindFirst(...)) \n
{ \n
// Do something with result \n
while(FindNext(...)) \n
{ \n
// Process Result \n
} \n
} \n
} \n
*/
#pragma once
// for debug STL exceeds 255 chars and MS compiler complains, so disable it
#pragma warning(disable:4786)
#pragma warning(disable:4503)
#include <string>
#include <stack>
#include <map>
#include <deque>
#include <vector>
#include <set>
#include <algorithm>
#include <list>
using namespace std;
class CAG_RegEx
{
protected:
//! State Class
class CAG_State
{
protected:
//! Transitions from this state to other
multimap<char, CAG_State*> m_Transition;
//! State ID
int m_nStateID;
//! Set of NFA state from which this state is constructed
set<CAG_State*> m_NFAStates;
public:
//! Default constructor
CAG_State() : m_nStateID(-1), m_bAcceptingState(FALSE) {};
//! parameterized constructor
CAG_State(int nID) : m_nStateID(nID), m_bAcceptingState(FALSE) {};
//! Constructs new state from the set of other states
/*! This is necessary for subset construction algorithm
because there a new DFA state is constructed from
one or more NFA states
*/
CAG_State(set<CAG_State*> NFAState, int nID)
{
m_NFAStates = NFAState;
m_nStateID = nID;
// DFA state is accepting state if it is constructed from
// an accepting NFA state
m_bAcceptingState = FALSE;
set<CAG_State*>::iterator iter;
for(iter=NFAState.begin(); iter!=NFAState.end(); ++iter)
if((*iter)->m_bAcceptingState)
m_bAcceptingState = TRUE;
};
//! Copy Constructor
CAG_State(const CAG_State &other)
{ *this = other; };
//! Destructor
virtual ~CAG_State() {};
//! True if this state is accepting state
BOOL m_bAcceptingState;
//! Adds a transition from this state to the other
void AddTransition(char chInput, CAG_State *pState)
{
ASSERT(pState != NULL);
m_Transition.insert(make_pair(chInput, pState));
};
//! Removes all transition that go from this state to pState
void RemoveTransition(CAG_State* pState)
{
multimap<char, CAG_State*>::iterator iter;
for(iter=m_Transition.begin(); iter!=m_Transition.end();)
{
CAG_State *toState = iter->second;
if(toState == pState)
m_Transition.erase(iter++);
else ++iter;
}
};
//! Returns all transitions from this state on specific input
void GetTransition(char chInput, vector<CAG_State*> &States)
{
// clear the array first
States.clear();
// Iterate through all values with the key chInput
multimap<char, CAG_State*>::iterator iter;
for(iter = m_Transition.lower_bound(chInput);
iter!= m_Transition.upper_bound(chInput);
++iter)
{
CAG_State *pState = iter->second;
ASSERT(pState != NULL);
States.push_back(pState);
}
};
//! Returns the state id in form of string
CString GetStateID()
{
CString strRes;
if(m_bAcceptingState)
strRes.Format(_T("{%d}"), m_nStateID);
else strRes.Format(_T("%d"), m_nStateID);
return strRes;
};
/*! Returns the set of NFA states from
which this DFA state was constructed
*/
set<CAG_State*>& GetNFAState()
{ return m_NFAStates; };
//! Returns true if this state is dead end
/*! By dead end I mean that this state is not
accepting state and there are no transitions
leading away from this state. This function
is used for reducing the DFA.
*/
BOOL IsDeadEnd()
{
if(m_bAcceptingState)
return FALSE;
if(m_Transition.empty())
return TRUE;
multimap<char, CAG_State*>::iterator iter;
for(iter=m_Transition.begin(); iter!=m_Transition.end(); ++iter)
{
CAG_State *toState = iter->second;
if(toState != this)
return FALSE;
}
TRACE("State %d is dead end.\n", m_nStateID);
return TRUE;
};
//! Override the assignment operator
CAG_State& operator=(const CAG_State& other)
{
m_Transition = other.m_Transition;
m_nStateID = other.m_nStateID;
m_NFAStates = other.m_NFAStates;
};
//! Override the comparison operator
BOOL operator==(const CAG_State& other)
{
if(m_NFAStates.empty())
return(m_nStateID == other.m_nStateID);
else return(m_NFAStates == other.m_NFAStates);
};
};
//! Structure to track the pattern recognition
class CAG_PatternState
{
public:
//! Default Constructor
CAG_PatternState() : m_pState(NULL), m_nStartIndex(-1) {};
//! Copy Constructor
CAG_PatternState(const CAG_PatternState &other)
{ *this = other; };
//! Destructor
virtual ~CAG_PatternState() {};
//! Pointer to current state in DFA
CAG_State* m_pState;
//! 0-Based index of the starting position
int m_nStartIndex;
//! Override the assignment operator
CAG_PatternState& operator=(const CAG_PatternState& other)
{
ASSERT(other.m_pState != NULL);
m_pState = other.m_pState;
m_nStartIndex = other.m_nStartIndex;
return *this;
};
};
//! NFA Table
typedef deque<CAG_State*> FSA_TABLE;
/*! NFA Table is stored in a deque of CAG_States.
Each CAG_State object has a multimap of
transitions where the key is the input
character and values are the references to
states to which it transfers.
*/
FSA_TABLE m_NFATable;
//! DFA table is stores in same format as NFA table
FSA_TABLE m_DFATable;
//! Operand Stack
/*! If we use the Thompson's Algorithm then we realize
that each operand is a NFA automata on its own!
*/
stack<FSA_TABLE> m_OperandStack;
//! Operator Stack
/*! Operators are simple characters like "*" & "|" */
stack<char> m_OperatorStack;
//! Keeps track of state IDs
int m_nNextStateID;
//! Set of input characters
set<char> m_InputSet;
//! List of current partially found patterns
list<CAG_PatternState*> m_PatternList;
//! Contains the current patterns accessed
int m_nPatternIndex;
//! Contains the text to check
string m_strText;
//! Contains all found patterns
vector<string> m_vecPattern;
//! Contains all position where these pattern start in the text
vector<int> m_vecPos;
//! Constructs basic Thompson Construction Element
/*! Constructs basic NFA for single character and
pushes it onto the stack.
*/
void Push(char chInput);
//! Pops an element from the operand stack
/*! The return value is TRUE if an element
was poped successfully, otherwise it is
FALSE (syntax error)
*/
BOOL Pop(FSA_TABLE &NFATable);
//! Checks is a specific character and operator
BOOL IsOperator(char ch) { return((ch == 42) || (ch == 124) || (ch == 40) || (ch == 41) || (ch == 8)); };
//! Returns operator presedence
/*! Returns TRUE if presedence of opLeft <= opRight.
Kleens Closure - highest
Concatenation - middle
Union - lowest
*/
BOOL Presedence(char opLeft, char opRight)
{
if(opLeft == opRight)
return TRUE;
if(opLeft == '*')
return FALSE;
if(opRight == '*')
return TRUE;
if(opLeft == 8)
return FALSE;
if(opRight == 8)
return TRUE;
if(opLeft == '|')
return FALSE;
return TRUE;
};
//! Checks if the specific character is input character
BOOL IsInput(char ch) { return(!IsOperator(ch)); };
//! Checks is a character left parantheses
BOOL IsLeftParanthesis(char ch) { return(ch == 40); };
//! Checks is a character right parantheses
BOOL IsRightParanthesis(char ch) { return(ch == 41); };
//! Evaluates the next operator from the operator stack
BOOL Eval();
//! Evaluates the concatenation operator
/*! This function pops two operands from the stack
and evaluates the concatenation on them, pushing
the result back on the stack.
*/
BOOL Concat();
//! Evaluates the Kleen's closure - star operator
/*! Pops one operator from the stack and evaluates
the star operator on it. It pushes the result
on the operand stack again.
*/
BOOL Star();
//! Evaluates the union operator
/*! Pops 2 operands from the stack and evaluates
the union operator pushing the result on the
operand stack.
*/
BOOL Union();
//! Inserts char 8 where the concatenation needs to occur
/*! The method used to parse regular expression here is
similar to method of evaluating the arithmetic expressions.
A difference here is that each arithmetic expression is
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -