📄 rexalgorithm.cpp
字号:
#pragma warning(disable:4786)
#pragma warning(disable:4503)
#include "stdafx.h"
#include "RexAlgorithm.h"
#include <bitset>
#include <algorithm>
using namespace std;
typedef bitset<256> REXA_CharSet;
typedef int REXA_StateId;
template<class TSet>
struct REXA_LessSet{
bool operator()(const TSet& rTSet1, const TSet& rTSet2)const
{
if( rTSet1==rTSet2 ){
return false;
}
for(int i=0;i<rTSet1.size();i++){
if( rTSet1[i]!=rTSet2[i] )
return rTSet1[i]<rTSet2[i];
}
return false;
}
};
typedef REXA_LessSet<REXA_CharSet> REXA_LessCharSet;
typedef set<REXA_CharSet,REXA_LessCharSet> REXA_CharSetSet;
// State and Translationtable
struct REXA_NDFAState;
struct REXA_StateSet : public set<const struct REXA_NDFAState*>
{
bool IsAccepting ()const;
};
typedef vector<REXA_StateSet> TransitionVector;
struct REXA_NDFATransition{
REXA_NDFATransition(const REXA_CharSet& m_rTransChars,REXA_NDFAState* pStateNext)
:m_transChars(m_rTransChars),m_nextState(pStateNext)
{
}
REXA_NDFATransition(const char* pcszChars=0,REXA_NDFAState* pNextState=0)
{
if(!pcszChars || strlen(pcszChars)==0 ){
m_bIsMute=true;
return;
}
const unsigned char* pucszChars= (const unsigned char*)pcszChars;
m_nextState= pNextState;
while( *pucszChars ){
m_transChars[*pucszChars++]= true;
}
}
REXA_CharSet m_transChars;
REXA_NDFAState* m_nextState;
bool m_bIsMute;
};
struct REXA_NDFAState{
REXA_NDFAState(int id=0,bool isAccepting=false)
:m_id(id),
m_nAnswer(REXA_DFAState::eNoAnswer),
m_isAccepting(isAccepting)
{
}
static void CollectStates (const REXA_NDFAState* pState,
set<const REXA_NDFAState*>& rsetStates);
static void CollectStates (REXA_NDFAState* pState,
set<REXA_NDFAState*>& rsetStates);
static REXA_NDFAState* CopyStates (REXA_NDFAState* pState,
map<REXA_NDFAState*,REXA_NDFAState*>& rMapStates);
REXA_NDFAState* Clone ();
void DeleteAll ();
int m_id;
int m_nAnswer;
bool m_isAccepting;
vector<REXA_NDFATransition> m_transitions;
};
struct REXA_TranslatingTable{
typedef map< REXA_StateSet,TransitionVector > TranslationMap;
typedef map<const REXA_NDFAState*,int> MapStateOccurrenceCount;
REXA_TranslatingTable (const REXA_NDFAState* pStartState);
void ComputeOccCounts();
int GetAnswerCode (const REXA_StateSet& rSet)const;
vector<REXA_CharSet> m_vecCharsets;
const REXA_NDFAState* m_pStartState;
TranslationMap m_mapTransitions;
MapStateOccurrenceCount m_mapStateOccurrenceCount;
private:
vector<REXA_CharSet>
CharsetToVector (const REXA_CharSetSet& rCharSetSet);
set<REXA_CharSet,REXA_LessCharSet>
ComputeDisjointCharSets (REXA_CharSetSet& rCharSets);
void GenerateTranslationTable ( set<const REXA_NDFAState*>& rSetState);
void CollectCharSets ( const REXA_NDFAState* pCurState,
set<int>& rVisitedStates,REXA_CharSetSet& rCharSets);
};
class REXA_NDFAToTable{
public:
void NDFA_To_Table (REXA_NDFAState* pStartState,
REXA_TranslatingTable& rTable);
static vector<REXA_CharSet>
CharsetToVector (const REXA_CharSetSet& rCharSetSet);
set<REXA_CharSet,REXA_LessCharSet>
ComputeDisjointCharSets (REXA_CharSetSet& rCharSets);
static void GenerateTranslationTable ( map<int,REXA_NDFAState*>& rMapIdState,
REXA_TranslatingTable& rTable);
static void CollectCharSets ( REXA_NDFAState* pCurState,
set<int>& rVisitedStates,
REXA_CharSetSet& rCharSets);
};
class REXA_NDFAToDfa{
public:
typedef map<REXA_NDFAState*,int> MapStateOccurrenceCount;
REXA_NDFAToDfa (const REXA_TranslatingTable& rTable);
REXA_DFAState* TableToDfaStates();
private:
REXA_DFAState* CreateDfaStates(
const REXA_StateSet& rStateSet,
map<REXA_StateSet,REXA_DFAState*>& rMapStateSetDFAState);
const REXA_TranslatingTable& m_rTable;
};
class REXA_Scanner{
friend class REXA_ParserImpl;
friend class REXA_Exception;
public:
//constructor,desctructor,typedefs
REXA_Scanner(const char* pcszToParse="") {SetSource(pcszToParse);}
private:
void SetSource(const char* pcszSrc){
m_pcszToParse= pcszSrc;m_pTokBeg= pcszSrc;
m_pTokEnd= pcszSrc;m_eLastSymbol= eEof;
}
//operations
REXA_ESymbols operator() ();
int GetLastSymbol (){return m_eLastSymbol;}
static string GetSymbolString (REXA_ESymbols eRegexpSymbol);
void AdvanceAndMatch (REXA_ESymbols eSymbol);
void Match (REXA_ESymbols eSymbol);
REXA_BegEnd GetToken (){return make_pair(m_pTokBeg,m_pTokEnd);}
const char* GetSource (){return m_pcszToParse;}
REXA_CharSet GetCharSet (){return m_charSet;}
int GetChar (const char*& rpFrom,bool& rbIsEscape);
void AddRange ( unsigned char cFirst,
unsigned char cLast,
REXA_CharSet& rCharSet);
const char* SkipWhiteSpace ();
const char* m_pcszToParse;
const char* m_pTokBeg;
const char* m_pTokEnd;
REXA_CharSet m_charSet;
string m_strLiteral;
REXA_ESymbols m_eLastSymbol;
};
class REXA_ParserImpl{
public:
REXA_ParserImpl ()
:m_nStateIdMax(0)
{
}
REXA_ParserImpl (const REXA_ParserImpl& parser);
REXA_ParserImpl& operator= (const REXA_ParserImpl& parser);
~REXA_ParserImpl();
const REXA_NDFAState* ParseRegExp (string strRegExp);
bool ParseRegExpDefinition (string strRegExpDefiniton,int nAnswer);
map<string,REXA_NDFAState*> m_mapNamedStates;
private:
// grammar related functions
void NamedRegExp (int nAnswer);
REXA_NDFAState* RegExp ();
REXA_NDFAState* Term ();
REXA_NDFAState* Factor ();
REXA_NDFAState* Atom ();
REXA_NDFAState* Name (REXA_BegEnd nameToFind);
REXA_NDFAState* ParseCharSet (REXA_CharSet& rCharSet);
//NDFA-Automtaton constructors
REXA_NDFAState* Concat (REXA_NDFAState* pState0,REXA_NDFAState* pState1);
REXA_NDFAState* Union (REXA_NDFAState* pState0,REXA_NDFAState* pState1);
REXA_NDFAState* MakeStar (REXA_NDFAState* pState);
REXA_NDFAState* MakePlus (REXA_NDFAState* pState);
REXA_NDFAState* MakeOptional (REXA_NDFAState* pState);
REXA_NDFAState* MakeIgnoreCase (REXA_NDFAState* pState);
//helper functions for grammar related functions
void InsertNamedState( REXA_BegEnd name,
REXA_NDFAState* pState,int nAnwer);
vector<pair<REXA_NDFAState*,REXA_NDFATransition> >
CollectAcceptingTransitions(REXA_NDFAState* pState);
vector<REXA_NDFAState*> CollectAcceptingStates(REXA_NDFAState* pState);
void CollectStates(
REXA_NDFAState* pState,
set<REXA_NDFAState*>& rsetStates);
REXA_NDFAState* CloneNDFA(REXA_NDFAState* pToClone);
REXA_NDFAState* GetNewNDFAState(int id=0,bool isAccepting=false);
int NextStateId ();
REXA_Scanner m_scanner;
static const int m_eAtomSet;
int m_nStateIdMax;
vector<REXA_NDFAState*> m_vecAllStates;
};
const int REXA_ParserImpl::m_eAtomSet= (eCharSet|eIdent|eLParen);
static struct SymbolString{
REXA_ESymbols eSym;
const char* strSym;
}g_symbolStrings[]={
{ eEof, "end of string" },
{ eIllegal, "illegal character" },
{ eStar, "*" },
{ ePlus, "+" },
{ eOpt, "?" },
{ eAlternative,"|" },
{ eCharSet, "[<characters>]" },
{ eLParen, "(" },
{ eRParen, ")" }
};
//-------------------------------------------------------------------------------
REXA_Exception::REXA_Exception(
const char* pPos,
const char* pRegExp,
string strMsg)
//-------------------------------------------------------------------------------
{
m_strErrMsg= strMsg;
m_nErrPos= pPos-pRegExp;
}
//-------------------------------------------------------------------------------
REXA_Exception::REXA_Exception(
REXA_ESymbols eExpectedSymbol,
REXA_BegEnd token,
const char* pRegExp,
string strMsg)
//-------------------------------------------------------------------------------
{
m_strErrMsg= REXA_Scanner::GetSymbolString(eExpectedSymbol)
+ " expected\r\n ";
m_nErrPos= token.first - pRegExp;
}
//-------------------------------------------------------------------------------
string REXA_Scanner::GetSymbolString (REXA_ESymbols eRegexpSymbol)
//-------------------------------------------------------------------------------
{
for(int i=0;i<sizeof(g_symbolStrings)/sizeof(SymbolString);i++){
if( g_symbolStrings[i].eSym==eRegexpSymbol ){
return g_symbolStrings[i].strSym;
}
}
return "";
}
//-------------------------------------------------------------------------------
const char* REXA_Scanner::SkipWhiteSpace ()
//-------------------------------------------------------------------------------
{
const char* pCur= m_pTokEnd;
while(isspace(*pCur))
pCur++;
return pCur;
}
//------------------------------------------------------------------------------
void REXA_Scanner::Match (REXA_ESymbols eSymbol)
//------------------------------------------------------------------------------
{
if( eSymbol!=m_eLastSymbol ){
throw REXA_Exception(eSymbol,GetToken(),m_pcszToParse);
}
}
//------------------------------------------------------------------------------
void REXA_Scanner::AdvanceAndMatch (REXA_ESymbols eSymbol)
//------------------------------------------------------------------------------
{
if( eSymbol!=operator()() ){
throw REXA_Exception(eSymbol,GetToken(),m_pcszToParse);
}
}
//------------------------------------------------------------------------------
int REXA_Scanner::GetChar(const char*& rpFrom,bool& rbIsEscape)
//------------------------------------------------------------------------------
{
rbIsEscape= false;
if( *rpFrom=='\0' ) return -1;
char c;
if( *rpFrom=='\\' ){
rbIsEscape= true;
rpFrom++;
switch(*rpFrom){
case '0': rpFrom++; return '\0';
case 'n': rpFrom++; return '\n';
case 't': rpFrom++; return '\t';
case 'v': rpFrom++; return '\v';
case 'a': rpFrom++; return '\a';
case 'b': rpFrom++; return '\b';
case 'r': rpFrom++; return '\r';
case 'x': c= *rpFrom++;
if( isxdigit(rpFrom[0]) && isxdigit(rpFrom[1]) ){
unsigned hexChar=0;
for(int i=0;i<2;i++){
hexChar*=16;
hexChar+= isdigit(rpFrom[i])?(rpFrom[i]-'0'):(toupper(rpFrom[i])-'A'+16);
}
c= (char)hexChar;
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -