⭐ 欢迎来到虫虫下载站! | 📦 资源下载 📁 资源专辑 ℹ️ 关于我们
⭐ 虫虫下载站

📄 rexalgorithm.cpp

📁 Compiles a regular expression into a fast automaton 编译正规表达式成机器代码(源码)(11KB)
💻 CPP
📖 第 1 页 / 共 4 页
字号:
{
    if( eSymbol!=operator()() ){
        throw REXA_Exception(eSymbol,GetToken(),m_pcszToParse);
    }
}
//------------------------------------------------------------------------------
char          REXA_Scanner::GetChar(const char*& rpFrom,bool& rbIsEscape)
//------------------------------------------------------------------------------
{
    rbIsEscape= false;
    if( *rpFrom=='\0' ) return '\0';
    char c;
    if( *rpFrom=='\\' ){
        rbIsEscape= true;
        rpFrom++;
        switch(*rpFrom){
        case '0':  return '\0';
        case 'n':  rpFrom++; return '\n';
        case 't':  rpFrom++; return '\t';
        case 'v':  rpFrom++; return '\v';
        case 'a':  rpFrom++; return '\a';
        case 'b':  rpFrom++; return '\b';
        case 'r':  rpFrom++; return '\r';
        case 'x':  c= *rpFrom++; 
            if( isxdigit(rpFrom[0]) && isxdigit(rpFrom[1]) ){
                unsigned hexChar=0;
                for(int i=0;i<2;i++){
                    hexChar*=16;
                    hexChar+= isdigit(rpFrom[i])?(rpFrom[i]-'0'):(toupper(rpFrom[i])-'A'+16);
                }
                c= (char)hexChar;
                rpFrom+= 2;
            }
            return c;
        default:    c= *rpFrom++; return c;
        }
    }
    return *rpFrom++;
}
//------------------------------------------------------------------------------
void    REXA_Scanner::AddRange(
                           unsigned char cFirst, 
                           unsigned char cLast, 
                           REXA_CharSet& rCharSet)
//------------------------------------------------------------------------------
{
    if( cFirst>cLast ){
        swap(cFirst,cLast);
    }
    for(unsigned char c= cFirst; c<cLast;c++){
        rCharSet[c]= true;
    }
    rCharSet[cLast]= true;
}
//------------------------------------------------------------------------------
REXA_ESymbols REXA_Scanner::operator()	()
//------------------------------------------------------------------------------
{
	m_pTokBeg= m_pTokEnd= SkipWhiteSpace();
    switch(*m_pTokEnd){
    case '\0':  return m_eLastSymbol=eEof;
    case '+':   m_pTokEnd++; return m_eLastSymbol=ePlus;
    case '?':   m_pTokEnd++; return m_eLastSymbol=eOpt;
    case '*':   m_pTokEnd++; return m_eLastSymbol=eStar;
    case '(':   m_pTokEnd++; return m_eLastSymbol=eLParen;
    case ')':   m_pTokEnd++; return m_eLastSymbol=eRParen;
    case '=':   m_pTokEnd++; return m_eLastSymbol=eAssign;
	case '|':   m_pTokEnd++; return m_eLastSymbol=eAlternative;
    case '[': {
                m_pTokEnd++;
                REXA_CharSet cset;
                char c;
                bool bIsEscape;
				bool bIsFirstTime= true;
                bool bPrevWasRangeSeparator= false;
				bool bUseComplement= false;
                char cPrev;
				cPrev=GetChar(m_pTokEnd,bIsEscape);
				if( cPrev=='^' && !bIsEscape ){
					bUseComplement= true;
                    cPrev=GetChar(m_pTokEnd,bIsEscape);
				}
				while(	cPrev	&&	!(cPrev==']' && !bIsEscape) ){
					c=GetChar(m_pTokEnd,bIsEscape);
					if( !bIsEscape && c=='-'  ){
						c=GetChar(m_pTokEnd,bIsEscape);
                        if(!c){
                            break;
                        }
						AddRange(
							(unsigned char)cPrev,(unsigned char)c,cset);
						cPrev= GetChar(m_pTokEnd,bIsEscape);
					}else{
                        cset[(unsigned char)cPrev]= true;
                        cPrev= c;
					}
				}
                if( !cPrev  || !c ){
                    throw REXA_Exception(
                                m_pTokBeg,m_pcszToParse,"closing ] missing");
                }
                m_charSet= cset;
				if(bUseComplement){
					m_charSet.flip();
				}
                return m_eLastSymbol=eCharSet;
              }
    case '.':{
                m_pTokEnd++;
                m_charSet= REXA_CharSet();
                for(int i=0;i<256;i++){
                    m_charSet[i]= true;
                }
                return m_eLastSymbol=eCharSet;
             }
    case '$':{
                if(  isalpha(m_pTokEnd[1])||m_pTokEnd[1]=='_' ){
                    m_pTokEnd++;
                    m_strLiteral="";
                    while(  isalpha(*m_pTokEnd)||isdigit(*m_pTokEnd)||m_pTokEnd[1]=='_' ){
                        m_strLiteral+= *m_pTokEnd++;
                    }
                    return m_eLastSymbol=eIdent;
                 }
             }
    default:
        {
            if( *m_pTokEnd=='\\' && m_pTokEnd[1]=='i' ){
                m_pTokEnd++; 
                m_pTokEnd++;
                return m_eLastSymbol=eIgnoreCase;
            }else{
                bool bIsEscape;
                char c=GetChar(m_pTokEnd,bIsEscape);
                m_charSet= REXA_CharSet();
                m_charSet[(unsigned char)c]= true;
                return m_eLastSymbol=eCharSet;
            }
        }
    }
}

///////////////////////////////////////////////////////////////////////////////
// grammar related functions
///////////////////////////////////////////////////////////////////////////////
//-----------------------------------------------------------------------------
REXA_ParserImpl::~REXA_ParserImpl()
//-----------------------------------------------------------------------------
{
    for(int i=0;i<m_vecAllStates.size();++i){
        delete m_vecAllStates[i];
    }
}

//-----------------------------------------------------------------------------
void		REXA_ParserImpl::InsertNamedState(REXA_BegEnd name,
										   REXA_NDFAState* pState,int nAnswer)
//-----------------------------------------------------------------------------
{
	vector<REXA_NDFAState*>  vecAccepting= CollectAcceptingStates(pState);
	for(int i=0;i<vecAccepting.size();i++){
		vecAccepting[i]->m_nAnswer= nAnswer;
	}
    pair<map<string,REXA_NDFAState*>::iterator,bool> itBool=
        m_mapNamedStates.insert(make_pair(string(name.first,name.second),pState));
    if( !itBool.second ){
        itBool.first->second= pState;
    }
}
//-----------------------------------------------------------------------------
void REXA_ParserImpl::NamedRegExp		(int nAnswer)
//-----------------------------------------------------------------------------
//$1 NamedRegExp= Ident "=" RegExp.
{
	m_scanner.Match(eIdent);
	REXA_BegEnd name= m_scanner.GetToken();

	m_scanner.AdvanceAndMatch(eAssign);
    m_scanner();
	REXA_NDFAState* pState=RegExp();
    if( m_scanner.GetLastSymbol()!=eEof){
         throw REXA_Exception(
                        m_scanner.m_pTokBeg,
                        m_scanner.m_pcszToParse,
                        "unexpected symbol");
    }
	InsertNamedState(name,pState,nAnswer);
}
//-----------------------------------------------------------------------------
const REXA_NDFAState*  REXA_ParserImpl::ParseRegExp          (string strRegExp)
//-----------------------------------------------------------------------------
{
    m_scanner.SetSource(strRegExp.c_str());
    m_scanner();
    const REXA_NDFAState* cRegExp= RegExp();
    if( m_scanner.GetLastSymbol()!=eEof){
         throw REXA_Exception(
                        m_scanner.m_pTokBeg,
                        m_scanner.m_pcszToParse,
                        "unexpected symbol");
    }
    return cRegExp;
}
//-----------------------------------------------------------------------------
bool       REXA_ParserImpl::ParseRegExpDefinition(string strRegExpDefiniton,
											   int nAnswer)
//-----------------------------------------------------------------------------
{
    m_scanner.SetSource(strRegExpDefiniton.c_str());
    m_scanner();
    NamedRegExp(nAnswer);
    return true;
}
//-----------------------------------------------------------------------------
REXA_NDFAState* REXA_ParserImpl::RegExp			()
//-----------------------------------------------------------------------------
//$2 RegExp= Term {"|" Term}.
{
    REXA_NDFAState* pState= Term();
    while( m_scanner.GetLastSymbol()==eAlternative ){
        m_scanner();
        pState= Union(pState,Term());
    }
    return pState;
}
//-----------------------------------------------------------------------------
REXA_NDFAState* REXA_ParserImpl::Term			()
//-----------------------------------------------------------------------------
//$3 Term= Factor {Factor}.
{
    REXA_NDFAState* pState= Factor();
    while( m_scanner.GetLastSymbol()&m_eAtomSet ){
        pState= Concat(pState,Factor());
    }
    return pState;
}
//-----------------------------------------------------------------------------
REXA_NDFAState* REXA_ParserImpl::Factor			()
//-----------------------------------------------------------------------------
//$4 Factor= Atom ["*"|"+"|"?"|"\i"].
{
    REXA_NDFAState* pState= Atom();
    switch(m_scanner.GetLastSymbol()){
    case eStar:
        {
            pState= MakeStar(pState);
            m_scanner();
            return pState;
        }
    case ePlus:
        {
            pState= MakePlus(pState);
            m_scanner();
            return pState;
        }
    case eOpt:
        {
            pState= MakeOptional(pState);
            m_scanner();
            return pState;
        }
    case eIgnoreCase:
        {
            pState= MakeIgnoreCase(pState);
            m_scanner();
            return pState;
        }
    default:
        return pState;
    }
}
//-----------------------------------------------------------------------------
REXA_NDFAState* REXA_ParserImpl::Atom				()
//-----------------------------------------------------------------------------
//$5 Atom= REXA_CharSet |  Ident | "(" RegExp ").
{
    switch(m_scanner.GetLastSymbol()){
    case eCharSet:
        {
			REXA_NDFAState* pState= ParseCharSet(m_scanner.GetCharSet());
            m_scanner();
            return pState;
        }

    case eIdent:
        {
            REXA_NDFAState* pState= Name(m_scanner.GetToken());
			m_scanner();
            return pState;
        }
    case eLParen:
        {
			m_scanner();
            REXA_NDFAState* pState= RegExp();
            m_scanner.Match(eRParen);
            m_scanner();
            return pState;
        }
    default: throw REXA_Exception(
                        m_scanner.m_pTokBeg,
                        m_scanner.m_pcszToParse,
                        "[,',( or ident expected");
        
    }
    return 0;
}
//-----------------------------------------------------------------------------
REXA_NDFAState* REXA_ParserImpl::Name             (REXA_BegEnd nameToFind)
//-----------------------------------------------------------------------------
{

	map<string,REXA_NDFAState*>::iterator it=
		m_mapNamedStates.find(string(nameToFind.first,nameToFind.second));
	if( it!= m_mapNamedStates.end() ){
		return CloneNDFA(it->second);;

⌨️ 快捷键说明

复制代码 Ctrl + C
搜索代码 Ctrl + F
全屏模式 F11
切换主题 Ctrl + Shift + D
显示快捷键 ?
增大字号 Ctrl + =
减小字号 Ctrl + -