regxparser.cpp

来自「IBM的解析xml的工具Xerces的源代码」· C++ 代码 · 共 1,465 行 · 第 1/3 页

CPP
1,465
字号
/* * Copyright 2003,2004 The Apache Software Foundation. *  * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. * You may obtain a copy of the License at *  *      http://www.apache.org/licenses/LICENSE-2.0 *  * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. *//* * $Log: RegxParser.cpp,v $ * Revision 1.12  2004/09/08 13:56:47  peiyongz * Apache License Version 2.0 * * Revision 1.11  2004/01/29 11:51:21  cargilld * Code cleanup changes to get rid of various compiler diagnostic messages. * * Revision 1.10  2003/12/17 00:18:37  cargilld * Update to memory management so that the static memory manager (one used to call Initialize) is only for static data. * * Revision 1.9  2003/05/18 14:02:06  knoaman * Memory manager implementation: pass per instance manager. * * Revision 1.8  2003/05/16 00:03:10  knoaman * Partial implementation of the configurable memory manager. * * Revision 1.7  2003/05/15 18:42:55  knoaman * Partial implementation of the configurable memory manager. * * Revision 1.6  2003/03/18 19:38:28  knoaman * Schema Errata E2-18 + misc. regex fixes. * * Revision 1.5  2003/03/04 16:36:17  knoaman * RegEx: fix for character category escape * * Revision 1.4  2003/01/13 19:02:23  knoaman * [Bug 14390] C++ Indentifier collision with Python. * * Revision 1.3  2002/11/04 15:17:00  tng * C++ Namespace Support. * * Revision 1.2  2002/03/18 19:29:53  knoaman * Change constant names to eliminate possible conflict with user defined ones. * * Revision 1.1.1.1  2002/02/01 22:22:30  peiyongz * sane_include * * Revision 1.10  2001/11/20 20:48:10  knoaman * Fix for invalid repeating quantifier check. * * Revision 1.9  2001/11/16 15:56:37  knoaman * Add check for invalid repeating quantifier. * * Revision 1.8  2001/09/20 13:11:42  knoaman * Regx  + misc. fixes * * Revision 1.7  2001/08/31 16:53:41  knoaman * Misc. fixes. * * Revision 1.6  2001/07/26 12:46:48  knoaman * Fix for bug 2815. * * Revision 1.5  2001/06/06 13:49:27  jberry * Fix two improper NULL tests * * Revision 1.4  2001/05/11 21:51:01  knoaman * Schema updates and fixes. * * Revision 1.3  2001/05/11 13:26:48  tng * Copyright update. * * Revision 1.2  2001/05/03 18:17:45  knoaman * Some design changes: * o Changed the TokenFactory from a single static instance, to a *    normal class. Each RegularExpression object will have its own *    instance of TokenFactory, and that instance will be passed to *    other classes that need to use a TokenFactory to create Token *    objects (with the exception of RangeTokenMap). * o Added a new class RangeTokenMap to map a the different ranges *    in a given category to a specific RangeFactory object. In the old *    design RangeFactory had dual functionality (act as a Map, and as *    a factory for creating RangeToken(s)). The RangeTokenMap will *    have its own copy of the TokenFactory. There will be only one *    instance of the RangeTokenMap class, and that instance will be *    lazily deleted when XPlatformUtils::Terminate is called. * * Revision 1.1  2001/03/02 19:22:54  knoaman * Schema: Regular expression handling part I * */// ---------------------------------------------------------------------------//  Includes// ---------------------------------------------------------------------------#include <xercesc/util/regx/RegxParser.hpp>#include <xercesc/util/XMLString.hpp>#include <xercesc/util/ParseException.hpp>#include <xercesc/util/regx/RegularExpression.hpp>#include <xercesc/util/regx/RegxUtil.hpp>#include <xercesc/util/regx/RegxDefs.hpp>#include <xercesc/util/regx/TokenInc.hpp>#include <xercesc/framework/XMLErrorCodes.hpp>XERCES_CPP_NAMESPACE_BEGIN// ---------------------------------------------------------------------------//  Static member data initialization// ---------------------------------------------------------------------------const unsigned short RegxParser::S_NORMAL		= 0;const unsigned short RegxParser::S_INBRACKETS	= 1;const unsigned short RegxParser::S_INXBRACKETS	= 2;// ---------------------------------------------------------------------------//  RegxParser::ReferencePostion: Constructors and Destructor// ---------------------------------------------------------------------------RegxParser::ReferencePosition::ReferencePosition(const int refNo,						 const int position)	:fReferenceNo(refNo)	, fPosition(position){}// ---------------------------------------------------------------------------//  RegxParser: Constructors and Destructors// ---------------------------------------------------------------------------RegxParser::RegxParser(MemoryManager* const manager)    :fMemoryManager(manager),     fHasBackReferences(false),     fOptions(0),     fOffset(0),     fNoGroups(1),     fParseContext(S_NORMAL),     fStringLen(0),     fState(0),     fCharData(0),     fString(0),     fReferences(0),     fTokenFactory(0){}RegxParser::~RegxParser() {	fMemoryManager->deallocate(fString);//delete [] fString;	delete fReferences;}// ---------------------------------------------------------------------------//  RegxParser: Parsing methods// ---------------------------------------------------------------------------Token* RegxParser::parse(const XMLCh* const regxStr, const int options) {    // if TokenFactory is not set do nothing.    // REVISIT - should we throw an exception    if (fTokenFactory == 0) {        return 0;    }	XMLMutexLock lockInit(&fMutex);	fOptions = options;	fOffset = 0;	fNoGroups = 1;	fHasBackReferences = false;	setParseContext(S_NORMAL);	if (fString)        fMemoryManager->deallocate(fString);//delete [] fString;	fString = XMLString::replicate(regxStr, fMemoryManager);	if (isSet(RegularExpression::EXTENDED_COMMENT)) {        if (fString)            fMemoryManager->deallocate(fString);//delete [] fString;		fString = RegxUtil::stripExtendedComment(regxStr, fMemoryManager);    }    fStringLen = XMLString::stringLen(fString);    processNext();    Token* retTok = parseRegx();	if (fOffset != fStringLen) {        ThrowXMLwithMemMgr(ParseException,XMLExcepts::Parser_Parse1, fMemoryManager);    }    if (fReferences != 0) {		unsigned int refSize = fReferences->size();        for (unsigned int i = 0; i < refSize; i++) {			if (fNoGroups <= fReferences->elementAt(i)->fReferenceNo) {                ThrowXMLwithMemMgr(ParseException,XMLExcepts::Parser_Parse2, fMemoryManager);            }        }		fReferences->removeAllElements();    }    return retTok;}void RegxParser::processNext() {    if (fOffset >= fStringLen) {        fCharData = -1;        fState = REGX_T_EOF;        return;	}    unsigned short nextState;	XMLCh ch = fString[fOffset++];	fCharData = ch;    if (fParseContext == S_INBRACKETS) {		switch (ch) {        case chBackSlash:            nextState = REGX_T_BACKSOLIDUS;			if (fOffset >= fStringLen) {				ThrowXMLwithMemMgr(ParseException,XMLExcepts::Parser_Next1, fMemoryManager);			}			fCharData = fString[fOffset++];			break;		case chDash:            if (isSet(RegularExpression::XMLSCHEMA_MODE)                && fOffset < fStringLen && fString[fOffset] == chOpenSquare) {                fOffset++;                nextState = REGX_T_XMLSCHEMA_CC_SUBTRACTION;            }            else {                nextState = REGX_T_CHAR;            }            break;        case chOpenSquare:            if (!isSet(RegularExpression::XMLSCHEMA_MODE)                && fOffset < fStringLen && fString[fOffset] == chColon) {                fOffset++;                nextState = REGX_T_POSIX_CHARCLASS_START;                break;			} // Through down        default:            if (RegxUtil::isHighSurrogate(ch) && fOffset < fStringLen) {                XMLCh lowCh = fString[fOffset];                if (RegxUtil::isLowSurrogate(lowCh)) {                    fCharData = RegxUtil::composeFromSurrogate(ch, lowCh);					fOffset++;                }				else {                    throw XMLErrs::Expected2ndSurrogateChar;                }            }			nextState = REGX_T_CHAR;        }        fState = nextState;        return;    }    switch (ch) {    case chPipe:        nextState = REGX_T_OR;        break;    case chAsterisk:        nextState = REGX_T_STAR;        break;    case chPlus:        nextState = REGX_T_PLUS;        break;    case chQuestion:		nextState = REGX_T_QUESTION;		break;    case chCloseParen:        nextState = REGX_T_RPAREN;        break;    case chPeriod:		nextState = REGX_T_DOT;		break;    case chOpenSquare:        nextState = REGX_T_LBRACKET;        break;    case chCaret:        nextState = REGX_T_CARET;        break;    case chDollarSign:		nextState = REGX_T_DOLLAR;		break;	case chOpenParen:        {		    nextState = REGX_T_LPAREN;            if (fOffset >= fStringLen)                break;			if (fString[fOffset] != chQuestion)                break;            if (++fOffset >= fStringLen)                ThrowXMLwithMemMgr(ParseException,XMLExcepts::Parser_Next2, fMemoryManager);            ch = fString[fOffset++];            switch (ch) {            case chColon:                nextState = REGX_T_LPAREN2;                break;			case chEqual:                nextState = REGX_T_LOOKAHEAD;                break;            case chBang:                nextState = REGX_T_NEGATIVELOOKAHEAD;                break;            case chOpenSquare:                nextState = REGX_T_SET_OPERATIONS;                break;            case chCloseAngle:                nextState = REGX_T_INDEPENDENT;				break;            case chOpenAngle:				if (fOffset >= fStringLen)					ThrowXMLwithMemMgr(ParseException,XMLExcepts::Parser_Next2, fMemoryManager);				ch = fString[fOffset++];				if (ch == chEqual) {					nextState = REGX_T_LOOKBEHIND;				}				else if (ch == chBang) {					nextState = REGX_T_NEGATIVELOOKBEHIND;				}				else {					ThrowXMLwithMemMgr(ParseException,XMLExcepts::Parser_Next3, fMemoryManager);				}				break;            case chPound:				while (fOffset < fStringLen) {					ch = fString[fOffset++];					if (ch == chCloseParen)						break;				}				if (ch != chCloseParen)					ThrowXMLwithMemMgr(ParseException,XMLExcepts::Parser_Next4, fMemoryManager);				nextState = REGX_T_COMMENT;				break;            default:				if (ch == chDash || chLatin_a <= ch && ch <= chLatin_z                    || chLatin_A <= ch && ch <= chLatin_Z) { // Options                    fOffset--;                    nextState = REGX_T_MODIFIERS;                    break;                }                else if (ch == chOpenParen) {                    nextState = REGX_T_CONDITION;                    break;                }                ThrowXMLwithMemMgr(ParseException,XMLExcepts::Parser_Next2, fMemoryManager);            }        }		break;	case chBackSlash:        nextState = REGX_T_BACKSOLIDUS;        if (fOffset >= fStringLen) {			ThrowXMLwithMemMgr(ParseException,XMLExcepts::Parser_Next1, fMemoryManager);        }        fCharData = fString[fOffset++];        break;	default:		nextState = REGX_T_CHAR;		if (RegxUtil::isHighSurrogate(ch) && fOffset < fStringLen) {                XMLCh lowCh = fString[fOffset];                if (RegxUtil::isLowSurrogate(lowCh)) {                    fCharData = RegxUtil::composeFromSurrogate(ch, lowCh);					fOffset++;                }				else {                    throw XMLErrs::Expected2ndSurrogateChar;                }            }	}	fState = nextState;}Token* RegxParser::parseRegx(const bool matchingRParen) {    Token* tok = parseTerm(matchingRParen);    Token* parentTok = 0;    while (fState == REGX_T_OR) {        processNext();        if (parentTok == 0) {            parentTok = fTokenFactory->createUnion();		    parentTok->addChild(tok, fTokenFactory);            tok = parentTok;        }        tok->addChild(parseTerm(matchingRParen), fTokenFactory);    }    return tok;}Token* RegxParser::parseTerm(const bool matchingRParen) {    unsigned short state = fState;    if (state == REGX_T_OR || state == REGX_T_EOF        || (state == REGX_T_RPAREN && matchingRParen)) {        return fTokenFactory->createToken(Token::T_EMPTY);    }    else {        Token* tok = parseFactor();        Token* concatTok = 0;        while ((state = fState) != REGX_T_OR && state != REGX_T_EOF               && (state != REGX_T_RPAREN || !matchingRParen))        {            if (concatTok == 0) {                concatTok = fTokenFactory->createUnion(true);                concatTok->addChild(tok, fTokenFactory);                tok = concatTok;            }            concatTok->addChild(parseFactor(), fTokenFactory);        }        return tok;    }}Token* RegxParser::processCaret() {    processNext();	return fTokenFactory->getLineBegin();}Token* RegxParser::processDollar() {    processNext();    return fTokenFactory->getLineEnd();}Token* RegxParser::processLook(const unsigned short tokType) {    processNext();	Token* tok = fTokenFactory->createLook(tokType, parseRegx());    if (fState != REGX_T_RPAREN) {        ThrowXMLwithMemMgr(ParseException,XMLExcepts::Parser_Factor1, fMemoryManager);    }    processNext();    return tok;}Token* RegxParser::processBacksolidus_A() {

⌨️ 快捷键说明

复制代码Ctrl + C
搜索代码Ctrl + F
全屏模式F11
增大字号Ctrl + =
减小字号Ctrl + -
显示快捷键?