regxparser.cpp
来自「IBM的解析xml的工具Xerces的源代码」· C++ 代码 · 共 1,465 行 · 第 1/3 页
CPP
1,465 行
/* * Copyright 2003,2004 The Apache Software Foundation. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. * You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. *//* * $Log: RegxParser.cpp,v $ * Revision 1.12 2004/09/08 13:56:47 peiyongz * Apache License Version 2.0 * * Revision 1.11 2004/01/29 11:51:21 cargilld * Code cleanup changes to get rid of various compiler diagnostic messages. * * Revision 1.10 2003/12/17 00:18:37 cargilld * Update to memory management so that the static memory manager (one used to call Initialize) is only for static data. * * Revision 1.9 2003/05/18 14:02:06 knoaman * Memory manager implementation: pass per instance manager. * * Revision 1.8 2003/05/16 00:03:10 knoaman * Partial implementation of the configurable memory manager. * * Revision 1.7 2003/05/15 18:42:55 knoaman * Partial implementation of the configurable memory manager. * * Revision 1.6 2003/03/18 19:38:28 knoaman * Schema Errata E2-18 + misc. regex fixes. * * Revision 1.5 2003/03/04 16:36:17 knoaman * RegEx: fix for character category escape * * Revision 1.4 2003/01/13 19:02:23 knoaman * [Bug 14390] C++ Indentifier collision with Python. * * Revision 1.3 2002/11/04 15:17:00 tng * C++ Namespace Support. * * Revision 1.2 2002/03/18 19:29:53 knoaman * Change constant names to eliminate possible conflict with user defined ones. * * Revision 1.1.1.1 2002/02/01 22:22:30 peiyongz * sane_include * * Revision 1.10 2001/11/20 20:48:10 knoaman * Fix for invalid repeating quantifier check. * * Revision 1.9 2001/11/16 15:56:37 knoaman * Add check for invalid repeating quantifier. * * Revision 1.8 2001/09/20 13:11:42 knoaman * Regx + misc. fixes * * Revision 1.7 2001/08/31 16:53:41 knoaman * Misc. fixes. * * Revision 1.6 2001/07/26 12:46:48 knoaman * Fix for bug 2815. * * Revision 1.5 2001/06/06 13:49:27 jberry * Fix two improper NULL tests * * Revision 1.4 2001/05/11 21:51:01 knoaman * Schema updates and fixes. * * Revision 1.3 2001/05/11 13:26:48 tng * Copyright update. * * Revision 1.2 2001/05/03 18:17:45 knoaman * Some design changes: * o Changed the TokenFactory from a single static instance, to a * normal class. Each RegularExpression object will have its own * instance of TokenFactory, and that instance will be passed to * other classes that need to use a TokenFactory to create Token * objects (with the exception of RangeTokenMap). * o Added a new class RangeTokenMap to map a the different ranges * in a given category to a specific RangeFactory object. In the old * design RangeFactory had dual functionality (act as a Map, and as * a factory for creating RangeToken(s)). The RangeTokenMap will * have its own copy of the TokenFactory. There will be only one * instance of the RangeTokenMap class, and that instance will be * lazily deleted when XPlatformUtils::Terminate is called. * * Revision 1.1 2001/03/02 19:22:54 knoaman * Schema: Regular expression handling part I * */// ---------------------------------------------------------------------------// Includes// ---------------------------------------------------------------------------#include <xercesc/util/regx/RegxParser.hpp>#include <xercesc/util/XMLString.hpp>#include <xercesc/util/ParseException.hpp>#include <xercesc/util/regx/RegularExpression.hpp>#include <xercesc/util/regx/RegxUtil.hpp>#include <xercesc/util/regx/RegxDefs.hpp>#include <xercesc/util/regx/TokenInc.hpp>#include <xercesc/framework/XMLErrorCodes.hpp>XERCES_CPP_NAMESPACE_BEGIN// ---------------------------------------------------------------------------// Static member data initialization// ---------------------------------------------------------------------------const unsigned short RegxParser::S_NORMAL = 0;const unsigned short RegxParser::S_INBRACKETS = 1;const unsigned short RegxParser::S_INXBRACKETS = 2;// ---------------------------------------------------------------------------// RegxParser::ReferencePostion: Constructors and Destructor// ---------------------------------------------------------------------------RegxParser::ReferencePosition::ReferencePosition(const int refNo, const int position) :fReferenceNo(refNo) , fPosition(position){}// ---------------------------------------------------------------------------// RegxParser: Constructors and Destructors// ---------------------------------------------------------------------------RegxParser::RegxParser(MemoryManager* const manager) :fMemoryManager(manager), fHasBackReferences(false), fOptions(0), fOffset(0), fNoGroups(1), fParseContext(S_NORMAL), fStringLen(0), fState(0), fCharData(0), fString(0), fReferences(0), fTokenFactory(0){}RegxParser::~RegxParser() { fMemoryManager->deallocate(fString);//delete [] fString; delete fReferences;}// ---------------------------------------------------------------------------// RegxParser: Parsing methods// ---------------------------------------------------------------------------Token* RegxParser::parse(const XMLCh* const regxStr, const int options) { // if TokenFactory is not set do nothing. // REVISIT - should we throw an exception if (fTokenFactory == 0) { return 0; } XMLMutexLock lockInit(&fMutex); fOptions = options; fOffset = 0; fNoGroups = 1; fHasBackReferences = false; setParseContext(S_NORMAL); if (fString) fMemoryManager->deallocate(fString);//delete [] fString; fString = XMLString::replicate(regxStr, fMemoryManager); if (isSet(RegularExpression::EXTENDED_COMMENT)) { if (fString) fMemoryManager->deallocate(fString);//delete [] fString; fString = RegxUtil::stripExtendedComment(regxStr, fMemoryManager); } fStringLen = XMLString::stringLen(fString); processNext(); Token* retTok = parseRegx(); if (fOffset != fStringLen) { ThrowXMLwithMemMgr(ParseException,XMLExcepts::Parser_Parse1, fMemoryManager); } if (fReferences != 0) { unsigned int refSize = fReferences->size(); for (unsigned int i = 0; i < refSize; i++) { if (fNoGroups <= fReferences->elementAt(i)->fReferenceNo) { ThrowXMLwithMemMgr(ParseException,XMLExcepts::Parser_Parse2, fMemoryManager); } } fReferences->removeAllElements(); } return retTok;}void RegxParser::processNext() { if (fOffset >= fStringLen) { fCharData = -1; fState = REGX_T_EOF; return; } unsigned short nextState; XMLCh ch = fString[fOffset++]; fCharData = ch; if (fParseContext == S_INBRACKETS) { switch (ch) { case chBackSlash: nextState = REGX_T_BACKSOLIDUS; if (fOffset >= fStringLen) { ThrowXMLwithMemMgr(ParseException,XMLExcepts::Parser_Next1, fMemoryManager); } fCharData = fString[fOffset++]; break; case chDash: if (isSet(RegularExpression::XMLSCHEMA_MODE) && fOffset < fStringLen && fString[fOffset] == chOpenSquare) { fOffset++; nextState = REGX_T_XMLSCHEMA_CC_SUBTRACTION; } else { nextState = REGX_T_CHAR; } break; case chOpenSquare: if (!isSet(RegularExpression::XMLSCHEMA_MODE) && fOffset < fStringLen && fString[fOffset] == chColon) { fOffset++; nextState = REGX_T_POSIX_CHARCLASS_START; break; } // Through down default: if (RegxUtil::isHighSurrogate(ch) && fOffset < fStringLen) { XMLCh lowCh = fString[fOffset]; if (RegxUtil::isLowSurrogate(lowCh)) { fCharData = RegxUtil::composeFromSurrogate(ch, lowCh); fOffset++; } else { throw XMLErrs::Expected2ndSurrogateChar; } } nextState = REGX_T_CHAR; } fState = nextState; return; } switch (ch) { case chPipe: nextState = REGX_T_OR; break; case chAsterisk: nextState = REGX_T_STAR; break; case chPlus: nextState = REGX_T_PLUS; break; case chQuestion: nextState = REGX_T_QUESTION; break; case chCloseParen: nextState = REGX_T_RPAREN; break; case chPeriod: nextState = REGX_T_DOT; break; case chOpenSquare: nextState = REGX_T_LBRACKET; break; case chCaret: nextState = REGX_T_CARET; break; case chDollarSign: nextState = REGX_T_DOLLAR; break; case chOpenParen: { nextState = REGX_T_LPAREN; if (fOffset >= fStringLen) break; if (fString[fOffset] != chQuestion) break; if (++fOffset >= fStringLen) ThrowXMLwithMemMgr(ParseException,XMLExcepts::Parser_Next2, fMemoryManager); ch = fString[fOffset++]; switch (ch) { case chColon: nextState = REGX_T_LPAREN2; break; case chEqual: nextState = REGX_T_LOOKAHEAD; break; case chBang: nextState = REGX_T_NEGATIVELOOKAHEAD; break; case chOpenSquare: nextState = REGX_T_SET_OPERATIONS; break; case chCloseAngle: nextState = REGX_T_INDEPENDENT; break; case chOpenAngle: if (fOffset >= fStringLen) ThrowXMLwithMemMgr(ParseException,XMLExcepts::Parser_Next2, fMemoryManager); ch = fString[fOffset++]; if (ch == chEqual) { nextState = REGX_T_LOOKBEHIND; } else if (ch == chBang) { nextState = REGX_T_NEGATIVELOOKBEHIND; } else { ThrowXMLwithMemMgr(ParseException,XMLExcepts::Parser_Next3, fMemoryManager); } break; case chPound: while (fOffset < fStringLen) { ch = fString[fOffset++]; if (ch == chCloseParen) break; } if (ch != chCloseParen) ThrowXMLwithMemMgr(ParseException,XMLExcepts::Parser_Next4, fMemoryManager); nextState = REGX_T_COMMENT; break; default: if (ch == chDash || chLatin_a <= ch && ch <= chLatin_z || chLatin_A <= ch && ch <= chLatin_Z) { // Options fOffset--; nextState = REGX_T_MODIFIERS; break; } else if (ch == chOpenParen) { nextState = REGX_T_CONDITION; break; } ThrowXMLwithMemMgr(ParseException,XMLExcepts::Parser_Next2, fMemoryManager); } } break; case chBackSlash: nextState = REGX_T_BACKSOLIDUS; if (fOffset >= fStringLen) { ThrowXMLwithMemMgr(ParseException,XMLExcepts::Parser_Next1, fMemoryManager); } fCharData = fString[fOffset++]; break; default: nextState = REGX_T_CHAR; if (RegxUtil::isHighSurrogate(ch) && fOffset < fStringLen) { XMLCh lowCh = fString[fOffset]; if (RegxUtil::isLowSurrogate(lowCh)) { fCharData = RegxUtil::composeFromSurrogate(ch, lowCh); fOffset++; } else { throw XMLErrs::Expected2ndSurrogateChar; } } } fState = nextState;}Token* RegxParser::parseRegx(const bool matchingRParen) { Token* tok = parseTerm(matchingRParen); Token* parentTok = 0; while (fState == REGX_T_OR) { processNext(); if (parentTok == 0) { parentTok = fTokenFactory->createUnion(); parentTok->addChild(tok, fTokenFactory); tok = parentTok; } tok->addChild(parseTerm(matchingRParen), fTokenFactory); } return tok;}Token* RegxParser::parseTerm(const bool matchingRParen) { unsigned short state = fState; if (state == REGX_T_OR || state == REGX_T_EOF || (state == REGX_T_RPAREN && matchingRParen)) { return fTokenFactory->createToken(Token::T_EMPTY); } else { Token* tok = parseFactor(); Token* concatTok = 0; while ((state = fState) != REGX_T_OR && state != REGX_T_EOF && (state != REGX_T_RPAREN || !matchingRParen)) { if (concatTok == 0) { concatTok = fTokenFactory->createUnion(true); concatTok->addChild(tok, fTokenFactory); tok = concatTok; } concatTok->addChild(parseFactor(), fTokenFactory); } return tok; }}Token* RegxParser::processCaret() { processNext(); return fTokenFactory->getLineBegin();}Token* RegxParser::processDollar() { processNext(); return fTokenFactory->getLineEnd();}Token* RegxParser::processLook(const unsigned short tokType) { processNext(); Token* tok = fTokenFactory->createLook(tokType, parseRegx()); if (fState != REGX_T_RPAREN) { ThrowXMLwithMemMgr(ParseException,XMLExcepts::Parser_Factor1, fMemoryManager); } processNext(); return tok;}Token* RegxParser::processBacksolidus_A() {
⌨️ 快捷键说明
复制代码Ctrl + C
搜索代码Ctrl + F
全屏模式F11
增大字号Ctrl + =
减小字号Ctrl + -
显示快捷键?