regxparser.cpp

来自「IBM的解析xml的工具Xerces的源代码」· C++ 代码 · 共 1,465 行 · 第 1/3 页

CPP
1,465
字号
    case REGX_T_BACKSOLIDUS:		switch(fCharData) {        case chLatin_d:        case chLatin_D:        case chLatin_w:        case chLatin_W:        case chLatin_s:        case chLatin_S:            tok = getTokenForShorthand(fCharData);            processNext();            return tok;        case chLatin_c:            return processBacksolidus_c();        case chLatin_C:            return processBacksolidus_C();        case chLatin_i:            return processBacksolidus_i();        case chLatin_I:            return processBacksolidus_I();        case chLatin_g:            return processBacksolidus_g();        case chLatin_X:            return processBacksolidus_X();        case chDigit_0:        case chDigit_1:        case chDigit_2:        case chDigit_3:        case chDigit_4:        case chDigit_5:        case chDigit_6:        case chDigit_7:        case chDigit_8:        case chDigit_9:            return processBackReference();        case chLatin_p:        case chLatin_P:			{								tok = processBacksolidus_pP(fCharData);				if (tok == 0) {					ThrowXMLwithMemMgr(ParseException,XMLExcepts::Parser_Atom5, fMemoryManager);				}			}            break;        default:            {                XMLInt32 ch = decodeEscaped();                if (ch < 0x10000) {                    tok = fTokenFactory->createChar(ch);                }                else {                    XMLCh* surrogateStr = RegxUtil::decomposeToSurrogates(ch, fMemoryManager);				    ArrayJanitor<XMLCh> janSurrogate(surrogateStr, fMemoryManager);				    tok = fTokenFactory->createString(surrogateStr);                }            }			break;		} // end switch        processNext();        break;    case REGX_T_CHAR:        if (fCharData == chOpenCurly            || fCharData == chCloseCurly            || fCharData == chCloseSquare)            ThrowXMLwithMemMgr(ParseException,XMLExcepts::Parser_Atom4, fMemoryManager);        tok = fTokenFactory->createChar(fCharData);        processNext();        break;    default:        ThrowXMLwithMemMgr(ParseException,XMLExcepts::Parser_Atom4, fMemoryManager);    } //end switch    return tok;}RangeToken* RegxParser::processBacksolidus_pP(const XMLInt32 ch) {    processNext();    if (fState != REGX_T_CHAR || fCharData != chOpenCurly)        ThrowXMLwithMemMgr(ParseException,XMLExcepts::Parser_Atom2, fMemoryManager);    int nameStart = fOffset;    int nameEnd = XMLString::indexOf(fString,chCloseCurly,nameStart, fMemoryManager);    if (nameEnd < 0)        ThrowXMLwithMemMgr(ParseException,XMLExcepts::Parser_Atom3, fMemoryManager);        fOffset = nameEnd + 1;    XMLCh* rangeName = (XMLCh*) fMemoryManager->allocate    (        (nameEnd - nameStart + 1) * sizeof(XMLCh)    );//new XMLCh[(nameEnd - nameStart) + 1];    ArrayJanitor<XMLCh> janRangeName(rangeName, fMemoryManager);    XMLString::subString(rangeName, fString, nameStart, nameEnd, fMemoryManager);    return  fTokenFactory->getRange(rangeName, !(ch == chLatin_p));}XMLInt32 RegxParser::processCInCharacterClass(RangeToken* const,                                              const XMLInt32) {	return decodeEscaped();}RangeToken* RegxParser::parseCharacterClass(const bool useNRange) {    setParseContext(S_INBRACKETS);	processNext();    RangeToken* base = 0;    RangeToken* tok = 0;    bool nRange = false;	if (fState == REGX_T_CHAR && fCharData == chCaret) {        nRange = true;        processNext();		if (useNRange) {            tok = fTokenFactory->createRange(true);        }        else {			base = fTokenFactory->createRange();            base->addRange(0, Token::UTF16_MAX);            tok = fTokenFactory->createRange();        }    }    else {        tok = fTokenFactory->createRange();    }    bool firstLoop = true;    while (fState != REGX_T_EOF) {        if (fState == REGX_T_CHAR && fCharData == chCloseSquare && !firstLoop)			break;        bool end = false;        XMLInt32 ch = fCharData;        firstLoop = false;        if (fState == REGX_T_BACKSOLIDUS) {            switch(ch) {            case chLatin_d:            case chLatin_D:            case chLatin_w:            case chLatin_W:            case chLatin_s:            case chLatin_S:                tok->mergeRanges(getTokenForShorthand(ch));                end = true;				break;            case chLatin_i:            case chLatin_I:            case chLatin_c:            case chLatin_C:                ch = processCInCharacterClass(tok, ch);				if (ch < 0){				    end = true;                }                break;            case chLatin_p:            case chLatin_P:				{										RangeToken* tok2 = processBacksolidus_pP(ch);					if (tok2 == 0) {						ThrowXMLwithMemMgr(ParseException,XMLExcepts::Parser_Atom5, fMemoryManager);					}					tok->mergeRanges(tok2);					end = true;				}                break;            default:                ch = decodeEscaped();			}        } // end if REGX_T_BACKSOLIDUS        else if (fState == REGX_T_POSIX_CHARCLASS_START) {            int nameEnd = XMLString::indexOf(fString, chColon, fOffset, fMemoryManager);            if (nameEnd < 0) {				ThrowXMLwithMemMgr(ParseException,XMLExcepts::Parser_CC1, fMemoryManager);			}            bool positive = true;            if (fString[fOffset] == chCaret) {                fOffset++;                positive = false;            }			XMLCh* name = (XMLCh*) fMemoryManager->allocate            (                (nameEnd - fOffset + 1) * sizeof(XMLCh)            );//new XMLCh[(nameEnd - fOffset) + 1];			ArrayJanitor<XMLCh> janName(name, fMemoryManager);			XMLString::subString(name, fString, fOffset, nameEnd, fMemoryManager);            RangeToken* rangeTok = fTokenFactory->getRange(name, !positive);            if (rangeTok == 0) {				ThrowXMLwithMemMgr(ParseException,XMLExcepts::Parser_CC3, fMemoryManager);            }			tok->mergeRanges(rangeTok);			end = true;			if (nameEnd+1 >= fStringLen || fString[nameEnd+1] != chCloseSquare) {				ThrowXMLwithMemMgr(ParseException,XMLExcepts::Parser_CC1, fMemoryManager);			}			fOffset = nameEnd + 2;        }        processNext();		if (!end) {            if (fState != REGX_T_CHAR || fCharData != chDash) {                tok->addRange(ch, ch);            }            else {                processNext();                if (fState == REGX_T_EOF)                    ThrowXMLwithMemMgr(ParseException,XMLExcepts::Parser_CC2, fMemoryManager);                if (fState == REGX_T_CHAR && fCharData == chCloseSquare) {                    tok->addRange(ch, ch);                    tok->addRange(chDash, chDash);                }                else {                    XMLInt32 rangeEnd = fCharData;                    if (fState == REGX_T_BACKSOLIDUS) {                        rangeEnd = decodeEscaped();                    }                    processNext();                    tok->addRange(ch, rangeEnd);                }            }        }        if (isSet(RegularExpression::SPECIAL_COMMA)            && fState == REGX_T_CHAR && fCharData == chComma) {            processNext();        }    } // end while fState	if (fState == REGX_T_EOF) {        ThrowXMLwithMemMgr(ParseException,XMLExcepts::Parser_CC2, fMemoryManager);	}    if (!useNRange && nRange) {        base->subtractRanges(tok);        tok = base;    }    tok->sortRanges();    tok->compactRanges();    setParseContext(S_NORMAL);    processNext();    return tok;}RangeToken* RegxParser::parseSetOperations() {    RangeToken* tok = parseCharacterClass(false);    while (fState != REGX_T_RPAREN) {		if (fState == REGX_T_CHAR            && (fCharData == chDash || fCharData == chAmpersand)            || fState == REGX_T_PLUS) {            processNext();            if (fState != REGX_T_LBRACKET)                ThrowXMLwithMemMgr(ParseException,XMLExcepts::Parser_Ope1, fMemoryManager);            RangeToken* tok2 = parseCharacterClass(false);            if (fState == REGX_T_PLUS) {                tok->mergeRanges(tok2);            }            else if (fCharData == chDash) {                tok->subtractRanges(tok2);            }            else if (fCharData == chAmpersand) {                tok->intersectRanges(tok2);            }            else {                throw 0; // ThrowXMLwithMemMgr(RuntimeException, "ASSERT")            }        }        else {			ThrowXMLwithMemMgr(ParseException,XMLExcepts::Parser_Ope2, fMemoryManager);		}    }    processNext();    return tok;}Token* RegxParser::getTokenForShorthand(const XMLInt32 ch) {    Token* tok = 0;    bool useUnicode = isSet(RegularExpression::USE_UNICODE_CATEGORY);	switch (ch) {	case chLatin_d:		tok = useUnicode ? fTokenFactory->getRange(fgUniDecimalDigit)						 : fTokenFactory->getRange(fgASCIIDigit);		break;	case chLatin_D:		tok = useUnicode ? fTokenFactory->getRange(fgUniDecimalDigit, true)						 : fTokenFactory->getRange(fgASCIIDigit, true);		break;	case chLatin_w:		tok = useUnicode ? fTokenFactory->getRange(fgUniIsWord)						 : fTokenFactory->getRange(fgASCIIWord);		break;	case chLatin_W:		tok = useUnicode ? fTokenFactory->getRange(fgUniIsWord, true)						 : fTokenFactory->getRange(fgASCIIWord, true);		break;	case chLatin_s:		tok = useUnicode ? fTokenFactory->getRange(fgUniIsSpace)						 : fTokenFactory->getRange(fgASCIISpace);		break;	case chLatin_S:		tok = useUnicode ? fTokenFactory->getRange(fgUniIsSpace, true)						 : fTokenFactory->getRange(fgASCIISpace, true);//	default://		ThrowXMLwithMemMgr(RuntimeException, "Invalid shorthand {0}", chAsString)	}    return tok;}XMLInt32 RegxParser::decodeEscaped() {    if (fState != REGX_T_BACKSOLIDUS)		ThrowXMLwithMemMgr(ParseException,XMLExcepts::Parser_Next1, fMemoryManager);    XMLInt32 ch = fCharData;	switch (ch) {	case chLatin_e:		ch = 0x1B; // Escape		break;	case chLatin_f:		ch = chFF;		break;	case chLatin_n:		ch = chLF;		break;	case chLatin_r:		ch = chCR;		break;	case chLatin_t:		ch = chHTab;		break;	case chLatin_x:		{			processNext();			if (fState != REGX_T_CHAR) {				ThrowXMLwithMemMgr(ParseException,XMLExcepts::Parser_Descape1, fMemoryManager);			}			if (fCharData == chOpenCurly) {				int v1 = 0;				XMLInt32 uv = 0;				do {					processNext();					if (fState != REGX_T_CHAR)						ThrowXMLwithMemMgr(ParseException,XMLExcepts::Parser_Descape1, fMemoryManager);					if ((v1 = hexChar(fCharData)) < 0)						break;					uv = uv*16 + v1;				} while (true);				if (fCharData != chCloseCurly)					ThrowXMLwithMemMgr(ParseException,XMLExcepts::Parser_Descape3, fMemoryManager);				if (uv > Token::UTF16_MAX)					ThrowXMLwithMemMgr(ParseException,XMLExcepts::Parser_Descape4, fMemoryManager);				ch = uv;			}			else {				int v1 = 0;				if (fState != REGX_T_CHAR || (v1 = hexChar(fCharData)) < 0)					ThrowXMLwithMemMgr(ParseException,XMLExcepts::Parser_Descape1, fMemoryManager);				int uv = v1;				processNext();				if (fState != REGX_T_CHAR || (v1 = hexChar(fCharData)) < 0)					ThrowXMLwithMemMgr(ParseException,XMLExcepts::Parser_Descape1, fMemoryManager);				ch = uv*16 + v1;			}		}		break;	case chLatin_u:		{			int v1 = 0;			int uv = 0;			for (int i=0; i< 4; i++) {				processNext();				if (fState != REGX_T_CHAR || (v1 = hexChar(fCharData)) < 0)					ThrowXMLwithMemMgr(ParseException,XMLExcepts::Parser_Descape1, fMemoryManager);				uv = (i == 0) ? v1 : uv*16 + v1;			}			ch = uv;		}		break;	case chLatin_v:		{			int v1 = 0;			int uv = 0;			for (int i=0; i< 6; i++) {				processNext();				if (fState != REGX_T_CHAR || (v1 = hexChar(fCharData)) < 0)					ThrowXMLwithMemMgr(ParseException,XMLExcepts::Parser_Descape1, fMemoryManager);				uv = (i == 0) ? v1 : uv*16 + v1;			}			if (uv > Token::UTF16_MAX)				ThrowXMLwithMemMgr(ParseException,XMLExcepts::Parser_Descape1, fMemoryManager);			ch = uv;		}		break;	case chLatin_A:	case chLatin_Z:	case chLatin_z:		ThrowXMLwithMemMgr(ParseException,XMLExcepts::Parser_Descape5, fMemoryManager);	} // end switch    return ch;}// ---------------------------------------------------------------------------//  RegxParser: Helper Methods// ---------------------------------------------------------------------------bool RegxParser::checkQuestion(const int off) {    return ((off < fStringLen) && fString[off] == chQuestion);}XERCES_CPP_NAMESPACE_END/**  *	End file RegxParser.cpp  */

⌨️ 快捷键说明

复制代码Ctrl + C
搜索代码Ctrl + F
全屏模式F11
增大字号Ctrl + =
减小字号Ctrl + -
显示快捷键?