regularexpression.cpp

来自「IBM的解析xml的工具Xerces的源代码」· C++ 代码 · 共 1,678 行 · 第 1/4 页

CPP
1,678
字号
		if (getWordType(context->fString, context->fStart, context->fLimit,						offset) != WT_LETTER			|| getPreviousWordType(context->fString, context->fStart,								   context->fLimit, offset) != WT_OTHER)			return false;		break;	case chCloseAngle:		if (context->fLength == 0 || offset == context->fStart)			return false;		if (getWordType(context->fString, context->fStart, context->fLimit,						offset) != WT_OTHER			|| getPreviousWordType(context->fString, context->fStart,								   context->fLimit, offset) != WT_LETTER)			return false;		break;	}	return true;}bool RegularExpression::matchBackReference(Context* const context,										   const XMLInt32 refNo, int& offset,										   const short direction,										   const bool ignoreCase){	if (refNo <=0 || refNo >= fNoGroups)		ThrowXMLwithMemMgr(IllegalArgumentException, XMLExcepts::Regex_BadRefNo, fMemoryManager);	if (context->fMatch->getStartPos(refNo) < 0		|| context->fMatch->getEndPos(refNo) < 0)		return false;	int start = context->fMatch->getStartPos(refNo);	int length = context->fMatch->getEndPos(refNo) - start;	int tmpOffset = (direction > 0) ? offset : offset - length;	if (context->fLimit - tmpOffset < length)		return false;	bool match = ignoreCase					? XMLString::regionIMatches(context->fString,tmpOffset,												context->fString,start,length)					: XMLString::regionMatches(context->fString, tmpOffset,											   context->fString, start,length);	if (!match)		return false;	offset = (direction > 0) ? offset + length : offset - length;	return true;}bool RegularExpression::matchString(Context* const context,									const XMLCh* const literal, int& offset,									const short direction, const bool ignoreCase){	int length = XMLString::stringLen(literal);	int tmpOffset = (direction > 0) ? offset : offset - length;	if (context->fLimit - tmpOffset < length)		return false;	bool match = ignoreCase					? XMLString::regionIMatches(context->fString, tmpOffset,												literal, 0, length)					: XMLString::regionMatches(context->fString, tmpOffset,											   literal, 0, length);	if (match) {	    offset = direction > 0 ? offset + length : offset - length;    }	return match;}int RegularExpression::matchCapture(Context* const context, const Op* const op,                                    int offset, const short direction){	// No check is made for nullness of fMatch as the function is only called if	// fMatch is not null.	XMLInt32 index = op->getData();	int save = (index > 0) ? context->fMatch->getStartPos(index)                           : context->fMatch->getEndPos(-index);	if (index > 0) {		context->fMatch->setStartPos(index, offset);		int ret = match(context, op->getNextOp(), offset, direction);		if (ret < 0)			context->fMatch->setStartPos(index, save);		return ret;	}		context->fMatch->setEndPos(-index, offset);	int ret = match(context, op->getNextOp(), offset, direction);	if (ret < 0)		context->fMatch->setEndPos(-index, save);	return ret;}bool RegularExpression::matchCondition(Context* const context,                                              const Op* const op, int offset,                                              const short direction){	int refNo = op->getRefNo();	if ( refNo > 0)		return (context->fMatch->getStartPos(refNo) >= 0                && context->fMatch->getEndPos(refNo) >= 0);	return (0 <= match(context, op->getConditionFlow(), offset, direction));}int RegularExpression::parseOptions(const XMLCh* const options){	if (options == 0)		return 0;	int opts = 0;	int length = XMLString::stringLen(options);	for (int i=0; i < length; i++) {			int v = getOptionValue(options[i]);		if (v == 0)			ThrowXMLwithMemMgr1(ParseException, XMLExcepts::Regex_UnknownOption, options, fMemoryManager);		opts |= v;	}	return opts;}void RegularExpression::compile(const Token* const token) {	if (fOperations != 0)		return;	fNoClosures = 0;	fOperations = compile(token, 0, false);}Op* RegularExpression::compile(const Token* const token, Op* const next,							   const bool reverse) {	Op* ret = 0;	const unsigned short tokenType = token->getTokenType();	switch(tokenType) {	case Token::T_DOT:	case Token::T_CHAR:	case Token::T_ANCHOR:	case Token::T_RANGE:	case Token::T_NRANGE:	case Token::T_STRING:	case Token::T_BACKREFERENCE:	case Token::T_EMPTY:		ret = compileSingle(token, next, tokenType);		break;	case Token::T_CONCAT:		ret = compileConcat(token, next, reverse);		break;	case Token::T_UNION:		ret = compileUnion(token, next, reverse);		break;	case Token::T_CLOSURE:	case Token::T_NONGREEDYCLOSURE:		ret = compileClosure(token, next, reverse, tokenType);		break;	case Token::T_PAREN:		ret = compileParenthesis(token, next, reverse);		break;	case Token::T_LOOKAHEAD:	case Token::T_NEGATIVELOOKAHEAD:		ret = compileLook(token, next, false, tokenType);		break;	case Token::T_LOOKBEHIND:	case Token::T_NEGATIVELOOKBEHIND:		ret = compileLook(token, next, true, tokenType);		break;	case Token::T_INDEPENDENT:	case Token::T_MODIFIERGROUP:		ret = compileLook(token, next, reverse, tokenType);		break;	case Token::T_CONDITION:		ret = compileCondition(token, next, reverse);		break;	default:		ThrowXMLwithMemMgr(RuntimeException, XMLExcepts::Regex_UnknownTokenType, fMemoryManager);		break; // this line to be deleted	}	return ret;}/* * Helper for Replace. This method prepares the replacement string by substituting * in actual values for parenthesized sub expressions.  * * An error will be thrown if: *  1) repString references an undefined subExpression *  2) there is an unescaped chDollar which is not followed by a digit * */const XMLCh* RegularExpression::subInExp(const XMLCh* const repString,                                          const XMLCh* const origString,                                          const Match* subEx){  int numSubExp = subEx->getNoGroups() - 1;  if (numSubExp == 0)    return XMLString::replicate(repString, fMemoryManager);    bool notEscaped = true;                     XMLBuffer newString(1023, fMemoryManager);                       XMLCh indexStr[2]; //holds the string rep of a   indexStr[1] = chNull;  int index = -1;  for (const XMLCh* ptr = repString; *ptr != chNull; ptr++){    if ((*ptr == chDollarSign) && notEscaped) {            ptr++;            //check that after the $ is a digit       if (!XMLString::isDigit(*ptr)){               //invalid replace string - $ must be followed by a digit				ThrowXMLwithMemMgr(RuntimeException, XMLExcepts::Regex_InvalidRepPattern, fMemoryManager);      }              indexStr[0] = *ptr;                     //get the digit       index = XMLString::parseInt(indexStr, fMemoryManager);  //convert it to an int      //now check that the index is legal      if (index > numSubExp){				ThrowXMLwithMemMgr(RuntimeException, XMLExcepts::Regex_InvalidRepPattern, fMemoryManager);      }              int start = subEx->getStartPos(index);      int end = subEx->getEndPos(index);      //now copy the substring into the new string      for (int i=start; i<end; i++){        newString.append(origString[i]);      }              } else {       //if you have a slash and then a character that's not a $ or /,       //then it's an invalid replace string        if (!notEscaped && (*ptr != chDollarSign && *ptr != chBackSlash)){				ThrowXMLwithMemMgr(RuntimeException, XMLExcepts::Regex_InvalidRepPattern, fMemoryManager);      }            if (*ptr == chBackSlash){        notEscaped = false;        continue;              }else           notEscaped = true;        newString.append(*ptr);    }  }  return XMLString::replicate(newString.getRawBuffer(), fMemoryManager);       }/* * Prepares for matching. This method is called just before starting matching */void RegularExpression::prepare() {	XMLMutexLock lockInit(&fMutex);	compile(fTokenTree);	fMinLength = fTokenTree->getMinLength();	fFirstChar = 0;	if (!isSet(fOptions, PROHIBIT_HEAD_CHARACTER_OPTIMIZATION) &&		!isSet(fOptions, XMLSCHEMA_MODE))							{		RangeToken* rangeTok = fTokenFactory->createRange();		int result = fTokenTree->analyzeFirstCharacter(rangeTok, fOptions, fTokenFactory);		if (result == Token::FC_TERMINAL) {			rangeTok->compactRanges();			fFirstChar = rangeTok;		}	}	if (fOperations != 0 && fOperations->getNextOp() == 0 &&		(fOperations->getOpType() == Op::O_STRING ||		 fOperations->getOpType() == Op::O_CHAR) )			 {		fFixedStringOnly = true;		if (fOperations->getOpType() == Op::O_STRING) {			fMemoryManager->deallocate(fFixedString);//delete [] fFixedString;			fFixedString = XMLString::replicate(fOperations->getLiteral(), fMemoryManager);		}		else{						XMLInt32 ch = fOperations->getData();			if ( ch >= 0x10000) { // add as constant				fMemoryManager->deallocate(fFixedString);//delete [] fFixedString;				fFixedString = RegxUtil::decomposeToSurrogates(ch, fMemoryManager);			}			else {				XMLCh* dummyStr = (XMLCh*) fMemoryManager->allocate(2 * sizeof(XMLCh));//new XMLCh[2];				dummyStr[0] = (XMLCh) fOperations->getData();				dummyStr[1] = chNull;				fMemoryManager->deallocate(fFixedString);//delete [] fFixedString;				fFixedString = dummyStr;			}		}		fBMPattern = new (fMemoryManager) BMPattern(fFixedString, 256,								  isSet(fOptions, IGNORE_CASE), fMemoryManager);	}	else if (!isSet(fOptions, XMLSCHEMA_MODE) &&			 !isSet(fOptions, PROHIBIT_FIXED_STRING_OPTIMIZATION)) {		int fixedOpts = 0;		Token* tok = fTokenTree->findFixedString(fOptions, fixedOpts);		fMemoryManager->deallocate(fFixedString);//delete [] fFixedString;		fFixedString = (tok == 0) ? 0			: XMLString::replicate(tok->getString(), fMemoryManager);		if (fFixedString != 0 && XMLString::stringLen(fFixedString) < 2) {			fMemoryManager->deallocate(fFixedString);//delete [] fFixedString;			fFixedString = 0;		}				if (fFixedString != 0) {			fBMPattern = new (fMemoryManager) BMPattern(fFixedString, 256,									   isSet(fixedOpts, IGNORE_CASE));		}	}}unsigned short RegularExpression::getCharType(const XMLCh ch) {    if (!isSet(fOptions, UNICODE_WORD_BOUNDARY)) {		if (isSet(fOptions, USE_UNICODE_CATEGORY)) {			if (fWordRange == 0) {				fWordRange = fTokenFactory->getRange(fgUniIsWord);				if (fWordRange == 0)					ThrowXMLwithMemMgr1(RuntimeException, XMLExcepts::Regex_RangeTokenGetError, fgUniIsWord, fMemoryManager);			}			return fWordRange->match(ch) ? WT_LETTER : WT_OTHER;		}		return RegxUtil::isWordChar(ch);    }	switch (XMLUniCharacter::getType(ch)) {	case XMLUniCharacter::UPPERCASE_LETTER:	case XMLUniCharacter::LOWERCASE_LETTER:	case XMLUniCharacter::TITLECASE_LETTER:	case XMLUniCharacter::MODIFIER_LETTER:	case XMLUniCharacter::OTHER_LETTER:	case XMLUniCharacter::LETTER_NUMBER:	case XMLUniCharacter::DECIMAL_DIGIT_NUMBER:	case XMLUniCharacter::OTHER_NUMBER:	case XMLUniCharacter::COMBINING_SPACING_MARK:		return WT_LETTER;	case XMLUniCharacter::FORMAT:	case XMLUniCharacter::NON_SPACING_MARK:	case XMLUniCharacter::ENCLOSING_MARK:		return WT_IGNORE;	case XMLUniCharacter::CONTROL:		switch (ch) {		case chHTab:		case chLF:		case chVTab:		case chFF:		case chCR:			return WT_OTHER;		default:			return WT_IGNORE;		}	}    return WT_OTHER;}XERCES_CPP_NAMESPACE_END/**  *	End of file RegularExpression.cpp  */

⌨️ 快捷键说明

复制代码Ctrl + C
搜索代码Ctrl + F
全屏模式F11
增大字号Ctrl + =
减小字号Ctrl + -
显示快捷键?