regularexpression.cpp

来自「IBM的解析xml的工具Xerces的源代码」· C++ 代码 · 共 1,678 行 · 第 1/4 页

CPP
1,678
字号
// ---------------------------------------------------------------------------XMLCh* RegularExpression::replace(const XMLCh* const matchString,                                   const XMLCh* const replaceString){  return replace(matchString, replaceString, 0,                  XMLString::stringLen(matchString));}XMLCh* RegularExpression::replace(const XMLCh* const matchString,                                    const XMLCh* const replaceString,                                  const int start, const int end){  //check if matches zero length string - throw error if so  if (matches(XMLUni::fgZeroLenString, fMemoryManager)){		ThrowXMLwithMemMgr(RuntimeException, XMLExcepts::Regex_RepPatMatchesZeroString, fMemoryManager);  }        RefVectorOf<Match> *subEx = new (fMemoryManager) RefVectorOf<Match>(10, true, fMemoryManager);	  Janitor<RefVectorOf<Match> > janSubEx(subEx);  //Call to tokenize with Match vector so that we keep track of the locations  //of the subExpression within each of the matches  RefArrayVectorOf<XMLCh>* tokenStack = tokenize(matchString, start, end, subEx);	  Janitor<RefArrayVectorOf<XMLCh> > janTokStack(tokenStack);      XMLBuffer result(1023, fMemoryManager);    int numSubEx = 0;    if (subEx && subEx->size() > 0)    numSubEx = subEx->elementAt(0)->getNoGroups() - 1;    int tokStackSize = tokenStack->size();  const XMLCh* curRepString = XMLString::replicate(replaceString, fMemoryManager);      for (int i = 0; i < tokStackSize; i++){          result.append(tokenStack->elementAt(i));      if (i != tokStackSize - 1) {             //if there are subExpressions, then determine the string we want to       //substitute in.        if (numSubEx != 0) {            fMemoryManager->deallocate((XMLCh*)curRepString);            curRepString = subInExp(replaceString, matchString, subEx->elementAt(i));             }      result.append(curRepString);    }  }        fMemoryManager->deallocate((XMLCh*)curRepString);  return XMLString::replicate(result.getRawBuffer(), fMemoryManager);     }// ---------------------------------------------------------------------------//  RegularExpression: Helpers methods// ---------------------------------------------------------------------------int RegularExpression::getOptionValue(const XMLCh ch) {	int ret = 0;	switch (ch) {		case chLatin_i:			ret = IGNORE_CASE;			break;		case chLatin_m:			ret = MULTIPLE_LINE;			break;		case chLatin_s:			ret = SINGLE_LINE;			break;		case chLatin_x:			ret = EXTENDED_COMMENT;			break;		case chLatin_u:			ret = USE_UNICODE_CATEGORY;			break;		case chLatin_w:			ret = UNICODE_WORD_BOUNDARY;			break;		case chLatin_F:			ret = PROHIBIT_FIXED_STRING_OPTIMIZATION;			break;		case chLatin_H:			ret = PROHIBIT_HEAD_CHARACTER_OPTIMIZATION;			break;		case chLatin_X:			ret = XMLSCHEMA_MODE;			break;		case chComma:			ret = SPECIAL_COMMA;			break;		default:			break;	}	return ret;}int RegularExpression::match(Context* const context, const Op* const operations							 , int offset, const short direction){	const Op* tmpOp = operations;	bool ignoreCase = isSet(fOptions, IGNORE_CASE);	while (true) {		if (tmpOp == 0)			break;		if (offset > context->fLimit || offset < context->fStart)			return -1;		switch(tmpOp->getOpType()) {		case Op::O_CHAR:			if (!matchChar(context, tmpOp->getData(), offset, direction,						   ignoreCase))				return -1;			tmpOp = tmpOp->getNextOp();			break;		case Op::O_DOT:			if (!matchDot(context, offset, direction))				return -1;			tmpOp = tmpOp->getNextOp();			break;		case Op::O_RANGE:		case Op::O_NRANGE:			if (!matchRange(context, tmpOp, offset, direction, ignoreCase))				return -1;			tmpOp = tmpOp->getNextOp();			break;		case Op::O_ANCHOR:			if (!matchAnchor(context, tmpOp->getData(), offset))				return -1;			tmpOp = tmpOp->getNextOp();			break;		case Op::O_BACKREFERENCE:			if (!matchBackReference(context, tmpOp->getData(), offset,									direction, ignoreCase))				return -1;			tmpOp = tmpOp->getNextOp();			break;		case Op::O_STRING:			if (!matchString(context, tmpOp->getLiteral(), offset, direction,							 ignoreCase))				return -1;			tmpOp = tmpOp->getNextOp();			break;		case Op::O_CLOSURE:			{				XMLInt32 id = tmpOp->getData();				if (id >= 0) {					int prevOffset = context->fOffsets[id];					if (prevOffset < 0 || prevOffset != offset) {						context->fOffsets[id] = offset;					}					else {						context->fOffsets[id] = -1;						tmpOp = tmpOp->getNextOp();						break;					}				}				int ret = match(context, tmpOp->getChild(), offset, direction);				if (id >= 0) {					context->fOffsets[id] = -1;				}				if (ret >= 0)					return ret;				tmpOp = tmpOp->getNextOp();			}			break;		case Op::O_QUESTION:			{				int ret = match(context, tmpOp->getChild(), offset, direction);				if (ret >= 0)					return ret;				tmpOp = tmpOp->getNextOp();			}			break;		case Op::O_NONGREEDYCLOSURE:		case Op::O_NONGREEDYQUESTION:			{				int ret = match(context,tmpOp->getNextOp(),offset,direction);				if (ret >= 0)					return ret;				tmpOp = tmpOp->getChild();			}			break;		case Op::O_UNION:			{				return matchUnion(context, tmpOp, offset, direction);			}		case Op::O_CAPTURE:			if (context->fMatch != 0 && tmpOp->getData() != 0)				return matchCapture(context, tmpOp, offset, direction);			tmpOp = tmpOp->getNextOp();			break;		case Op::O_LOOKAHEAD:			if (0 > match(context, tmpOp->getChild(), offset, 1))				return -1;			tmpOp = tmpOp->getNextOp();			break;		case Op::O_NEGATIVELOOKAHEAD:			if (0 <= match(context, tmpOp->getChild(), offset, 1))				return -1;			tmpOp = tmpOp->getNextOp();			break;		case Op::O_LOOKBEHIND:			if (0 > match(context, tmpOp->getChild(), offset, -1))				return - 1;			tmpOp = tmpOp->getNextOp();			break;		case Op::O_NEGATIVELOOKBEHIND:			if (0 <= match(context, tmpOp->getChild(), offset, -1))				return -1;			tmpOp = tmpOp->getNextOp();			break;		case Op::O_INDEPENDENT:        case Op::O_MODIFIER:			{				int ret = (tmpOp->getOpType() == Op::O_INDEPENDENT)					   ? match(context, tmpOp->getChild(), offset, direction)                       : matchModifier(context, tmpOp, offset, direction);                if (ret < 0)                    return ret;				offset = ret;				tmpOp = tmpOp->getNextOp();			}			break;		case Op::O_CONDITION:			if (tmpOp->getRefNo() >= fNoGroups)				return -1;			if (matchCondition(context, tmpOp, offset, direction))				tmpOp = tmpOp->getYesFlow();			else				if (tmpOp->getNoFlow() != 0)                    tmpOp = tmpOp->getNoFlow();                else                    tmpOp = tmpOp->getNextOp();			break;		}	}		return offset;}bool RegularExpression::matchChar(Context* const context,								  const XMLInt32 ch, int& offset,								  const short direction, const bool ignoreCase){	int tmpOffset = direction > 0 ? offset : offset - 1;	if (tmpOffset >= context->fLimit || tmpOffset < 0)		return false;	XMLInt32 strCh = 0;		if (!context->nextCh(strCh, tmpOffset, direction))		return false;	bool match = ignoreCase ? matchIgnoreCase(ch, strCh)		                    : (ch == strCh);	if (!match)		return false;	offset = (direction > 0) ? ++tmpOffset : tmpOffset;	return true;}bool RegularExpression::matchDot(Context* const context, int& offset,								 const short direction){	int tmpOffset = direction > 0 ? offset : offset - 1;	if (tmpOffset >= context->fLimit || tmpOffset < 0)		return false;	XMLInt32 strCh = 0;		if (!context->nextCh(strCh, tmpOffset, direction))		return false;	if (!isSet(fOptions, SINGLE_LINE)) {		if (direction > 0 && RegxUtil::isEOLChar(strCh))			return false;		if (direction <= 0 && !RegxUtil::isEOLChar(strCh) )			return false;	}    offset = (direction > 0) ? ++tmpOffset : tmpOffset;	return true;}bool RegularExpression::matchRange(Context* const context, const Op* const op,								   int& offset, const short direction,								   const bool ignoreCase){	int tmpOffset = direction > 0 ? offset : offset - 1;	if (tmpOffset >= context->fLimit || tmpOffset < 0)		return false;	XMLInt32 strCh = 0;		if (!context->nextCh(strCh, tmpOffset, direction))		return false;	RangeToken* tok = (RangeToken *) op->getToken();	bool match = false;	if (ignoreCase) {		//REVISIT we should match ignoring case, but for now		//we will do a normal match		//tok = tok->getCaseInsensitiveToken();		//if (!token->match(strCh)) {		//	if (strCh > 0x10000)		//		return -1;			// Do case insensitive matching - uppercase match			// or lowercase match		//}		match = tok->match(strCh);	}	else		match = tok->match(strCh);	if (!match)		return false;	offset = (direction > 0) ? ++tmpOffset : tmpOffset;	return true;}bool RegularExpression::matchAnchor(Context* const context, const XMLInt32 ch,									const int offset){	switch ((XMLCh) ch) {	case chLatin_A:		if (offset != context->fStart)			return false;		break;	case chLatin_B:		if (context->fLength == 0)			break;		{			int after = getWordType(context->fString, context->fStart,									context->fLimit, offset);			if (after == WT_IGNORE				|| after == getPreviousWordType(context->fString,												context->fStart,												context->fLimit, offset))				break;		}		return false;	case chLatin_b:		if (context->fLength == 0)			return false;		{			int after = getWordType(context->fString, context->fStart,									context->fLimit, offset);			if (after == WT_IGNORE				|| after == getPreviousWordType(context->fString,												context->fStart												, context->fLimit, offset))				return false;		}		break;	case chLatin_Z:	case chDollarSign:		if ( (XMLCh) ch == chDollarSign && isSet(fOptions, MULTIPLE_LINE)) {			if (!(offset == context->fLimit || (offset < context->fLimit				&& RegxUtil::isEOLChar(context->fString[offset]))))				return false;		}		else {			if (!(offset == context->fLimit				|| (offset+1 == context->fLimit				    && RegxUtil::isEOLChar(context->fString[offset]))				|| (offset+2 == context->fLimit				    && context->fString[offset] == chCR					&& context->fString[offset+1] == chLF)))				return false;		}		break;	case chLatin_z:		if (offset != context->fLimit)			return false;		break;	case chAt:	case chCaret:		if ( (XMLCh) ch == chCaret && !isSet(fOptions, MULTIPLE_LINE)) {			if (offset != context->fStart)				return false;		}		else {			if (!(offset == context->fStart || (offset > context->fStart				      && RegxUtil::isEOLChar(context->fString[offset-1]))))				return false;		}		break;	case chOpenAngle:		if (context->fLength == 0 || offset == context->fLimit)			return false;

⌨️ 快捷键说明

复制代码Ctrl + C
搜索代码Ctrl + F
全屏模式F11
增大字号Ctrl + =
减小字号Ctrl + -
显示快捷键?