📄 engine.cpp

📁 Emdros is a text database middleware-layer aimed at storage and retrieval of "text plus information about that text."
💻 CPP
📖 第 1 页 / 共 4 页
字号:
	// Will only move forward over chars already examined by a rule;	//	therefore, getChar() can't return kEndOfText, kNeedMoreInput, etc.{	for (unsigned int i = 0; i < numChars; ++i) {		if (iBufPtr == iBufEnd) {			iBuffer[iBufEnd++] = prevStage->getChar();			if (iBufEnd == iBufStart) {				++iBufStart;				if (iBufStart == iBufSize)					iBufStart = 0;			}			if (iBufEnd == iBufSize)				iBufEnd = 0;		}		iBufPtr++;		if (iBufPtr == iBufSize)			iBufPtr = 0;	}}template<class T>static const T*binary_search(const T* array, UInt32 count, UInt32 value){	while (count > 0) {		const T*	i = array;		UInt32	count2 = count / 2;		i += count2;		if (READ(*i) < value) {			array = i + 1;			count -= count2 + 1;		}		else			count = count2;	}	return array;}longPass::classMatch(UInt32 classNumber, UInt32 inChar) const{	const UInt32*	classPtr = (const UInt32*)(matchClassBase + READ(*((const UInt32*)matchClassBase + classNumber)));	UInt32			memberCount = READ(*classPtr++);	if (bInputIsUnicode) {		if (bSupplementaryChars) {			// classes are 32-bit			const UInt32*	p = binary_search(classPtr, memberCount, inChar);			if (READ(*p) == inChar)				return p - classPtr;		}		else {			// classes are 16-bit			const UInt16*	p = binary_search((const UInt16*)classPtr, memberCount, inChar);			if (READ(*p) == inChar)				return p - (const UInt16*)classPtr;		}	}	else {		// classes are 8-bit		const UInt8*	p = binary_search((const UInt8*)classPtr, memberCount, inChar);		if (READ(*p) == inChar)			return p - (const UInt8*)classPtr;	}	return -1;}UInt32Pass::repClassMember(UInt32 classNumber, UInt32 index) const{	const UInt32*	classPtr = (const UInt32*)(repClassBase + READ(*((const UInt32*)repClassBase + classNumber)));	UInt32			memberCount = READ(*classPtr++);	if (index < memberCount)		if (bOutputIsUnicode)			if (bSupplementaryChars)				return READ(classPtr[index]);			else				return READ(((const UInt16*)classPtr)[index]);		else {			return READ(((const UInt8*)classPtr)[index]);		}	else		return 0;	// this can't happen if 
the compiler is right!}#ifdef TRACINGstatic int _depth = 0;#endif#define RETURN(x)	do { _rval = (x); goto _return_label; } while (0)#define matchYes	1#define matchNo		0UInt32Pass::match(int index, int repeats, int textLoc){/*	attempt to match pattern starting at /index/	initial repeat count is /repeats/	text offset is /textLoc/	recurses whenever we might need to backtrack	returns		matchYes	- succeeded		matchNo		- can't match at this position		other values, eg:			kNeedMoreInput			kInvalidChar			kUnmappedChar					- aborted without a definite decision*/#ifdef TRACINGcerr << "match(" << index << ", " << repeats << ", " << textLoc << ")\n";#endif	UInt32	_rval = matchNo;	// we come back here to loop rather than recurse, with new values for the argumentsRESTART:	// if this is the first attempt to match at this index, record where we are	if (repeats == 0) {		if (index == matchElems)			matchedLength = textLoc;		if (index < infoLimit) {			info[index].matchedSpan.start = textLoc;#ifdef TRACINGcerr << "info[" << index << "].matchedSpan.start = " << textLoc << "\n";#endif		}	}	// if we're at the end of the pattern, we have a match	if (index >= patternLength)		RETURN(matchYes);	if (index == 0 && repeats == 0)		sgrStack = 0;	// ensure this is cleared at start of pattern (shouldn't be necessary?)	{	// gcc complains about jumping past initializers (from RETURN above) without this		UInt32				mr;		const MatchElem&	m = pattern[index];		int					repeatMin = READ(m.flags.repeat) >> 4;		int					repeatMax = READ(m.flags.repeat) & 0x0f;		UInt8				type      = READ(m.flags.type);		bool				negate    = ((type & kMatchElem_Negate) != 0);		type = ((type & kMatchElem_NonLit) != 0)			? 
type & kMatchElem_TypeMask			: 0;		int		classIndex;		bool	matches;		UInt32	inChar;				// start of group: try each alternative in turn		if (type == kMatchElem_Type_BGroup) {			// try matching one of the alternatives in the group (again)			info[index].groupRepeats = repeats;			if (repeats < repeatMax) {				int	altIndex = index;				while (true) {					mr = match(altIndex + 1, 0, textLoc);					if (mr != matchNo)						RETURN(mr);					// failed, so step ahead to next alternative or end of group					altIndex += READ(pattern[altIndex].value.bgroup.dNext);					if ((READ(pattern[altIndex].flags.type) & kMatchElem_TypeMask) != kMatchElem_Type_OR)						break;				}			}			// if the group has matched enough times...			if (repeats >= repeatMin) {				// try to match following stuff#ifdef TRACINGcerr << "repeats >= repeatMin\n";#endif				mr = match(index + READ(m.value.bgroup.dAfter), 0, textLoc);				if (mr == matchYes) {					if (index < infoLimit) {						info[index].matchedSpan.limit = textLoc;#ifdef TRACINGcerr << "group returning matchYes; info[" << index << "].matchedSpan.limit = " << textLoc << "\n";#endif						// don't allow elements within the group to indicate matches beyond the span of the group itself						for (int i = index + READ(m.value.bgroup.dAfter) - 1; i > index; --i)							if (i < infoLimit) {								if (info[i].matchedSpan.start > textLoc)									info[i].matchedSpan.start = textLoc;								if (info[i].matchedSpan.limit > textLoc)									info[i].matchedSpan.limit = textLoc;							}					}				}				RETURN(mr);			}			// otherwise just backtrack			RETURN(matchNo);		}				// reached end of an alternative		else if (type == kMatchElem_Type_OR || type == kMatchElem_Type_EGroup) {			int	startIndex = index - READ(m.value.egroup.dStart);			mr = match(startIndex, info[startIndex].groupRepeats + 1, textLoc);			RETURN(mr);		}				// not a group, so we loop rather than recurse until optionality strikes		else {			// ensure that item matches at least repeatMin times			while (repeats 
< repeatMin) {				inChar = inputChar(textLoc);				if (inChar == kInvalidChar || inChar == kNeedMoreInput || inChar == kUnmappedChar)					RETURN(inChar);				matches = false;				switch (type) {					case 0:	// literal						matches = (READ(m.value.usv.data) & kUSVMask) == inChar;						break;										case kMatchElem_Type_Class:						classIndex = classMatch(READ(m.value.cls.index), inChar);						matches = (classIndex != -1);						if (matches && repeats == 0 && index < infoLimit)							info[index].classIndex = classIndex;						break;										case kMatchElem_Type_ANY:						matches = (inChar != kEndOfText);						break;										case kMatchElem_Type_EOS:						matches = (inChar == kEndOfText);						break;				}				matches = (matches != negate);				if (!matches)					RETURN(matchNo);				++repeats;				textLoc += direction;			}						if (index < infoLimit) {				info[index].matchedSpan.limit = textLoc;#ifdef TRACINGcerr << "info[" << index << "].matchedSpan.limit = " << textLoc << "\n";#endif			}			if (repeatMin == repeatMax) {				// no need to recurse, as no optionality				++index;				repeats = 0;				goto RESTART;			}						// try for another repeat if allowed			if (repeats < repeatMax) {				inChar = inputChar(textLoc);				if (inChar == kInvalidChar || inChar == kNeedMoreInput || inChar == kUnmappedChar)					RETURN(inChar);				matches = false;				switch (type) {					case 0:	// literal						matches = (READ(m.value.usv.data) & kUSVMask) == inChar;						break;										case kMatchElem_Type_Class:						classIndex = classMatch(READ(m.value.cls.index), inChar);						matches = (classIndex != -1);						if (matches && repeats == 0 && index < infoLimit)							info[index].classIndex = classIndex;						break;										case kMatchElem_Type_ANY:						matches = (inChar != kEndOfText);						break;										case kMatchElem_Type_EOS:						matches = (inChar == kEndOfText);						break;				}				matches = (matches != negate);				if (matches) {					mr = match(index, repeats + 1, textLoc + direction);				
	if (mr != matchNo)						RETURN(mr);				}			}						// otherwise try to match the remainder of the pattern			mr = match(index + 1, 0, textLoc);			RETURN(mr);		}	}_return_label:	if (_rval == matchNo)		if (index < infoLimit) {			info[index].matchedSpan.limit = textLoc;#ifdef TRACINGcerr << "rval == matchNo; setting info[" << index << "].matchedSpan.limit = " << textLoc << "\n";#endif		}#ifdef TRACINGcerr << "RETURN(" << (_rval == matchYes ? "matchYes" : "matchNo") << ")\n";#endif    return _rval;}#undef RETURN#ifdef TRACINGstatic voidprintMatchElem(const MatchElem& m){	string	rval;	char	buf[20];	if (m.flags.type & kMatchElem_Negate)		rval += "!";	if (m.flags.type & kMatchElem_NonLit) {		switch (m.flags.type & kMatchElem_TypeMask) {			case kMatchElem_Type_Class:				sprintf(buf, "[%d]", m.value.cls.index);				rval += buf;				break;			case kMatchElem_Type_BGroup:				rval += "(";				break;			case kMatchElem_Type_EGroup:				rval += ")";				break;			case kMatchElem_Type_OR:				rval += "|";				break;			case kMatchElem_Type_ANY:				rval += ".";				break;			case kMatchElem_Type_EOS:				rval += "#";				break;			case kMatchElem_Type_Copy:				rval += "@";				break;		}	}	else {		UInt32	v = m.value.usv.data & kUSVMask;		if (v >= ' ' && v < 0x7e) {			sprintf(buf, "'%c'", (char)v);			rval += buf;		}		else {			sprintf(buf, "0x%04X", (UInt32)v);			rval += buf;		}	}	if (!(m.flags.type & kMatchElem_NonLit) || (m.flags.type & kMatchElem_TypeMask) != kMatchElem_Type_BGroup)		switch (m.flags.repeat) {			case 0x01:				rval += "?";				break;			case 0x11:				break;			case 0x0F:				rval += "*";				break;			case 0x1F:				rval += "+";				break;			default:				sprintf(buf, "{%d,%d}", m.flags.repeat >> 4, m.flags.repeat & 0x0F);				rval += buf;				break;		}	cerr << rval;}static voidprintMatch(const StringRule* rule){	for (int i = 0; i < READ(rule->matchLength); ++i) {		cerr << " ";		printMatchElem(((MatchElem*)(rule + 1))[i]);//		cerr << "<" << i << ">";	}	if (READ(rule->preLength) > 0 || 
READ(rule->postLength) > 0) {		cerr << " /";		for (int i = READ(rule->preLength) - 1; i >= 0; --i) {			cerr << " ";			printMatchElem(((MatchElem*)(rule + 1))[READ(rule->matchLength) + READ(rule->postLength) + i]);		}		cerr << " _";		for (int i = 0; i < READ(rule->postLength); ++i) {			cerr << " ";			printMatchElem(((MatchElem*)(rule + 1))[READ(rule->matchLength) + i]);		}	}}static voidprintRep(const StringRule* rule){	const RepElem*	r = (const RepElem*)((const MatchElem*)(rule + 1) + rule->matchLength + rule->preLength + rule->postLength);	for (int i = 0; i < READ(rule->repLength); ++i, ++r) {		cerr << " ";		switch (READ(r->flags.type)) {			case kRepElem_Literal:				{					UInt32	v;					char	buf[20];					v = READ(r->value);					if (v >= ' ' && v <= 0x7e) {						sprintf(buf, "'%c'", v);						cerr << buf;					}					else {						sprintf(buf, "0x%04X", v);						cerr << buf;					}				}				break;			case kRepElem_Class:				cerr << "[" << (int)READ(r->flags.repClass) << "," << (int)READ(r->flags.matchIndex) << "]";				break;			case kRepElem_Copy:				cerr << "@" << (int)READ(r->flags.matchIndex);				break;			case kRepElem_Unmapped:				cerr << "?";				break;		}	}}#endifUInt32Pass::DoMapping(){	UInt32	inChar = inputChar(0);	if (inChar == kNeedMoreInput || inChar == kInvalidChar || inChar == kUnmappedChar)		return inChar;	if (inChar == kEndOfText) {		outputChar(kEndOfText);		return inChar;	}	matchedLength = 1;	const Lookup*	lookup;	if (bInputIsUnicode) {		// Unicode lookup		UInt16	charIndex = 0;		if ((const UInt8*)lookupBase == pageBase) {			// leave charIndex == 0 : pass with no rules		}		else {			UInt8	plane = inChar >> 16;			const UInt8*	pageMap = 0;			if (bSupplementaryChars) {				if ((plane < 17) && (READ(planeMap[plane]) != 0xff)) {					pageMap = (const UInt8*)(pageBase + 256 * READ(planeMap[plane]));					goto GOT_PAGE_MAP;				}			}			else if (plane == 0) {				pageMap = pageBase;			GOT_PAGE_MAP:				UInt8	page = (inChar >> 8) & 0xff;				if (READ(pageMap[page]) != 0xff) {				
	const UInt16*	charMapBase = (const UInt16*)(pageBase + 256 * numPageMaps);					const UInt16*	charMap = charMapBase + 256 * READ(pageMap[page]);					charIndex = READ(charMap[inChar & 0xff]);				}			}		}		lookup = lookupBase + charIndex;	}	else {		// byte-oriented lookup		if (pageBase != (const Byte*)tableHeader) {			// dbcsPage present			long	pageNumber = READ(pageBase[inChar]);			if (pageNumber == 0)				// not a valid DBCS lead byte				lookup = lookupBase + inChar;			else {				UInt32	nextChar = inputChar(1);				if (nextChar == kNeedMoreInput || nextChar == kInvalidChar || nextChar == kUnmappedChar)					return nextChar;				if (nextChar == kEndOfText)					lookup = lookupBase + inChar;				else {					lookup = lookupBase + pageNumber * 256 + nextChar;					if (READ(lookup->rules.type) == kLookupType_IllegalDBCS)						// illegal DBCS sequence; map lead byte alone						lookup = lookupBase + inChar;					else						matchedLength = 2;				}			}		}		else			// single-byte only			lookup = lookupBase + inChar;	}	UInt8	ruleType = READ(lookup->rules.type);	if (ruleType == kLookupType_StringRules || (ruleType & kLookupType_RuleTypeMask) == kLookupType_ExtStringRules) {		// process string rule list		const UInt32*	ruleList = (const UInt32*)stringListBase + READ(lookup->rules.ruleIndex);		bool			matched = false;		bool			allowInsertion = true;		int ruleCount = READ(lookup->rules.ruleCount);		if ((ruleType & kLookupType_RuleTypeMask) == kLookupType_ExtStringRules)			ruleCount += 256 * (ruleType & kLookupType_ExtRuleCountMask);		for ( ; ruleCount > 0; --ruleCount) {			const StringRule*	rule = (const StringRule*)(stringRuleData + READ(*ruleList));#ifdef TRACINGif (traceLevel > 0) {	cerr << "** trying match: ";	printMatch(rule);	cerr << "\n";}#endif			ruleList++;			matchElems = READ(rule->matchLength);			if (matchElems == 0 && allowInsertion == false)				continue;			patternLength = matchElems + READ(rule->postLength);			pattern = (MatchElem*)(rule + 1);	// point past the defined struct for 
the rule header			direction = 1;			infoLimit = matchElems;			// clear junk...			for (int i = 0; i < infoLimit; ++i)				info[i].matchedSpan.start = info[i].matchedSpan.limit = 0;                        			UInt32	mr = match(0, 0, 0);			if (mr == matchYes) {				if (matchedLength == 0 && allowInsertion == false)					continue;				pattern += patternLength;				patternLength = READ(rule->preLength);				if (patternLength > 0) {					direction = -1;					infoLimit = 0;					matchElems = -1;					mr = match(0, 0, -1);
💿 文件大小 7978 K
👤 上传用户 oujk123
📂 所属分类其他行业
🏷️ 相关标签

#text #middleware-layer #information #retrieval
⌨️ 快捷键说明

复制代码 Ctrl + C
搜索代码 Ctrl + F
全屏模式 F11
切换主题 Ctrl + Shift + D
显示快捷键 ?
增大字号 Ctrl + =
减小字号 Ctrl + -