📄 engine.cpp

📁 Emdros is a text database middleware-layer aimed at storage and retrieval of "text plus information about that text."
💻 CPP
📖 第 1 页 / 共 4 页
字号:
				}				if (mr == matchYes) {					// RULE MATCHED! execute it#ifdef TRACINGif (traceLevel > 0) {	cerr << "** MATCHED:";	printMatch(rule);	cerr << "\n";		cerr << "** RANGES:";	for (int i = 0; i < READ(rule->matchLength); ++i) {		cerr << " <" << info[i].matchedSpan.start << ":" << info[i].matchedSpan.limit << ">";	}	cerr << "\n";		cerr << "** REPLACEMENT:";	printRep(rule);	cerr << "\n";	cerr << "** GENERATES:";}#endif					const RepElem*	r = (const RepElem*)(pattern + patternLength);					for (int i = 0; i < READ(rule->repLength); ++i, ++r) {#ifdef TRACINGif (traceLevel > 0)	cerr << " <";#endif						switch (READ(r->flags.type)) {							case kRepElem_Literal:								outputChar(READ(r->value));#ifdef TRACINGif (traceLevel > 0)	cerr << (int)READ(r->value);#endif								break;							case kRepElem_Class:								{									const MatchInfo&	myInfo = info[READ(r->flags.matchIndex)];									if (myInfo.matchedSpan.start < myInfo.matchedSpan.limit) {										outputChar(repClassMember(READ(r->flags.repClass), myInfo.classIndex));#ifdef TRACINGif (traceLevel > 0)	cerr << (int)repClassMember(READ(r->flags.repClass), myInfo.classIndex);#endif									}								}								break;							case kRepElem_Copy:								{									const MatchInfo*	myInfo = &info[READ(r->flags.matchIndex)];									for (int i = myInfo->matchedSpan.start; i < myInfo->matchedSpan.limit; ++i) {										outputChar(inputChar(i));#ifdef TRACINGif (traceLevel > 0)	cerr << (i > myInfo->matchedSpan.start ? 
"," : "") << (int)inputChar(i);#endif									}								}								break;							case kRepElem_Unmapped:								if (bOutputIsUnicode == bInputIsUnicode) {									outputChar(inChar);#ifdef TRACINGif (traceLevel > 0)	cerr << (int)inChar;#endif								}								else {									switch (converter->unmappedBehavior) {										case kOptionsUnmapped_DontUseReplacementChar:											return kUnmappedChar;										case kOptionsUnmapped_UseReplacementCharWithWarning:											converter->warningStatus |= kStatus_UsedReplacement;											// fall through										default:	// case kOptionsUnmapped_UseReplacementCharSilently:											outputChar(READ(tableHeader->replacementChar));											break;									}#ifdef TRACINGif (traceLevel > 0)	cerr << (int)READ(tableHeader->replacementChar);#endif								}								break;						}#ifdef TRACINGif (traceLevel > 0)	cerr << ">";#endif					}#ifdef TRACINGif (traceLevel > 0)	cerr << endl;#endif					if (matchedLength > 0) {						// we've matched the current input character, so break the loop						matched = true;						break;					}					else {						// must have been an insertion (or null!) 
rule, so skip any further insertion rules						allowInsertion = false;					}				}				else if (mr != matchNo) {					return mr;				}			}			else if (mr != matchNo) {				return mr;			}		}		if (!matched) {			// no rule matched the current input char, so we simulate a default "Unmapped" lookup			if (bOutputIsUnicode == bInputIsUnicode)				// B->B or U->U simply copies the input to the output				outputChar(inChar);			else {				// B->U or U->B uses the replacement char or fails, depending on options				switch (converter->unmappedBehavior) {					case kOptionsUnmapped_DontUseReplacementChar:						return kUnmappedChar;					case kOptionsUnmapped_UseReplacementCharWithWarning:						converter->warningStatus |= kStatus_UsedReplacement;						// fall through					default:	// case kOptionsUnmapped_UseReplacementCharSilently:						outputChar(READ(tableHeader->replacementChar));						break;				}			}			matchedLength = 1;		}	}	else if (ruleType == kLookupType_Unmapped) {		if (bOutputIsUnicode == bInputIsUnicode)			outputChar(inChar);		else {			switch (converter->unmappedBehavior) {				case kOptionsUnmapped_DontUseReplacementChar:					return kUnmappedChar;				case kOptionsUnmapped_UseReplacementCharWithWarning:					converter->warningStatus |= kStatus_UsedReplacement;					// fall through				default:	// case kOptionsUnmapped_UseReplacementCharSilently:					outputChar(READ(tableHeader->replacementChar));					break;			}		}	}	else {		// direct character output		if (bOutputIsUnicode) {			UInt32	usv = READ(lookup->usv);			if (usv <= 0x0010ffff)				outputChar(usv);		}		else {			for (int i = 0; i < READ(lookup->bytes.count); ++i)				outputChar(READ(lookup->bytes.data[i]));		}	}		advanceInput(matchedLength);	return 0;}#pragma mark --- class Converter ---Converter::Converter(const Byte* inTable, UInt32 inTableSize, bool inForward,						UInt16 inForm, UInt16 outForm)	: table(0)	, finalStage(0)	, forward(inForward)	, inputForm(inForm & kForm_EncodingFormMask)	, outputForm(outForm & 
kForm_EncodingFormMask)	, savedCount(0)	, pendingOutputChar(kInvalidChar)	, status(kStatus_NoError)	, warningStatus(0){	finalStage = this;	UInt16	normForm = 0;	if (inTable != 0) {		const FileHeader*	fh = (const FileHeader*)inTable;#ifndef NO_ZLIB		if (READ(fh->type) == kMagicNumberCmp) {			// the table is compressed; allocate a new buffer and decompress			unsigned long	uncompressedLen = READ(fh->version);			table = (Byte*)malloc(uncompressedLen);			if (table == 0) {				status = kStatus_OutOfMemory;				return;			}			int	result = uncompress(table, &uncompressedLen, inTable + 2 * sizeof(UInt32), inTableSize - 2 * sizeof(UInt32));			if (result != Z_OK) {				status = kStatus_InvalidMapping;				return;			}			fh = (const FileHeader*)table;		}#endif				if (READ(fh->type) != kMagicNumber) {			status = kStatus_InvalidMapping;			return;		}		if ((READ(fh->version) & 0xFFFF0000) > (kCurrentFileVersion & 0xFFFF0000)) {			status = kStatus_BadMappingVersion;			return;		}		if (table == 0) {			table = (Byte*)malloc(inTableSize);			if (table == 0) {				status = kStatus_OutOfMemory;				return;			}			memcpy(table, inTable, inTableSize);		}		fh = (const FileHeader*)table;		const UInt32*	nameOffsets = (const UInt32*)(table + sizeof(FileHeader));		const UInt32*	tableBase = nameOffsets + READ(fh->numNames);		UInt32			numTables = READ(fh->numFwdTables);		if (!forward) {			tableBase += numTables;			numTables = READ(fh->numRevTables);		}				// check that the outputForm matches the output of the mapping		UInt32	targetFlags = forward ? READ(fh->formFlagsRHS) : READ(fh->formFlagsLHS);		if ((targetFlags & kFlags_Unicode) != 0) {			if (outputForm < kForm_UTF8 || outputForm > kForm_UTF32LE) {				status = kStatus_InvalidForm;				return;			}		}		else {			if (outputForm != kForm_Bytes) {				status = kStatus_InvalidForm;				return;			}		}				// if converting from Unicode, prefix a Normalizer if the mapping wants it		UInt32	sourceFlags = forward ? 
READ(fh->formFlagsLHS) : READ(fh->formFlagsRHS);		if ((sourceFlags & kFlags_Unicode) != 0) {			// check that the inputForm is a Unicode form			if (inputForm < kForm_UTF8 || inputForm > kForm_UTF32LE) {				status = kStatus_InvalidForm;				return;			}			Stage*	n = 0;			if ((sourceFlags & kFlags_ExpectsNFD) != 0) {				n = new Normalizer(false);				normForm = kForm_NFD;			}			else if ((sourceFlags & kFlags_ExpectsNFC) != 0) {				n = new Normalizer(true);				normForm = kForm_NFC;			}			if (n != 0) {				n->prevStage = finalStage;				finalStage = n;			}		}		else {			// check that the inputForm is bytes			if (inputForm != kForm_Bytes) {				status = kStatus_InvalidForm;				return;			}		}		// create the processing pipeline		for (UInt32 i = 0; i < numTables; ++i) {			const TableHeader*	t = (const TableHeader*)(table + READ(tableBase[i]));			Stage*	p = 0;			switch (READ(t->type)) {				case kTableType_BB:				case kTableType_BU:				case kTableType_UU:				case kTableType_UB:					p = new Pass(t, this);					normForm = 0;					break;				case kTableType_NFC:					p = new Normalizer(true);					normForm = kForm_NFC;					break;				case kTableType_NFD:					p = new Normalizer(false);					normForm = kForm_NFD;					break;			}			if (p == 0) {				status = kStatus_InvalidMapping;				return;			}			p->prevStage = finalStage;			finalStage = p;		}	}	else {		// No mapping table provided, so we're mapping Unicode->Unicode,		// possibly doing normalization and/or encoding form change.		// Just check here that the input and output encoding forms are valid.		
if (inputForm < kForm_UTF8 || inputForm > kForm_UTF32LE || outputForm < kForm_UTF8 || outputForm > kForm_UTF32LE) {			status = kStatus_InvalidForm;			return;		}	}	// if converting to Unicode, add a Normalizer pass at the end if requested	if (outputForm >= kForm_UTF8 && outputForm <= kForm_UTF32LE) {		Stage*	n = 0;		if ((outForm & kForm_NormalizationMask) == kForm_NFD && normForm != kForm_NFD)			n = new Normalizer(false);		else if ((outForm & kForm_NormalizationMask) == kForm_NFC && normForm != kForm_NFC)			n = new Normalizer(true);		if (n != 0) {			n->prevStage = finalStage;			finalStage = n;		}	}}Converter::~Converter(){	if (finalStage != this)		delete finalStage;	if (table != 0)		free(table);	table = 0;}static UInt32offsetsFromUTF8[6] =	{	0x00000000UL,	0x00003080UL,	0x000E2080UL, 	0x03C82080UL,	0xFA082080UL,	0x82082080UL};static UInt8bytesFromUTF8[256] = {	0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,	0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,	0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,	0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,	0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,	0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,	1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,	2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2, 3,3,3,3,3,3,3,3,4,4,4,4,5,5,5,5};static UInt8firstByteMark[7] = {	0x00, 0x00, 0xC0, 0xE0, 0xF0, 0xF8, 0xFC};const int halfShift					= 10;const UInt32 halfBase				= 0x0010000UL;const UInt32 halfMask				= 0x3FFUL;const UInt32 kSurrogateHighStart	= 0xD800UL;const UInt32 kSurrogateHighEnd		= 0xDBFFUL;const UInt32 kSurrogateLowStart		= 0xDC00UL;const UInt32 kSurrogateLowEnd		= 0xDFFFUL;const UInt32 byteMask				= 0x000000BFUL;const UInt32 byteMark				= 0x00000080UL;UInt32Converter::getChar(){	if (dataPtr >= savedCount + dataLen)		return inputComplete ? 
kEndOfText : kNeedMoreInput;	if (inputForm == kForm_Bytes)		return data[dataPtr++];	return _getCharFn();}UInt32Converter::_getCharFn(){//	This is ONLY called from the public getChar() function, which has already done these tests:////	if (dataPtr >= dataLen)//		return inputComplete ? kEndOfText : kNeedMoreInput;//	//	if (inputForm == kForm_Bytes)//		return data[dataPtr++];		UInt32	rval = 0;	if (savedCount > 0) {	// the less efficient version is only called if really needed		rval = _getCharWithSavedBytes();		return rval;	}#define CHECK_AVAIL(x)				\	if (dataPtr + (x) > dataLen) {	\		if (inputComplete)			\			return kInvalidChar;	\		else {						\			_savePendingBytes();	\			return kNeedMoreInput;	\		}							\	}		switch (inputForm) {		case kForm_UTF8:			{				UInt16 extraBytes = bytesFromUTF8[data[dataPtr]];				CHECK_AVAIL(extraBytes + 1);				switch (extraBytes) {	// note: code falls through cases!					case 5:	rval += data[dataPtr++]; rval <<= 6;					case 4:	rval += data[dataPtr++]; rval <<= 6;					case 3:	rval += data[dataPtr++]; rval <<= 6;					case 2:	rval += data[dataPtr++]; rval <<= 6;					case 1:	rval += data[dataPtr++]; rval <<= 6;					case 0:	rval += data[dataPtr++];				};				rval -= offsetsFromUTF8[extraBytes];			}			break;		case kForm_UTF16BE:			CHECK_AVAIL(2);			rval = data[dataPtr++] << 8;			rval += data[dataPtr++];			if (rval >= kSurrogateHighStart && rval <= kSurrogateHighEnd) {				// check that 2 more bytes are available				dataPtr -= 2;				CHECK_AVAIL(4);	// if we don't have 4 bytes available, this will return with kNeedMoreInput,								// and we'll retry from the beginning of the high surrogate once more is available				dataPtr += 2;				UInt32	low = data[dataPtr++] << 8;				low += data[dataPtr++];				rval = ((rval - kSurrogateHighStart) << halfShift) + (low - kSurrogateLowStart) + halfBase;			}			break;		case kForm_UTF16LE:			CHECK_AVAIL(2);			rval = data[dataPtr++];			rval += data[dataPtr++] << 8;			if (rval >= kSurrogateHighStart && rval <= 
kSurrogateHighEnd) {				dataPtr -= 2;				CHECK_AVAIL(4);				dataPtr += 2;				UInt32	low = data[dataPtr++];				low += data[dataPtr++] << 8;				rval = ((rval - kSurrogateHighStart) << halfShift) + (low - kSurrogateLowStart) + halfBase;			}			break;		case kForm_UTF32BE:			CHECK_AVAIL(4);			rval = data[dataPtr++] << 24;			rval += data[dataPtr++] << 16;			rval += data[dataPtr++] << 8;			rval += data[dataPtr++];			break;		case kForm_UTF32LE:			CHECK_AVAIL(4);			rval = data[dataPtr++];			rval += data[dataPtr++] << 8;			rval += data[dataPtr++] << 16;			rval += data[dataPtr++] << 24;			break;	}	return rval;}UInt32Converter::_getCharWithSavedBytes()	// This is a version of _getCharFn() that respects "saved bytes";	// only call this if (savedCount > 0) because it has additional overhead for every byte read{	UInt32	rval = 0;#undef CHECK_AVAIL#define CHECK_AVAIL(x)							\	if (dataPtr + (x) > savedCount + dataLen) {	\		if (inputComplete)						\			return kInvalidChar;				\		else {									\			_savePendingBytes();				\			return kNeedMoreInput;				\		}										\	}#define DATA(x)	(x < savedCount ? savedBytes[x] : data[x - savedCount])	switch (inputForm) {		case kForm_UTF8:			{				UInt16 extraBytes = bytesFromUTF8[DATA(dataPtr)];				CHECK_AVAIL(extraBytes + 1);				switch (extraBytes) {	// note: code falls through cases!					
case 5:	rval += DATA(dataPtr); dataPtr++; rval <<= 6;					case 4:	rval += DATA(dataPtr); dataPtr++; rval <<= 6;					case 3:	rval += DATA(dataPtr); dataPtr++; rval <<= 6;					case 2:	rval += DATA(dataPtr); dataPtr++; rval <<= 6;					case 1:	rval += DATA(dataPtr); dataPtr++; rval <<= 6;					case 0:	rval += DATA(dataPtr); dataPtr++;				};				rval -= offsetsFromUTF8[extraBytes];			}			break;		case kForm_UTF16BE:			CHECK_AVAIL(2);			rval = DATA(dataPtr) << 8; dataPtr++;			rval += DATA(dataPtr); dataPtr++;			if (rval >= kSurrogateHighStart && rval <= kSurrogateHighEnd) {				dataPtr -= 2;				CHECK_AVAIL(4);				dataPtr += 2;				UInt32	low = DATA(dataPtr) << 8; dataPtr++;				low += DATA(dataPtr); dataPtr++;				rval = ((rval - kSurrogateHighStart) << halfShift) + (low - kSurrogateLowStart) + halfBase;			}			break;		case kForm_UTF16LE:			CHECK_AVAIL(2);			rval = DATA(dataPtr); dataPtr++;			rval += DATA(dataPtr) << 8; dataPtr++;			if (rval >= kSurrogateHighStart && rval <= kSurrogateHighEnd) {				dataPtr -= 2;				CHECK_AVAIL(4);				dataPtr += 2;				UInt32	low = DATA(dataPtr); dataPtr++;				low += DATA(dataPtr) << 8; dataPtr++;				rval = ((rval - kSurrogateHighStart) << halfShift) + (low - kSurrogateLowStart) + halfBase;			}			break;
💿 文件大小 7978 K
👤 上传用户 oujk123
📂 所属分类其他行业
🏷️ 相关标签

#text #middleware-layer #information #retrieval
⌨️ 快捷键说明

复制代码 Ctrl + C
搜索代码 Ctrl + F
全屏模式 F11
切换主题 Ctrl + Shift + D
显示快捷键 ?
增大字号 Ctrl + =
减小字号 Ctrl + -