📄 engine.cpp
字号:
} if (mr == matchYes) { // RULE MATCHED! execute it#ifdef TRACINGif (traceLevel > 0) { cerr << "** MATCHED:"; printMatch(rule); cerr << "\n"; cerr << "** RANGES:"; for (int i = 0; i < READ(rule->matchLength); ++i) { cerr << " <" << info[i].matchedSpan.start << ":" << info[i].matchedSpan.limit << ">"; } cerr << "\n"; cerr << "** REPLACEMENT:"; printRep(rule); cerr << "\n"; cerr << "** GENERATES:";}#endif const RepElem* r = (const RepElem*)(pattern + patternLength); for (int i = 0; i < READ(rule->repLength); ++i, ++r) {#ifdef TRACINGif (traceLevel > 0) cerr << " <";#endif switch (READ(r->flags.type)) { case kRepElem_Literal: outputChar(READ(r->value));#ifdef TRACINGif (traceLevel > 0) cerr << (int)READ(r->value);#endif break; case kRepElem_Class: { const MatchInfo& myInfo = info[READ(r->flags.matchIndex)]; if (myInfo.matchedSpan.start < myInfo.matchedSpan.limit) { outputChar(repClassMember(READ(r->flags.repClass), myInfo.classIndex));#ifdef TRACINGif (traceLevel > 0) cerr << (int)repClassMember(READ(r->flags.repClass), myInfo.classIndex);#endif } } break; case kRepElem_Copy: { const MatchInfo* myInfo = &info[READ(r->flags.matchIndex)]; for (int i = myInfo->matchedSpan.start; i < myInfo->matchedSpan.limit; ++i) { outputChar(inputChar(i));#ifdef TRACINGif (traceLevel > 0) cerr << (i > myInfo->matchedSpan.start ? "," : "") << (int)inputChar(i);#endif } } break; case kRepElem_Unmapped: if (bOutputIsUnicode == bInputIsUnicode) { outputChar(inChar);#ifdef TRACINGif (traceLevel > 0) cerr << (int)inChar;#endif } else { switch (converter->unmappedBehavior) { case kOptionsUnmapped_DontUseReplacementChar: return kUnmappedChar; case kOptionsUnmapped_UseReplacementCharWithWarning: converter->warningStatus |= kStatus_UsedReplacement; // fall through default: // case kOptionsUnmapped_UseReplacementCharSilently: outputChar(READ(tableHeader->replacementChar)); break; }#ifdef TRACINGif (traceLevel > 0) cerr << (int)READ(tableHeader->replacementChar);#endif } break; }#ifdef TRACINGif (traceLevel > 0) cerr << ">";#endif }#ifdef TRACINGif (traceLevel > 0) cerr << endl;#endif if (matchedLength > 0) { // we've matched the current input character, so break the loop matched = true; break; } else { // must have been an insertion (or null!) rule, so skip any further insertion rules allowInsertion = false; } } else if (mr != matchNo) { return mr; } } else if (mr != matchNo) { return mr; } } if (!matched) { // no rule matched the current input char, so we simulate a default "Unmapped" lookup if (bOutputIsUnicode == bInputIsUnicode) // B->B or U->U simply copies the input to the output outputChar(inChar); else { // B->U or U->B uses the replacement char or fails, depending on options switch (converter->unmappedBehavior) { case kOptionsUnmapped_DontUseReplacementChar: return kUnmappedChar; case kOptionsUnmapped_UseReplacementCharWithWarning: converter->warningStatus |= kStatus_UsedReplacement; // fall through default: // case kOptionsUnmapped_UseReplacementCharSilently: outputChar(READ(tableHeader->replacementChar)); break; } } matchedLength = 1; } } else if (ruleType == kLookupType_Unmapped) { if (bOutputIsUnicode == bInputIsUnicode) outputChar(inChar); else { switch (converter->unmappedBehavior) { case kOptionsUnmapped_DontUseReplacementChar: return kUnmappedChar; case kOptionsUnmapped_UseReplacementCharWithWarning: converter->warningStatus |= kStatus_UsedReplacement; // fall through default: // case kOptionsUnmapped_UseReplacementCharSilently: outputChar(READ(tableHeader->replacementChar)); break; } } } else { // direct character output if (bOutputIsUnicode) { UInt32 usv = READ(lookup->usv); if (usv <= 0x0010ffff) outputChar(usv); } else { for (int i = 0; i < READ(lookup->bytes.count); ++i) outputChar(READ(lookup->bytes.data[i])); } } advanceInput(matchedLength); return 0;}#pragma mark --- class Converter ---Converter::Converter(const Byte* inTable, UInt32 inTableSize, bool inForward, UInt16 inForm, UInt16 outForm) : table(0) , finalStage(0) , forward(inForward) , inputForm(inForm & kForm_EncodingFormMask) , outputForm(outForm & kForm_EncodingFormMask) , savedCount(0) , pendingOutputChar(kInvalidChar) , status(kStatus_NoError) , warningStatus(0){ finalStage = this; UInt16 normForm = 0; if (inTable != 0) { const FileHeader* fh = (const FileHeader*)inTable;#ifndef NO_ZLIB if (READ(fh->type) == kMagicNumberCmp) { // the table is compressed; allocate a new buffer and decompress unsigned long uncompressedLen = READ(fh->version); table = (Byte*)malloc(uncompressedLen); if (table == 0) { status = kStatus_OutOfMemory; return; } int result = uncompress(table, &uncompressedLen, inTable + 2 * sizeof(UInt32), inTableSize - 2 * sizeof(UInt32)); if (result != Z_OK) { status = kStatus_InvalidMapping; return; } fh = (const FileHeader*)table; }#endif if (READ(fh->type) != kMagicNumber) { status = kStatus_InvalidMapping; return; } if ((READ(fh->version) & 0xFFFF0000) > (kCurrentFileVersion & 0xFFFF0000)) { status = kStatus_BadMappingVersion; return; } if (table == 0) { table = (Byte*)malloc(inTableSize); if (table == 0) { status = kStatus_OutOfMemory; return; } memcpy(table, inTable, inTableSize); } fh = (const FileHeader*)table; const UInt32* nameOffsets = (const UInt32*)(table + sizeof(FileHeader)); const UInt32* tableBase = nameOffsets + READ(fh->numNames); UInt32 numTables = READ(fh->numFwdTables); if (!forward) { tableBase += numTables; numTables = READ(fh->numRevTables); } // check that the outputForm matches the output of the mapping UInt32 targetFlags = forward ? READ(fh->formFlagsRHS) : READ(fh->formFlagsLHS); if ((targetFlags & kFlags_Unicode) != 0) { if (outputForm < kForm_UTF8 || outputForm > kForm_UTF32LE) { status = kStatus_InvalidForm; return; } } else { if (outputForm != kForm_Bytes) { status = kStatus_InvalidForm; return; } } // if converting from Unicode, prefix a Normalizer if the mapping wants it UInt32 sourceFlags = forward ? READ(fh->formFlagsLHS) : READ(fh->formFlagsRHS); if ((sourceFlags & kFlags_Unicode) != 0) { // check that the inputForm is a Unicode form if (inputForm < kForm_UTF8 || inputForm > kForm_UTF32LE) { status = kStatus_InvalidForm; return; } Stage* n = 0; if ((sourceFlags & kFlags_ExpectsNFD) != 0) { n = new Normalizer(false); normForm = kForm_NFD; } else if ((sourceFlags & kFlags_ExpectsNFC) != 0) { n = new Normalizer(true); normForm = kForm_NFC; } if (n != 0) { n->prevStage = finalStage; finalStage = n; } } else { // check that the inputForm is bytes if (inputForm != kForm_Bytes) { status = kStatus_InvalidForm; return; } } // create the processing pipeline for (UInt32 i = 0; i < numTables; ++i) { const TableHeader* t = (const TableHeader*)(table + READ(tableBase[i])); Stage* p = 0; switch (READ(t->type)) { case kTableType_BB: case kTableType_BU: case kTableType_UU: case kTableType_UB: p = new Pass(t, this); normForm = 0; break; case kTableType_NFC: p = new Normalizer(true); normForm = kForm_NFC; break; case kTableType_NFD: p = new Normalizer(false); normForm = kForm_NFD; break; } if (p == 0) { status = kStatus_InvalidMapping; return; } p->prevStage = finalStage; finalStage = p; } } else { // No mapping table provided, so we're mapping Unicode->Unicode, // possibly doing normalization and/or encoding form change. // Just check here that the input and output encoding forms are valid. if (inputForm < kForm_UTF8 || inputForm > kForm_UTF32LE || outputForm < kForm_UTF8 || outputForm > kForm_UTF32LE) { status = kStatus_InvalidForm; return; } } // if converting to Unicode, add a Normalizer pass at the end if requested if (outputForm >= kForm_UTF8 && outputForm <= kForm_UTF32LE) { Stage* n = 0; if ((outForm & kForm_NormalizationMask) == kForm_NFD && normForm != kForm_NFD) n = new Normalizer(false); else if ((outForm & kForm_NormalizationMask) == kForm_NFC && normForm != kForm_NFC) n = new Normalizer(true); if (n != 0) { n->prevStage = finalStage; finalStage = n; } }}Converter::~Converter(){ if (finalStage != this) delete finalStage; if (table != 0) free(table); table = 0;}static UInt32offsetsFromUTF8[6] = { 0x00000000UL, 0x00003080UL, 0x000E2080UL, 0x03C82080UL, 0xFA082080UL, 0x82082080UL};static UInt8bytesFromUTF8[256] = { 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, 2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2, 3,3,3,3,3,3,3,3,4,4,4,4,5,5,5,5};static UInt8firstByteMark[7] = { 0x00, 0x00, 0xC0, 0xE0, 0xF0, 0xF8, 0xFC};const int halfShift = 10;const UInt32 halfBase = 0x0010000UL;const UInt32 halfMask = 0x3FFUL;const UInt32 kSurrogateHighStart = 0xD800UL;const UInt32 kSurrogateHighEnd = 0xDBFFUL;const UInt32 kSurrogateLowStart = 0xDC00UL;const UInt32 kSurrogateLowEnd = 0xDFFFUL;const UInt32 byteMask = 0x000000BFUL;const UInt32 byteMark = 0x00000080UL;UInt32Converter::getChar(){ if (dataPtr >= savedCount + dataLen) return inputComplete ? kEndOfText : kNeedMoreInput; if (inputForm == kForm_Bytes) return data[dataPtr++]; return _getCharFn();}UInt32Converter::_getCharFn(){// This is ONLY called from the public getChar() function, which has already done these tests://// if (dataPtr >= dataLen)// return inputComplete ? kEndOfText : kNeedMoreInput;// // if (inputForm == kForm_Bytes)// return data[dataPtr++]; UInt32 rval = 0; if (savedCount > 0) { // the less efficient version is only called if really needed rval = _getCharWithSavedBytes(); return rval; }#define CHECK_AVAIL(x) \ if (dataPtr + (x) > dataLen) { \ if (inputComplete) \ return kInvalidChar; \ else { \ _savePendingBytes(); \ return kNeedMoreInput; \ } \ } switch (inputForm) { case kForm_UTF8: { UInt16 extraBytes = bytesFromUTF8[data[dataPtr]]; CHECK_AVAIL(extraBytes + 1); switch (extraBytes) { // note: code falls through cases! case 5: rval += data[dataPtr++]; rval <<= 6; case 4: rval += data[dataPtr++]; rval <<= 6; case 3: rval += data[dataPtr++]; rval <<= 6; case 2: rval += data[dataPtr++]; rval <<= 6; case 1: rval += data[dataPtr++]; rval <<= 6; case 0: rval += data[dataPtr++]; }; rval -= offsetsFromUTF8[extraBytes]; } break; case kForm_UTF16BE: CHECK_AVAIL(2); rval = data[dataPtr++] << 8; rval += data[dataPtr++]; if (rval >= kSurrogateHighStart && rval <= kSurrogateHighEnd) { // check that 2 more bytes are available dataPtr -= 2; CHECK_AVAIL(4); // if we don't have 4 bytes available, this will return with kNeedMoreInput, // and we'll retry from the beginning of the high surrogate once more is available dataPtr += 2; UInt32 low = data[dataPtr++] << 8; low += data[dataPtr++]; rval = ((rval - kSurrogateHighStart) << halfShift) + (low - kSurrogateLowStart) + halfBase; } break; case kForm_UTF16LE: CHECK_AVAIL(2); rval = data[dataPtr++]; rval += data[dataPtr++] << 8; if (rval >= kSurrogateHighStart && rval <= kSurrogateHighEnd) { dataPtr -= 2; CHECK_AVAIL(4); dataPtr += 2; UInt32 low = data[dataPtr++]; low += data[dataPtr++] << 8; rval = ((rval - kSurrogateHighStart) << halfShift) + (low - kSurrogateLowStart) + halfBase; } break; case kForm_UTF32BE: CHECK_AVAIL(4); rval = data[dataPtr++] << 24; rval += data[dataPtr++] << 16; rval += data[dataPtr++] << 8; rval += data[dataPtr++]; break; case kForm_UTF32LE: CHECK_AVAIL(4); rval = data[dataPtr++]; rval += data[dataPtr++] << 8; rval += data[dataPtr++] << 16; rval += data[dataPtr++] << 24; break; } return rval;}UInt32Converter::_getCharWithSavedBytes() // This is a version of _getCharFn() that respects "saved bytes"; // only call this if (savedCount > 0) because it has additional overhead for every byte read{ UInt32 rval = 0;#undef CHECK_AVAIL#define CHECK_AVAIL(x) \ if (dataPtr + (x) > savedCount + dataLen) { \ if (inputComplete) \ return kInvalidChar; \ else { \ _savePendingBytes(); \ return kNeedMoreInput; \ } \ }#define DATA(x) (x < savedCount ? savedBytes[x] : data[x - savedCount]) switch (inputForm) { case kForm_UTF8: { UInt16 extraBytes = bytesFromUTF8[DATA(dataPtr)]; CHECK_AVAIL(extraBytes + 1); switch (extraBytes) { // note: code falls through cases! case 5: rval += DATA(dataPtr); dataPtr++; rval <<= 6; case 4: rval += DATA(dataPtr); dataPtr++; rval <<= 6; case 3: rval += DATA(dataPtr); dataPtr++; rval <<= 6; case 2: rval += DATA(dataPtr); dataPtr++; rval <<= 6; case 1: rval += DATA(dataPtr); dataPtr++; rval <<= 6; case 0: rval += DATA(dataPtr); dataPtr++; }; rval -= offsetsFromUTF8[extraBytes]; } break; case kForm_UTF16BE: CHECK_AVAIL(2); rval = DATA(dataPtr) << 8; dataPtr++; rval += DATA(dataPtr); dataPtr++; if (rval >= kSurrogateHighStart && rval <= kSurrogateHighEnd) { dataPtr -= 2; CHECK_AVAIL(4); dataPtr += 2; UInt32 low = DATA(dataPtr) << 8; dataPtr++; low += DATA(dataPtr); dataPtr++; rval = ((rval - kSurrogateHighStart) << halfShift) + (low - kSurrogateLowStart) + halfBase; } break; case kForm_UTF16LE: CHECK_AVAIL(2); rval = DATA(dataPtr); dataPtr++; rval += DATA(dataPtr) << 8; dataPtr++; if (rval >= kSurrogateHighStart && rval <= kSurrogateHighEnd) { dataPtr -= 2; CHECK_AVAIL(4); dataPtr += 2; UInt32 low = DATA(dataPtr); dataPtr++; low += DATA(dataPtr) << 8; dataPtr++; rval = ((rval - kSurrogateHighStart) << halfShift) + (low - kSurrogateLowStart) + halfBase; } break;
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -