📄 engine.cpp
字号:
/*------------------------------------------------------------------------Copyright (C) 2002-2004 SIL International. All rights reserved.Distributable under the terms of either the Common Public License or theGNU Lesser General Public License, as specified in the LICENSING.txt file.File: Engine.cpResponsibility: Jonathan KewLast reviewed: Not yet.Description: Implements the TECkit conversion engine.-------------------------------------------------------------------------*//* 2008-01-23 jk revised endian-ness stuff to allow Universal build 2006-06-02 jk added support for extended string rules (>255 per initial char) 2006-06-02 jk fixed bug handling passes with no mapping rules 2006-01-12 jk remove multi-char constants, use kTableType_XXX from TECkit_Format.h 2005-07-19 jk revised to use WORDS_BIGENDIAN conditional, config.h 2005-05-06 jk patched match() to forget matches within groups if we backtrack out 2004-03-19 jk rewrote match() to fix group/repeat bugs and be more efficient 2004-03-12 jk finished updating for version 2.1 with ...Opt APIs*///#define TRACING 1//#ifdef HAVE_CONFIG_H//# include "config.h" /* a Unix-ish setup where we have config.h available *///#endif#if (defined(__dest_os) && (__dest_os == __win32_os)) || defined(WIN32) /* Windows target: little-endian */# undef WORDS_BIGENDIAN#endif#ifdef __APPLE__#include <TargetConditionals.h>#endif#if defined(TARGET_RT_BIG_ENDIAN) /* the CodeWarrior prefix files or Apple TargetConditionals.h sets this */# if TARGET_RT_BIG_ENDIAN# undef WORDS_BIGENDIAN# define WORDS_BIGENDIAN 1# else# undef WORDS_BIGENDIAN# endif#endif#if (defined(__dest_os) && (__dest_os == __win32_os)) || defined(WIN32)# define WIN32_LEAN_AND_MEAN# define NOSERVICE# define NOMCX# include <Windows.h> BOOL WINAPI DllMain(HINSTANCE /*hInst*/, DWORD /*wDataSeg*/, LPVOID /*lpReserved*/) { return true; }#endif#include "Engine.h"#ifdef TRACING#include <iostream>int traceLevel = 1;#endif#include <cstdlib>#include <cstring>#include <algorithm>#ifndef NO_ZLIB#include "zlib.h"#endifusing namespace std;/* we apply READ to values read from the compiled table, to provide byte-swapping where needed */static inline UInt8READ(const UInt8 p){ return p;}static inline UInt16READ(const UInt16 p){#ifdef WORDS_BIGENDIAN return p;#else return (p >> 8) + (p << 8);#endif}static inline UInt32READ(const UInt32 p){#ifdef WORDS_BIGENDIAN return p;#else return (p >> 24) + ((p >> 8) & 0x0000ff00) + ((p << 8) & 0x00ff0000) + (p << 24);#endif}#pragma mark --- class Stage ---Stage::Stage() : oBuffer(0) , oBufSize(0) , oBufEnd(0) , oBufPtr(0) , prevStage(0){}Stage::~Stage(){ if (prevStage && prevStage->prevStage) delete prevStage;}UInt32Stage::lookaheadCount() const{ return 0;}#pragma mark --- class Normalizer ---#include "NormalizationData.c"Normalizer::Normalizer(bool compose) : prevCombClass(0) , oBufSafe(0) , bCompose(compose){ oBufSize = 256; oBuffer = new UInt32[oBufSize];}Normalizer::~Normalizer(){ delete[] oBuffer;}/* constants for algorithmic Hangul decomposition */#define SBase 0xAC00#define LBase 0x1100#define VBase 0x1161#define TBase 0x11A7#define LCount 19#define VCount 21#define TCount 28#define NCount (VCount * TCount)#define SCount (LCount * NCount)UInt32Normalizer::process(){ UInt32 inChar = prevStage->getChar(); if (inChar == kNeedMoreInput || inChar == kInvalidChar || inChar == kUnmappedChar) return inChar; if (inChar == kEndOfText) { generateChar(kEndOfText); return inChar; } UInt32 SIndex = inChar - SBase; if (SIndex >= SCount) decompose(inChar); else { generateChar(LBase + SIndex / NCount); generateChar(VBase + (SIndex % NCount) / TCount); UInt32 T = SIndex % TCount; if (T != 0) generateChar(TBase + T); } return 0;}voidNormalizer::Reset(){ oBufPtr = oBufEnd = 0; prevCombClass = 0; oBufSafe = 0;}voidNormalizer::decompose(UInt32 c){ UInt32 prefix = decomposeOne(c); if (prefix != 0xffff) decompose(prefix); if (c != 0xffff) generateChar(c);}UInt32Normalizer::decomposeOne(UInt32& c){ UInt32 plane = c >> 16; UInt32 page = (c >> 8) & 0xff; UInt32 ch = c & 0xff; UInt16 charIndex = dcCharIndex[dcPageMaps[dcPlaneMap[plane]][page]][ch]; if (charIndex == 0) return 0xffff; c = dcDecomposition[charIndex][1]; return dcDecomposition[charIndex][0];}voidNormalizer::generateChar(UInt32 c){ int combClass = 0; if (c != kEndOfText) { UInt32 plane = c >> 16; UInt32 page = (c >> 8) & 0xff; UInt32 ch = c & 0xff; combClass = ccCharClass[ccPageMaps[ccPlaneMap[plane]][page]][ch]; } if (combClass != 0) { // combiners are always buffered for sorting and possible composition if (prevCombClass <= combClass) { appendChar(c); prevCombClass = combClass; } else insertChar(c, combClass); } else { if (bCompose) { if (oBufEnd > 0) { // check whether last buffered char and current char should form Hangul syllable UInt32 last = oBuffer[oBufEnd - 1]; // 1. check to see if two current characters are L and V UInt32 LIndex = last - LBase; if (LIndex < LCount) { UInt32 VIndex = c - VBase; if (VIndex < VCount) { // make syllable of form LV last = SBase + (LIndex * VCount + VIndex) * TCount; oBuffer[oBufEnd - 1] = last; // reset last return; // don't append c, and don't update oBufSafe as a following V would compose } } // 2. check to see if two current characters are LV and T UInt32 SIndex = last - SBase; if (SIndex < SCount && (SIndex % TCount) == 0) { UInt32 TIndex = c - TBase; if (TIndex <= TCount) { // make syllable of form LVT last += TIndex; oBuffer[oBufEnd - 1] = last; // reset last oBufSafe = oBufEnd; // no more composition will be possible now return; // don't append c } } } // search for canonical compositions in the buffered text, and update oBufSafe if possible compose(); } else oBufSafe = oBufEnd; appendChar(c); if (c == kEndOfText) oBufSafe = oBufEnd; prevCombClass = 0; }}voidNormalizer::appendChar(UInt32 c){ /* unlikely that we'd ever need to do this--it would take a long string of non-spacing marks! */ if (oBufEnd == oBufSize) growOutBuf(); oBuffer[oBufEnd++] = c;}voidNormalizer::insertChar(UInt32 insCh, int insCombClass){ if (oBufEnd == oBufSize) growOutBuf(); UInt32 i; for (i = oBufEnd - 1; i > 0; --i) { UInt32 c = oBuffer[i]; UInt32 plane = c >> 16; UInt32 page = (c >> 8) & 0xff; UInt32 ch = c & 0xff; int combClass = ccCharClass[ccPageMaps[ccPlaneMap[plane]][page]][ch]; if (insCombClass >= combClass) break; } ++i; for (UInt32 j = oBufEnd; j > i; --j) oBuffer[j] = oBuffer[j - 1]; oBuffer[i] = insCh; oBufEnd++;}voidNormalizer::growOutBuf(){ UInt32 newSize = oBufSize + 256; UInt32* newBuf = new UInt32[newSize]; for (long i = 0; i < oBufSize; ++i) newBuf[i] = oBuffer[i]; delete[] oBuffer; oBuffer = newBuf; oBufSize = newSize;}voidNormalizer::compose(){ // search for compositions in oBuffer up to oBufEnd UInt32 starterPos = 0; UInt32 c = oBuffer[0]; UInt32 plane = c >> 16; UInt32 page = (c >> 8) & 0xff; UInt32 ch = c & 0xff; int lastClass = ccCharClass[ccPageMaps[ccPlaneMap[plane]][page]][ch]; if (lastClass != 0) lastClass = 256; if (oBufEnd > 1) { UInt32 compPos = 1; UInt16 li = cLCharIndex[cLPageMaps[cLPlaneMap[plane]][page]][ch]; for (long decompPos = 1; decompPos < oBufEnd; ++decompPos) { c = oBuffer[decompPos]; plane = c >> 16; page = (c >> 8) & 0xff; ch = c & 0xff; int chClass = ccCharClass[ccPageMaps[ccPlaneMap[plane]][page]][ch]; UInt16 ri = cRCharIndex[cRPageMaps[cRPlaneMap[plane]][page]][ch]; UInt32 cmp = cComposites[li][ri]; if (cmp != 0 && (lastClass < chClass || lastClass == 0)) { oBuffer[starterPos] = cmp; plane = cmp >> 16; page = (cmp >> 8) & 0xff; ch = cmp & 0xff; li = cLCharIndex[cLPageMaps[cLPlaneMap[plane]][page]][ch]; } else { if (chClass == 0) { starterPos = compPos; plane = c >> 16; page = (c >> 8) & 0xff; ch = c & 0xff; li = cLCharIndex[cLPageMaps[cLPlaneMap[plane]][page]][ch]; } lastClass = chClass; oBuffer[compPos++] = c; } } oBufEnd = compPos; } // update oBufSafe to pass any chars that definitely can't compose if (lastClass != 0) oBufSafe = oBufEnd; else oBufSafe = starterPos;}UInt32Normalizer::getChar(){ UInt32 c; while (oBufSafe == 0) { c = process(); if (c == kNeedMoreInput || c == kInvalidChar || c == kUnmappedChar) return c; } c = oBuffer[oBufPtr++]; if (oBufPtr == oBufSafe) { for (long i = oBufPtr; i < oBufEnd; ++i) oBuffer[i - oBufPtr] = oBuffer[i]; oBufEnd -= oBufPtr; oBufSafe = oBufPtr = 0; } return c;}#pragma mark --- class Pass ---Pass::Pass(const TableHeader* inTable, Converter* cnv) : converter(cnv) , tableHeader(inTable) , iBuffer(0) , iBufSize(0) , iBufStart(0) , iBufEnd(0) , iBufPtr(0){ bInputIsUnicode = ((READ(tableHeader->type) & 0xFF000000) >> 24) == 'U'; bOutputIsUnicode = (READ(tableHeader->type) & 0x000000FF) == 'U'; bSupplementaryChars = (READ(tableHeader->flags) & kTableFlags_Supplementary) != 0; numPageMaps = 1; pageBase = (const Byte*)tableHeader + READ(tableHeader->pageBase); lookupBase = (const Lookup*)((const Byte*)tableHeader + READ(tableHeader->lookupBase)); matchClassBase = (const Byte*)tableHeader + READ(tableHeader->matchClassBase); repClassBase = (const Byte*)tableHeader + READ(tableHeader->repClassBase); stringListBase = (const Byte*)tableHeader + READ(tableHeader->stringListBase); stringRuleData = (const Byte*)tableHeader + READ(tableHeader->stringRuleData); if (bInputIsUnicode && bSupplementaryChars) { // support supplementary plane chars planeMap = pageBase; pageBase += 20; numPageMaps = READ(*(planeMap + 17)); } iBufSize = (READ(inTable->maxMatch) + READ(inTable->maxPre) + READ(inTable->maxPost) + 7) & ~0x0003; iBuffer = new UInt32[iBufSize]; oBufSize = (READ(inTable->maxOutput) + 7) & ~0x0003; oBuffer = new UInt32[oBufSize];}Pass::~Pass(){ delete[] oBuffer; delete[] iBuffer;}voidPass::Reset(){ iBufStart = iBufEnd = iBufPtr = 0; oBufPtr = oBufEnd = 0;}UInt32Pass::getChar() // called by next Pass when it wants the next character from us{ while (oBufPtr == oBufEnd) { oBufPtr = oBufEnd = 0; UInt32 c = DoMapping(); if (c == kNeedMoreInput || c == kInvalidChar || c == kUnmappedChar) return c; } return oBuffer[oBufPtr++];}voidPass::outputChar(UInt32 c) // Called by DoMapping to generate a character in the output stream{ if (oBufEnd < oBufSize) oBuffer[oBufEnd++] = c; // Cannot overflow provided the table correctly declares maxOutput // (so the compiler had better get it right!)}UInt32Pass::lookaheadCount() const // return how many characters of lookahead this pass has in its input buffer{ return iBufEnd < iBufPtr ? // iBufEnd has wrapped but iBufPtr hasn't iBufEnd + (iBufSize - iBufPtr) : // pointers are in the "normal" order iBufEnd - iBufPtr;}UInt32Pass::inputChar(long inIndex) // Called by DoMapping or match to read the character at a given location // relative to the current input stream location{ long target = iBufPtr + inIndex; if (inIndex < 0) { // look back if (target < 0) target += iBufSize; if (iBufPtr < iBufStart) { // iBufPtr has wrapped back to beginning of buffer, leaving iBufStart beyond it // so the valid pre-context is from iBufStart to iBufSize-1 and 0 to iBufPtr-1 if (target >= iBufStart || target < iBufPtr) return iBuffer[target]; } else { // iBufPtr points beyond iBufStart // so the valid pre-context is from iBufStart to iBufPtr-1 if (target >= iBufStart && target < iBufPtr) return iBuffer[target]; } return kEndOfText; } else { // look ahead if (target >= iBufSize) target -= iBufSize; if (iBufPtr == iBufEnd) { // ensure that current character is actually available UInt32 ch = prevStage->getChar(); if (ch == kNeedMoreInput || ch == kInvalidChar || ch == kUnmappedChar) return ch; // don't put this into iBuffer! iBuffer[iBufEnd++] = ch; if (iBufEnd == iBufSize) iBufEnd = 0; if (iBufEnd == iBufStart) { ++iBufStart; if (iBufStart == iBufSize) iBufStart = 0; } } long index = iBufPtr; while (index != target) { // scan forward as far as necessary, reading in required chars if (index == iBufSize - 1) index = 0; else ++index; if (index == iBufEnd) { UInt32 ch = prevStage->getChar(); if (ch == kNeedMoreInput || ch == kInvalidChar || ch == kUnmappedChar) return ch; iBuffer[iBufEnd++] = ch; if (iBufEnd == iBufSize) iBufEnd = 0; if (iBufEnd == iBufStart) { ++iBufStart; if (iBufStart == iBufSize) iBufStart = 0; } } } return iBuffer[index]; } return kEndOfText;}voidPass::advanceInput(unsigned int numChars) // Called by DoMapping to move forward in the input stream
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -