📄 engine.cpp

📁 Emdros is a text database middleware-layer aimed at storage and retrieval of "text plus information
💻 CPP
📖 第 1 页 / 共 4 页
字号:
12 3 4 下一页
/*------------------------------------------------------------------------Copyright (C) 2002-2004 SIL International. All rights reserved.Distributable under the terms of either the Common Public License or theGNU Lesser General Public License, as specified in the LICENSING.txt file.File: Engine.cpResponsibility: Jonathan KewLast reviewed: Not yet.Description:    Implements the TECkit conversion engine.-------------------------------------------------------------------------*//*	2008-01-23  jk  revised endian-ness stuff to allow Universal build	2006-06-02	jk	added support for extended string rules (>255 per initial char)	2006-06-02	jk	fixed bug handling passes with no mapping rules	2006-01-12	jk	remove multi-char constants, use kTableType_XXX from TECkit_Format.h	2005-07-19	jk	revised to use WORDS_BIGENDIAN conditional, config.h	2005-05-06	jk	patched match() to forget matches within groups if we backtrack out	2004-03-19	jk	rewrote match() to fix group/repeat bugs and be more efficient	2004-03-12	jk	finished updating for version 2.1 with ...Opt APIs*///#define TRACING	1//#ifdef HAVE_CONFIG_H//#	include "config.h"	/* a Unix-ish setup where we have config.h available *///#endif#if	(defined(__dest_os) && (__dest_os == __win32_os)) || defined(WIN32)	/* Windows target: little-endian */#	undef WORDS_BIGENDIAN#endif#ifdef __APPLE__#include <TargetConditionals.h>#endif#if defined(TARGET_RT_BIG_ENDIAN)	/* the CodeWarrior prefix files or Apple TargetConditionals.h sets this */#	if TARGET_RT_BIG_ENDIAN#		undef WORDS_BIGENDIAN#		define WORDS_BIGENDIAN 1#	else#		undef WORDS_BIGENDIAN#	endif#endif#if	(defined(__dest_os) && (__dest_os == __win32_os)) || defined(WIN32)#	define WIN32_LEAN_AND_MEAN#	define NOSERVICE#	define NOMCX#	include <Windows.h>	BOOL WINAPI	DllMain(HINSTANCE /*hInst*/, DWORD /*wDataSeg*/, LPVOID /*lpReserved*/)	{		return true;	}#endif#include "Engine.h"#ifdef TRACING#include <iostream>int	traceLevel = 1;#endif#include <cstdlib>#include <cstring>#include <algorithm>#ifndef NO_ZLIB#include "zlib.h"#endifusing namespace std;/* we apply READ to values read from the compiled table, to provide byte-swapping where needed */static inline UInt8READ(const UInt8 p){	return p;}static inline UInt16READ(const UInt16 p){#ifdef WORDS_BIGENDIAN	return p;#else	return (p >> 8) + (p << 8);#endif}static inline UInt32READ(const UInt32 p){#ifdef WORDS_BIGENDIAN	return p;#else	return (p >> 24) + ((p >> 8) & 0x0000ff00) + ((p << 8) & 0x00ff0000) + (p << 24);#endif}#pragma mark --- class Stage ---Stage::Stage()	: oBuffer(0)	, oBufSize(0)	, oBufEnd(0)	, oBufPtr(0)	, prevStage(0){}Stage::~Stage(){	if (prevStage && prevStage->prevStage)		delete prevStage;}UInt32Stage::lookaheadCount() const{	return 0;}#pragma mark --- class Normalizer ---#include "NormalizationData.c"Normalizer::Normalizer(bool compose)	: prevCombClass(0)	, oBufSafe(0)	, bCompose(compose){	oBufSize = 256;	oBuffer = new UInt32[oBufSize];}Normalizer::~Normalizer(){	delete[] oBuffer;}/* constants for algorithmic Hangul decomposition */#define	SBase	0xAC00#define	LBase	0x1100#define	VBase	0x1161#define	TBase	0x11A7#define	LCount	19#define	VCount	21#define	TCount	28#define	NCount	(VCount * TCount)#define	SCount	(LCount * NCount)UInt32Normalizer::process(){	UInt32	inChar = prevStage->getChar();	if (inChar == kNeedMoreInput || inChar == kInvalidChar || inChar == kUnmappedChar)		return inChar;	if (inChar == kEndOfText) {		generateChar(kEndOfText);		return inChar;	}	UInt32	SIndex = inChar - SBase;	if (SIndex >= SCount)		decompose(inChar);	else {		generateChar(LBase + SIndex / NCount);		generateChar(VBase + (SIndex % NCount) / TCount);		UInt32 T = SIndex % TCount;		if (T != 0)			generateChar(TBase + T);	}		return 0;}voidNormalizer::Reset(){	oBufPtr = oBufEnd = 0;	prevCombClass = 0;	oBufSafe = 0;}voidNormalizer::decompose(UInt32 c){	UInt32	prefix = decomposeOne(c);	if (prefix != 0xffff)		decompose(prefix);	if (c != 0xffff)		generateChar(c);}UInt32Normalizer::decomposeOne(UInt32& c){	UInt32	plane = c >> 16;	UInt32	page = (c >> 8) & 0xff;	UInt32	ch = c & 0xff;		UInt16	charIndex = dcCharIndex[dcPageMaps[dcPlaneMap[plane]][page]][ch];	if (charIndex == 0)		return 0xffff;	c = dcDecomposition[charIndex][1];	return dcDecomposition[charIndex][0];}voidNormalizer::generateChar(UInt32 c){	int	combClass = 0;	if (c != kEndOfText) {		UInt32	plane = c >> 16;		UInt32	page = (c >> 8) & 0xff;		UInt32	ch = c & 0xff;		combClass = ccCharClass[ccPageMaps[ccPlaneMap[plane]][page]][ch];	}		if (combClass != 0) {		// combiners are always buffered for sorting and possible composition		if (prevCombClass <= combClass) {			appendChar(c);			prevCombClass = combClass;		}		else			insertChar(c, combClass);	}	else {		if (bCompose) {			if (oBufEnd > 0) {				// check whether last buffered char and current char should form Hangul syllable				UInt32	last = oBuffer[oBufEnd - 1];				// 1. check to see if two current characters are L and V				UInt32	LIndex = last - LBase;				if (LIndex < LCount) {					UInt32	VIndex = c - VBase;					if (VIndex < VCount) {						// make syllable of form LV						last = SBase + (LIndex * VCount + VIndex) * TCount;						oBuffer[oBufEnd - 1] = last; // reset last						return; // don't append c, and don't update oBufSafe as a following V would compose					}				}				// 2. check to see if two current characters are LV and T				UInt32	SIndex = last - SBase;				if (SIndex < SCount && (SIndex % TCount) == 0) {					UInt32	TIndex = c - TBase;					if (TIndex <= TCount) {						// make syllable of form LVT						last += TIndex;						oBuffer[oBufEnd - 1] = last; // reset last						oBufSafe = oBufEnd;	// no more composition will be possible now						return; // don't append c					}				}			}			// search for canonical compositions in the buffered text, and update oBufSafe if possible			compose();		}		else			oBufSafe = oBufEnd;		appendChar(c);		if (c == kEndOfText)			oBufSafe = oBufEnd;		prevCombClass = 0;	}}voidNormalizer::appendChar(UInt32 c){	/* unlikely that we'd ever need to do this--it would take a long string of non-spacing marks! */	if (oBufEnd == oBufSize)		growOutBuf();	oBuffer[oBufEnd++] = c;}voidNormalizer::insertChar(UInt32 insCh, int insCombClass){	if (oBufEnd == oBufSize)		growOutBuf();	UInt32 i;	for (i = oBufEnd - 1; i > 0; --i) {		UInt32	c = oBuffer[i];		UInt32	plane = c >> 16;		UInt32	page = (c >> 8) & 0xff;		UInt32	ch = c & 0xff;		int	combClass = ccCharClass[ccPageMaps[ccPlaneMap[plane]][page]][ch];		if (insCombClass >= combClass)			break;	}	++i;		for (UInt32 j = oBufEnd; j > i; --j)		oBuffer[j] = oBuffer[j - 1];	oBuffer[i] = insCh;	oBufEnd++;}voidNormalizer::growOutBuf(){	UInt32	newSize = oBufSize + 256;	UInt32*	newBuf = new UInt32[newSize];	for (long i = 0; i < oBufSize; ++i)		newBuf[i] = oBuffer[i];	delete[] oBuffer;	oBuffer = newBuf;	oBufSize = newSize;}voidNormalizer::compose(){	// search for compositions in oBuffer up to oBufEnd	UInt32	starterPos = 0;	UInt32	c = oBuffer[0];	UInt32	plane = c >> 16;	UInt32	page = (c >> 8) & 0xff;	UInt32	ch = c & 0xff;	int		lastClass = ccCharClass[ccPageMaps[ccPlaneMap[plane]][page]][ch];	if (lastClass != 0)		lastClass = 256;	if (oBufEnd > 1) {		UInt32	compPos = 1;		UInt16	li = cLCharIndex[cLPageMaps[cLPlaneMap[plane]][page]][ch];	    for (long decompPos = 1; decompPos < oBufEnd; ++decompPos) {			c = oBuffer[decompPos];			plane = c >> 16;			page = (c >> 8) & 0xff;			ch = c & 0xff;			int		chClass = ccCharClass[ccPageMaps[ccPlaneMap[plane]][page]][ch];			UInt16	ri = cRCharIndex[cRPageMaps[cRPlaneMap[plane]][page]][ch];	        UInt32	cmp = cComposites[li][ri];			if (cmp != 0 && (lastClass < chClass || lastClass == 0)) {	            oBuffer[starterPos] = cmp;				plane = cmp >> 16;				page = (cmp >> 8) & 0xff;				ch = cmp & 0xff;				li = cLCharIndex[cLPageMaps[cLPlaneMap[plane]][page]][ch];	        }	        else {	            if (chClass == 0) {	                starterPos = compPos;					plane = c >> 16;					page = (c >> 8) & 0xff;					ch = c & 0xff;					li = cLCharIndex[cLPageMaps[cLPlaneMap[plane]][page]][ch];	            }	            lastClass = chClass;	            oBuffer[compPos++] = c;	        }	    }	    oBufEnd = compPos;	}	        // update oBufSafe to pass any chars that definitely can't compose	if (lastClass != 0)		oBufSafe = oBufEnd;	else		oBufSafe = starterPos;}UInt32Normalizer::getChar(){	UInt32	c;	while (oBufSafe == 0) {		c = process();		if (c == kNeedMoreInput || c == kInvalidChar || c == kUnmappedChar)			return c;	}	c = oBuffer[oBufPtr++];	if (oBufPtr == oBufSafe) {		for (long i = oBufPtr; i < oBufEnd; ++i)			oBuffer[i - oBufPtr] = oBuffer[i];		oBufEnd -= oBufPtr;		oBufSafe = oBufPtr = 0;	}	return c;}#pragma mark --- class Pass ---Pass::Pass(const TableHeader* inTable, Converter* cnv)	: converter(cnv)	, tableHeader(inTable)	, iBuffer(0)	, iBufSize(0)	, iBufStart(0)	, iBufEnd(0)	, iBufPtr(0){	bInputIsUnicode 	= ((READ(tableHeader->type) & 0xFF000000) >> 24) == 'U';	bOutputIsUnicode	= (READ(tableHeader->type) & 0x000000FF) == 'U';	bSupplementaryChars	= (READ(tableHeader->flags) & kTableFlags_Supplementary) != 0;	numPageMaps = 1;	pageBase		= (const Byte*)tableHeader + READ(tableHeader->pageBase);	lookupBase		= (const Lookup*)((const Byte*)tableHeader + READ(tableHeader->lookupBase));	matchClassBase	= (const Byte*)tableHeader + READ(tableHeader->matchClassBase);	repClassBase	= (const Byte*)tableHeader + READ(tableHeader->repClassBase);	stringListBase	= (const Byte*)tableHeader + READ(tableHeader->stringListBase);	stringRuleData	= (const Byte*)tableHeader + READ(tableHeader->stringRuleData);	if (bInputIsUnicode && bSupplementaryChars) {		// support supplementary plane chars		planeMap = pageBase;		pageBase += 20;		numPageMaps = READ(*(planeMap + 17));	}		iBufSize = (READ(inTable->maxMatch) + READ(inTable->maxPre) + READ(inTable->maxPost) + 7) & ~0x0003;	iBuffer = new UInt32[iBufSize];	oBufSize = (READ(inTable->maxOutput) + 7) & ~0x0003;	oBuffer = new UInt32[oBufSize];}Pass::~Pass(){	delete[] oBuffer;	delete[] iBuffer;}voidPass::Reset(){	iBufStart = iBufEnd = iBufPtr = 0;	oBufPtr = oBufEnd = 0;}UInt32Pass::getChar()	// called by next Pass when it wants the next character from us{	while (oBufPtr == oBufEnd) {		oBufPtr = oBufEnd = 0;		UInt32	c = DoMapping();		if (c == kNeedMoreInput || c == kInvalidChar || c == kUnmappedChar)			return c;	}	return oBuffer[oBufPtr++];}voidPass::outputChar(UInt32 c)	// Called by DoMapping to generate a character in the output stream{	if (oBufEnd < oBufSize)		oBuffer[oBufEnd++] = c;			// Cannot overflow provided the table correctly declares maxOutput			// (so the compiler had better get it right!)}UInt32Pass::lookaheadCount() const	// return how many characters of lookahead this pass has in its input buffer{	return iBufEnd < iBufPtr		? // iBufEnd has wrapped but iBufPtr hasn't			iBufEnd + (iBufSize - iBufPtr)		: // pointers are in the "normal" order			iBufEnd - iBufPtr;}UInt32Pass::inputChar(long inIndex)	// Called by DoMapping or match to read the character at a given location	// relative to the current input stream location{	long	target = iBufPtr + inIndex;	if (inIndex < 0) {		// look back		if (target < 0)			target += iBufSize;		if (iBufPtr < iBufStart) {			// iBufPtr has wrapped back to beginning of buffer, leaving iBufStart beyond it			// so the valid pre-context is from iBufStart to iBufSize-1 and 0 to iBufPtr-1			if (target >= iBufStart || target < iBufPtr)				return iBuffer[target];		}		else {			// iBufPtr points beyond iBufStart			// so the valid pre-context is from iBufStart to iBufPtr-1			if (target >= iBufStart && target < iBufPtr)				return iBuffer[target];		}		return kEndOfText;	}	else {		// look ahead		if (target >= iBufSize)			target -= iBufSize;		if (iBufPtr == iBufEnd) {			// ensure that current character is actually available			UInt32	ch = prevStage->getChar();			if (ch == kNeedMoreInput || ch == kInvalidChar || ch == kUnmappedChar)				return ch;	// don't put this into iBuffer!			iBuffer[iBufEnd++] = ch;			if (iBufEnd == iBufSize)				iBufEnd = 0;			if (iBufEnd == iBufStart) {				++iBufStart;				if (iBufStart == iBufSize)					iBufStart = 0;			}		}		long	index = iBufPtr;		while (index != target) {			// scan forward as far as necessary, reading in required chars			if (index == iBufSize - 1)				index = 0;			else				++index;			if (index == iBufEnd) {				UInt32	ch = prevStage->getChar();				if (ch == kNeedMoreInput || ch == kInvalidChar || ch == kUnmappedChar)					return ch;				iBuffer[iBufEnd++] = ch;				if (iBufEnd == iBufSize)					iBufEnd = 0;				if (iBufEnd == iBufStart) {					++iBufStart;					if (iBufStart == iBufSize)						iBufStart = 0;				}			}		}		return iBuffer[index];	}	return kEndOfText;}voidPass::advanceInput(unsigned int numChars)	// Called by DoMapping to move forward in the input stream
12 3 4 下一页
💿 文件大小 7978 K
👤 上传用户 oujk123
📂 所属分类其他行业
🏷️ 相关标签

#text #middleware-layer #information #retrieval
⌨️ 快捷键说明

复制代码 Ctrl + C
搜索代码 Ctrl + F
全屏模式 F11
切换主题 Ctrl + Shift + D
显示快捷键 ?
增大字号 Ctrl + =
减小字号 Ctrl + -