📄 compiler.cpp

📁 Emdros is a text database middleware-layer aimed at storage and retrieval of "text plus information
💻 CPP
📖 第 1 页 / 共 5 页
字号:
12 3 4 5 下一页
/*------------------------------------------------------------------------
Copyright (C) 2002-2004 SIL International. All rights reserved.

Distributable under the terms of either the Common Public License or the
GNU Lesser General Public License, as specified in the LICENSING.txt file.

File: Compiler.cpp
Responsibility: Jonathan Kew
Last reviewed: Not yet.

Description:
    Implements the TECkit mapping compiler.
-------------------------------------------------------------------------*/

/*
	2006-06-19	jk			added new APIs to look up Unicode names
	2006-01-12	jk			removed multi-char constants, use FOUR_CHAR_CODE to define UInt32 values instead
							(no functional change, just to avoid compiler warnings)
    2005-07-07  jk  2.1.5   changed to use WORDS_BIGENDIAN rather than TARGET_RT_BIG_ENDIAN
    2005-06-20  jk  2.1.4   added lhsDefault/rhsDefault attributes to <pass> elem in xml output
	23-May-2005		changes for 64-bit compilation, from Ulrik P
	21-May-2005		changes based on Ulrik Petersen's patch for MS VC++ 6
    2004-11-11  jk  2.1.3   added support for XML export
	2004-07-21	jk	2.1.2	removed trailing spaces from 2 names in UnicodeNames.cpp
	2004-06-16	jk	2.1.1	fixed bug of ignoring char after '_'
	2004-03-12	jk	2.1		updated for version 2.1 with ...Opt APIs
							modified compiler to accept Unicode source text
*/

#include "Compiler.h"

#include <iostream>
#include <iomanip>
#include <algorithm>
#include <cstdio>
#include <cstdlib>
#include <cstring>

#ifndef NO_ZLIB
#include "zlib.h"
#endif

const UInt32	kInvalidChar	= 0xfffffffdUL;

/* Lookup tables used for UTF-8 conversion (their users are outside this part of the file). */
static UInt32
offsetsFromUTF8[6] =	{
	0x00000000UL,	0x00003080UL,	0x000E2080UL,
	0x03C82080UL,	0xFA082080UL,	0x82082080UL
};

static UInt8
bytesFromUTF8[256] = {
	0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
	0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
	0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
	0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
	0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
	0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
	1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
	2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2, 3,3,3,3,3,3,3,3,4,4,4,4,5,5,5,5
};

static UInt8
firstByteMark[7] = {
	0x00, 0x00, 0xC0, 0xE0, 0xF0, 0xF8, 0xFC
};

const int halfShift					= 10;
const UInt32 halfBase				= 0x0010000UL;
const UInt32 halfMask				= 0x3FFUL;
const UInt32 kSurrogateHighStart	= 0xD800UL;
const UInt32 kSurrogateHighEnd		= 0xDBFFUL;
const UInt32 kSurrogateLowStart		= 0xDC00UL;
const UInt32 kSurrogateLowEnd		= 0xDFFFUL;
const UInt32 byteMask				= 0x000000BFUL;
const UInt32 byteMark				= 0x00000080UL;

/* Packs four character constants into one UInt32, first argument in the most
   significant byte.  Arguments are parenthesized and widened before shifting
   so the macro is safe for any expression and never shifts a signed int. */
#define FOUR_CHAR_CODE(a,b,c,d)	(UInt32)(((UInt32)(a) << 24) + ((UInt32)(b) << 16) + ((UInt32)(c) << 8) + (UInt32)(d))

const UInt32 kCode_Byte	= FOUR_CHAR_CODE('B','y','t','e');
const UInt32 kCode_BU	= FOUR_CHAR_CODE('B','-','>','U');
const UInt32 kCode_UB	= FOUR_CHAR_CODE('U','-','>','B');
const UInt32 kCode_Unic	= FOUR_CHAR_CODE('U','n','i','c');
const UInt32 kCode_NFCf	= FOUR_CHAR_CODE('N','F','C','f');
const UInt32 kCode_NFCr	= FOUR_CHAR_CODE('N','F','C','r');
const UInt32 kCode_NFC	= FOUR_CHAR_CODE('N','F','C',' ');
const UInt32 kCode_NFDf	= FOUR_CHAR_CODE('N','F','D','f');
const UInt32 kCode_NFDr	= FOUR_CHAR_CODE('N','F','D','r');
const UInt32 kCode_NFD	= FOUR_CHAR_CODE('N','F','D',' ');

/* Keyword table: { keyword text, token type, token value }.
   The list is terminated by the entry whose keyword pointer is 0. */
Compiler::Keyword
Compiler::keywords[] = {
	{ "Pass",				tok_Pass,		0						},
	{ "Byte",				tok_PassType,	kCode_Byte				},
	{ "Byte_Unicode",		tok_PassType,	kCode_BU				},
	{ "Unicode_Byte",		tok_PassType,	kCode_UB				},
	{ "Unicode",			tok_PassType,	kCode_Unic				},
	{ "NFC_fwd",			tok_PassType,	kCode_NFCf				},
	{ "NFC_rev",			tok_PassType,	kCode_NFCr				},
	{ "NFC",				tok_PassType,	kCode_NFC				},
	{ "NFD_fwd",			tok_PassType,	kCode_NFDf				},
	{ "NFD_rev",			tok_PassType,	kCode_NFDr				},
	{ "NFD",				tok_PassType,	kCode_NFD				},
	{ "Class",				tok_Class,		0						},
	{ "ByteClass",			tok_Class,		'B'						},
	{ "UniClass",			tok_Class,		'U'						},
	{ "ByteDefault",		tok_Default,	'B'						},
	{ "UniDefault",			tok_Default,	'U'						},
	{ "EncodingName",		tok_Name,		kNameID_LHS_Name		},
	{ "DescriptiveName",	tok_Name,		kNameID_LHS_Description	},
	{ "Name",				tok_Name,		0xffffffff				},
	{ "LHSName",			tok_Name,		kNameID_LHS_Name		},
	{ "LHSDescription",		tok_Name,		kNameID_LHS_Description	},
	{ "RHSName",			tok_Name,		kNameID_RHS_Name		},
	{ "RHSDescription",		tok_Name,		kNameID_RHS_Description	},
	{ "Version",			tok_Name,		kNameID_Version			},
	{ "Contact",			tok_Name,		kNameID_Contact			},
	{ "RegistrationAuthority",	tok_Name,	kNameID_RegAuthority	},
	{ "RegistrationName",	tok_Name,		kNameID_RegName			},
	{ "Copyright",			tok_Name,		kNameID_Copyright		},
	{ "LHSFlags",			tok_Flags,		'S'						},
	{ "RHSFlags",			tok_Flags,		'T'						},
	{ "ExpectsNFC",			tok_FlagValue,	kFlags_ExpectsNFC		},
	{ "ExpectsNFD",			tok_FlagValue,	kFlags_ExpectsNFD		},
	{ "GeneratesNFC",		tok_FlagValue,	kFlags_GeneratesNFC		},
	{ "GeneratesNFD",		tok_FlagValue,	kFlags_GeneratesNFD		},
	{ "VisualOrder",		tok_FlagValue,	kFlags_VisualOrder		},
	{ "Define",				tok_Define,		0						},
	{ 0,					tok_Identifier,	0						}
};

/* Returns kCurrentTECkitVersion. */
UInt32
WINAPI
TECkit_GetCompilerVersion()
{
	return kCurrentTECkitVersion;
}

namespace {

/* Scope guard that deletes a heap-allocated Compiler when it goes out of
   scope, so the object is released even if a Compiler method throws.
   Non-copyable (copy operations are declared private and not defined). */
class CompilerOwner {
public:
	explicit CompilerOwner(Compiler* c) : cmp(c) { }
	~CompilerOwner() { delete cmp; }
	Compiler* operator->() const { return cmp; }
private:
	CompilerOwner(const CompilerOwner&);
	CompilerOwner& operator=(const CompilerOwner&);
	Compiler*	cmp;
};

/* Shared tail of TECkit_Compile / TECkit_CompileOpt: fetches the compiled
   table from the compiler and, when there is one, detaches it so that it
   survives the compiler's destruction. */
TECkit_Status
takeCompiledTable(const CompilerOwner& cmp, Byte** outTable, UInt32* outLen)
{
	cmp->GetCompiledTable(*outTable, *outLen);
	if (*outTable == 0)
		return kStatus_CompilationFailed;
	cmp->DetachCompiledTable();
	return kStatus_NoError;
}

}	// anonymous namespace

/* Compiles the mapping source in txt (len bytes).
   On success stores the table and its length through outTable/outLen and
   returns kStatus_NoError; returns kStatus_CompilationFailed when no table
   was produced and kStatus_Exception when anything threw.
   A returned table is released with TECkit_DisposeCompiled. */
TECkit_Status
WINAPI
TECkit_Compile(char* txt, UInt32 len, Byte doCompression, TECkit_ErrorFn errFunc, void* userData, Byte** outTable, UInt32* outLen)
{
	TECkit_Status	result = kStatus_CompilationFailed;
	try {
		// the guard deletes the Compiler on every exit path, including exceptions
		CompilerOwner	cmp(new Compiler(txt, len, kForm_Unspecified, doCompression != 0, false, errFunc, userData));
		result = takeCompiledTable(cmp, outTable, outLen);
	}
	catch (...) {
		result = kStatus_Exception;
	}
	return result;
}

/* Same as TECkit_Compile, but the source form, compression and XML output
   are selected by bits of opts (kCompilerOpts_FormMask,
   kCompilerOpts_Compress, kCompilerOpts_XML). */
TECkit_Status
WINAPI
TECkit_CompileOpt(char* txt, UInt32 len, TECkit_ErrorFn errFunc, void* userData, Byte** outTable, UInt32* outLen, UInt32 opts)
{
	TECkit_Status	result = kStatus_CompilationFailed;
	try {
		// the guard deletes the Compiler on every exit path, including exceptions
		CompilerOwner	cmp(new Compiler(txt, len, (opts & kCompilerOpts_FormMask),
			(opts & kCompilerOpts_Compress) != 0, (opts & kCompilerOpts_XML) != 0, errFunc, userData));
		result = takeCompiledTable(cmp, outTable, outLen);
	}
	catch (...) {
		result = kStatus_Exception;
	}
	return result;
}

/* Frees a table with free(); a null pointer is ignored. */
void
WINAPI
TECkit_DisposeCompiled(Byte* table)
{
	if (table != 0)
		free(table);
}

/* Linear search of gUnicodeNames (terminated by a null name) for usv.
   Returns the table's name string, or NULL when usv is not listed. */
char*
WINAPI
TECkit_GetUnicodeName(UInt32 usv)
{
	for (const CharName* c = &gUnicodeNames[0]; c->name != 0; ++c) {
		if (c->usv == usv)
			return const_cast<char*>(c->name);
	}
	return NULL;
}

/* Returns an identifier form of the Unicode name of usv: 'A'-'Z' are
   lower-cased, digits are kept, every other character becomes '_'.
   When usv has no name the result is "U+" followed by at least four hex digits.
   The result lives in a static buffer that the next call overwrites. */
char*
WINAPI
TECkit_GetTECkitName(UInt32 usv)
{
	static char	buffer[256];
	const char*	name = TECkit_GetUnicodeName(usv);
	if (name == NULL) {
		// cast so the argument matches the conversion whatever UInt32 is typedef'd to
		sprintf(buffer, "U+%04lX", (unsigned long)usv);
	}
	else {
		char* cp = &buffer[0];
		while (*name && (cp - buffer < 255)) {
			if ((*name < '0') || (*name > '9' && *name < 'A') || (*name > 'Z'))
				*cp++ = '_';
			else
				*cp++ = *name | 0x20;
			++name;
		}
		*cp = 0;
	}
	return buffer;
}

/* Compares the NUL-terminated uniName with the first len characters of idStr.
   idStr could be either a "real" unicode name or a teckit identifier
   when this is used by the TECkit_GetUnicodeValue API: 'a'-'z' in idStr are
   upper-cased, and a uniName character outside 0-9/A-Z also matches '_'.
   Returns 0 on a full match, otherwise -1 or 1. */
static int
unicodeNameCompare(const char* uniName, const char* idStr, UInt32 len)
{
	while (*uniName || len != 0) {
		if (len == 0)
			return 1;
		char	i = *idStr++;
		--len;
		if ((i >= 'a') && (i <= 'z'))
			i &= ~0x20;
		if (*uniName == 0) {
			// Name exhausted while the identifier still has characters: this is
			// never a match.  The terminator still orders like '_' (as before),
			// but it no longer "matches" a '_' in idStr, which used to step
			// uniName past its terminator.
			return '_' < i ? -1 : 1;
		}
		char	u = *uniName++;
		if (u == i)
			continue;
		if ((u < '0') || (u > '9' && u < 'A') || (u > 'Z'))
			u = '_';
		if (u == i)
			continue;
		return u < i ? -1 : 1;
	}
	return 0;
}

/* Looks name up in gUnicodeNames using unicodeNameCompare.
   Returns the matching entry's usv, or -1 when there is no match
   (a null name is treated as no match). */
int
WINAPI
TECkit_GetUnicodeValue(char* name)
{
	if (name == 0)
		return -1;
	const UInt32	len = (UInt32)strlen(name);
	for (const CharName* c = &gUnicodeNames[0]; c->name != 0; ++c) {
		if (unicodeNameCompare(c->name, name, len) == 0)
			return c->usv;
	}
	return -1;
}

/* READ overloads: return the value unchanged when WORDS_BIGENDIAN is defined,
   otherwise return it with its bytes reversed. */
static inline UInt8
READ(const UInt8 p)
{
	return p;
}

static inline UInt16
READ(const UInt16 p)
{
#ifdef WORDS_BIGENDIAN
	return p;
#else
	return (p >> 8) + (p << 8);
#endif
}

static inline UInt32
READ(const UInt32 p)
{
#ifdef WORDS_BIGENDIAN
	return p;
#else
	return (p >> 24) + ((p >> 8) & 0x0000ff00) + ((p << 8) & 0x00ff0000) + (p << 24);
#endif
}

/* Stores v, converted to T and passed through READ, into t. */
template<class T>
inline static void
WRITE(T& t, UInt32 v)
{
	t = READ(T(v));
}

/* Appends len bytes starting at ptr to s: in order when WORDS_BIGENDIAN is
   defined, otherwise in reverse order. */
void
Compiler::appendToTable(string& s, const char* ptr, UInt32 len)
{
#ifdef WORDS_BIGENDIAN
	s.append(ptr, len);
#else
	ptr += len;
	while (len-- > 0)
		s.append(1, *--ptr);
#endif
}

/* True for ASCII letters and '_'. */
static inline bool
isIDstart(char c)
{
	return (c >= 'a' && c <= 'z') || (c >= 'A' && c <= 'Z') || c == '_';
}

/* True for anything isIDstart accepts, plus ASCII digits. */
static inline bool
isIDcont(char c)
{
	return isIDstart(c) || (c >= '0' && c <= '9');
}

/* True when the NUL-terminated str equals the first len characters of txt,
   comparing with bit 0x20 forced on (so ASCII letter case is ignored) and
   requiring str to be exactly len characters long. */
static bool
strmatch(const char* str, const char* txt, UInt32 len)
{
	for (; len != 0; --len, ++str, ++txt) {
		if (*str == 0)
			return false;	// str is shorter than txt; do not read past its terminator
		if ((*str | 0x20) != (*txt | 0x20))
			return false;
	}
	return *str == 0;
}

/* Reverse lookup: returns the key of the first entry in nameMap whose value
   is index, or "[UNKNOWN]".  The pointer refers to the map's own string. */
static const char*
getClassName(const map<string,UInt32>& nameMap, UInt32 index)
{
	for (map<string,UInt32>::const_iterator i = nameMap.begin(); i != nameMap.end(); ++i) {
		if (i->second == index) {
			return i->first.c_str();
		}
	}
	return "[UNKNOWN]";
}

/* Formats val as upper-case hex, zero-padded to digits characters.
   Returns a static buffer that the next call overwrites. */
static const char*
asHex(UInt32 val, short digits)
{
	static char	str[16];
	// cast so the argument matches the conversion whatever UInt32 is typedef'd to
	sprintf(str, "%0*lX", (int)digits, (unsigned long)val);
	return str;
}

/* Formats val as unsigned decimal.
   Returns a static buffer that the next call overwrites. */
static const char*
asDec(UInt32 val)
{
	static char	str[16];
	// unsigned conversion: "%d" printed values above INT_MAX as negative numbers
	sprintf(str, "%lu", (unsigned long)val);
	return str;
}

/* xmlOut overloads: append text to xmlRepresentation. */
void
Compiler::xmlOut(const char* s)
{
	xmlRepresentation += s;
}

void
Compiler::xmlOut(const string& s)
{
	xmlRepresentation += s;
}

void
Compiler::xmlOut(char c)
{
	xmlRepresentation += c;
}

/* Returns the ID registered in currentPass.xmlContexts for the XML form of
   ctx, creating one ("uctx_N" or "bctx_N") the first time a context is seen. */
string
Compiler::getContextID(const vector<Item>& ctx, bool isUnicode)
{
	string	contextString = xmlString(ctx.begin(), ctx.end(), isUnicode);
	// operator[] inserts an empty entry for a new context, so size() below
	// already counts the context being registered
	string	contextID = currentPass.xmlContexts[contextString];
	if (contextID.length() == 0) {
		contextID = isUnicode ? "uctx_" : "bctx_";
		contextID += asDec(currentPass.xmlContexts.size());
		currentPass.xmlContexts[contextString] = contextID;
	}
	return contextID;
}

/* Builds the XML text for the items in [b, e).
   NOTE(review): the end of this function is not on this page of the listing;
   the visible part is only re-formatted here, its logic is untouched. */
string
Compiler::xmlString(vector<Item>::const_iterator b, vector<Item>::const_iterator e, bool isUnicode)
{
	string	rval;
	if (b == e)
		return rval;
	for (vector<Item>::const_iterator i = b; i != e; ++i) {
		switch (i->type) {
			case 0:
				rval += "<ch n=\"";
				rval += asHex(i->val, isUnicode ? 4 : 2);
				rval += "\"";
				break;
			case kMatchElem_Type_EOS:
				rval += "<eot";
				break;
			case kMatchElem_Type_ANY:
				rval += "<any";
				break;
			case kMatchElem_Type_BGroup:
				{
					vector<Item>::const_iterator j = i;
					int nesting = 0;
					bool	alt = false;
					string	groupStr;
					++i;
					while (++j != e) {
						if (j->type == kMatchElem_Type_BGroup)
							++nesting;
						else if (j->type == kMatchElem_Type_EGroup) {
							if (nesting == 0) {
								if (alt && i < j - 1)
									groupStr += "<group>\n";
								groupStr += xmlString(i, j, isUnicode);
								if (alt && i < j - 1)
									groupStr += "</group>\n";
								break;
							}
							else
								--nesting;
						}
						else if (j->type == kMatchElem_Type_OR && nesting == 0) {
							if (i < j - 1)
								groupStr += "<group>\n";
							groupStr += xmlString(i, j, isUnicode);
							if (i < j - 1)
								groupStr += "</group>\n";
							i = j + 1;
							alt = true;
						}
					}
					i = j;
					rval += "<group";
					if (alt)
						rval += " alt=\"1\"";
					if ((i->repeatMin != 1) && (i->repeatMin != 255)) {
						rval += " min=\"";
						rval += asDec(i->repeatMin);
						rval += "\"";
					}
					if ((i->repeatMax != 1) && (i->repeatMax != 255)) {
						rval += " max=\"";
						rval += asDec(i->repeatMax);
						rval += "\"";
					}
					if (i->tag.length() > 0) {
						if (i->type != kMatchElem_Type_Copy) {
							rval += " id=\"";
							rval += i->tag;
							rval += "\"";
						}
					}
					rval += ">\n";
					rval += groupStr;
					rval += "</group>\n";
					continue;
				}
				break;
			case kMatchElem_Type_OR:
				rval += "<OR/>\n";
				continue;
				break;
			case kMatchElem_Type_EGroup:
				rval += "<END-GROUP/>\n";
				continue;
				break;
			case kMatchElem_Type_Class:
				{
					rval += "<class-ref name=\"";
					const map<string,UInt32>&	classes = isUnicode ? currentPass.uniClassNames : currentPass.byteClassNames;
					rval += isUnicode ? "u_" : "b_";
					rval += getClassName(classes, i->val);
					rval += "\"";
				}
				break;
			case kMatchElem_Type_Copy:
				rval += "<copy-ref id=\"";
				rval += i->tag;
				rval += "\"";
				break;
			default:
				rval += "<UNKNOWN type=\"";
				rval += asHex(i->type, 1);
				break;
		}
		if (i->negate)
			rval += " neg=\"1\"";
12 3 4 5 下一页
💿 文件大小 7978 K
👤 上传用户 oujk123
📂 所属分类其他行业
🏷️ 相关标签

#text #middleware-layer #information #retrieval
⌨️ 快捷键说明

复制代码 Ctrl + C
搜索代码 Ctrl + F
全屏模式 F11
切换主题 Ctrl + Shift + D
显示快捷键 ?
增大字号 Ctrl + =
减小字号 Ctrl + -