📄 compiler.cpp
字号:
/*------------------------------------------------------------------------
Copyright (C) 2002-2004 SIL International. All rights reserved.

Distributable under the terms of either the Common Public License or the
GNU Lesser General Public License, as specified in the LICENSING.txt file.

File: Compiler.cpp
Responsibility: Jonathan Kew
Last reviewed: Not yet.

Description:
    Implements the TECkit mapping compiler.
-------------------------------------------------------------------------*/

/*
    2006-06-19  jk  added new APIs to look up Unicode names
    2006-01-12  jk  removed multi-char constants, use FOUR_CHAR_CODE to define UInt32 values instead
                    (no functional change, just to avoid compiler warnings)
    2005-07-07  jk  2.1.5   changed to use WORDS_BIGENDIAN rather than TARGET_RT_BIG_ENDIAN
    2005-06-20  jk  2.1.4   added lhsDefault/rhsDefault attributes to <pass> elem in xml output
    23-May-2005     changes for 64-bit compilation, from Ulrik P
    21-May-2005     changes based on Ulrik Petersen's patch for MS VC++ 6
    2004-11-11  jk  2.1.3   added support for XML export
    2004-07-21  jk  2.1.2   removed trailing spaces from 2 names in UnicodeNames.cpp
    2004-06-16  jk  2.1.1   fixed bug of ignoring char after '_'
    2004-03-12  jk  2.1     updated for version 2.1 with ...Opt APIs
                            modified compiler to accept Unicode source text
*/

#include "Compiler.h"

#include <iostream>
#include <iomanip>
#include <algorithm>
#include <cstring>

#ifndef NO_ZLIB
#include "zlib.h"
#endif

// Marker value for an invalid character.
// NOTE(review): where it is produced/consumed is outside this view.
const UInt32 kInvalidChar = 0xfffffffdUL;

// Six per-sequence-length constants for UTF-8 handling.
// NOTE(review): presumably the offsets subtracted after accumulating the
// bytes of a 1..6 byte sequence -- the decoder is not visible here; confirm.
static UInt32
offsetsFromUTF8[6] = {
    0x00000000UL,
    0x00003080UL,
    0x000E2080UL,
    0x03C82080UL,
    0xFA082080UL,
    0x82082080UL
};

// Table indexed by a byte value (256 entries).  Entries for 0x00..0xBF are 0;
// the rows that follow hold 1 for 0xC0..0xDF, 2 for 0xE0..0xEF, 3 for
// 0xF0..0xF7, 4 for 0xF8..0xFB and 5 for 0xFC..0xFF.
static UInt8
bytesFromUTF8[256] = {
    0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,    0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
    0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,    0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
    0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,    0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
    0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,    0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
    0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,    0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
    0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,    0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, 2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2, 3,3,3,3,3,3,3,3,4,4,4,4,5,5,5,5};static UInt8firstByteMark[7] = { 0x00, 0x00, 0xC0, 0xE0, 0xF0, 0xF8, 0xFC};const int halfShift = 10;const UInt32 halfBase = 0x0010000UL;const UInt32 halfMask = 0x3FFUL;const UInt32 kSurrogateHighStart = 0xD800UL;const UInt32 kSurrogateHighEnd = 0xDBFFUL;const UInt32 kSurrogateLowStart = 0xDC00UL;const UInt32 kSurrogateLowEnd = 0xDFFFUL;const UInt32 byteMask = 0x000000BFUL;const UInt32 byteMark = 0x00000080UL;#define FOUR_CHAR_CODE(a,b,c,d) (UInt32)((a << 24) + (b << 16) + (c << 8) + d)const UInt32 kCode_Byte = FOUR_CHAR_CODE('B','y','t','e');const UInt32 kCode_BU = FOUR_CHAR_CODE('B','-','>','U');const UInt32 kCode_UB = FOUR_CHAR_CODE('U','-','>','B');const UInt32 kCode_Unic = FOUR_CHAR_CODE('U','n','i','c');const UInt32 kCode_NFCf = FOUR_CHAR_CODE('N','F','C','f');const UInt32 kCode_NFCr = FOUR_CHAR_CODE('N','F','C','r');const UInt32 kCode_NFC = FOUR_CHAR_CODE('N','F','C',' ');const UInt32 kCode_NFDf = FOUR_CHAR_CODE('N','F','D','f');const UInt32 kCode_NFDr = FOUR_CHAR_CODE('N','F','D','r');const UInt32 kCode_NFD = FOUR_CHAR_CODE('N','F','D',' ');Compiler::KeywordCompiler::keywords[] = { { "Pass", tok_Pass, 0 }, { "Byte", tok_PassType, kCode_Byte }, { "Byte_Unicode", tok_PassType, kCode_BU }, { "Unicode_Byte", tok_PassType, kCode_UB }, { "Unicode", tok_PassType, kCode_Unic }, { "NFC_fwd", tok_PassType, kCode_NFCf }, { "NFC_rev", tok_PassType, kCode_NFCr }, { "NFC", tok_PassType, kCode_NFC }, { "NFD_fwd", tok_PassType, kCode_NFDf }, { "NFD_rev", tok_PassType, kCode_NFDr }, { "NFD", tok_PassType, kCode_NFD }, { "Class", tok_Class, 0 }, { "ByteClass", tok_Class, 'B' }, { "UniClass", tok_Class, 'U' }, { "ByteDefault", tok_Default, 'B' }, { "UniDefault", tok_Default, 'U' }, { "EncodingName", tok_Name, kNameID_LHS_Name }, { "DescriptiveName", tok_Name, kNameID_LHS_Description }, { "Name", tok_Name, 0xffffffff }, { "LHSName", tok_Name, 
kNameID_LHS_Name }, { "LHSDescription", tok_Name, kNameID_LHS_Description }, { "RHSName", tok_Name, kNameID_RHS_Name }, { "RHSDescription", tok_Name, kNameID_RHS_Description }, { "Version", tok_Name, kNameID_Version }, { "Contact", tok_Name, kNameID_Contact }, { "RegistrationAuthority", tok_Name, kNameID_RegAuthority }, { "RegistrationName", tok_Name, kNameID_RegName }, { "Copyright", tok_Name, kNameID_Copyright }, { "LHSFlags", tok_Flags, 'S' }, { "RHSFlags", tok_Flags, 'T' }, { "ExpectsNFC", tok_FlagValue, kFlags_ExpectsNFC }, { "ExpectsNFD", tok_FlagValue, kFlags_ExpectsNFD }, { "GeneratesNFC", tok_FlagValue, kFlags_GeneratesNFC }, { "GeneratesNFD", tok_FlagValue, kFlags_GeneratesNFD }, { "VisualOrder", tok_FlagValue, kFlags_VisualOrder }, { "Define", tok_Define, 0 }, { 0, tok_Identifier, 0 }};UInt32WINAPITECkit_GetCompilerVersion(){ return kCurrentTECkitVersion;}TECkit_StatusWINAPITECkit_Compile(char* txt, UInt32 len, Byte doCompression, TECkit_ErrorFn errFunc, void* userData, Byte** outTable, UInt32* outLen){ TECkit_Status result = kStatus_CompilationFailed; try { Compiler* cmp = new Compiler(txt, len, kForm_Unspecified, (bool)doCompression, false, errFunc, userData); cmp->GetCompiledTable(*outTable, *outLen); if (*outTable == 0) result = kStatus_CompilationFailed; else { cmp->DetachCompiledTable(); result = kStatus_NoError; } delete cmp; } catch (...) { result = kStatus_Exception; } return result;}TECkit_StatusWINAPITECkit_CompileOpt(char* txt, UInt32 len, TECkit_ErrorFn errFunc, void* userData, Byte** outTable, UInt32* outLen, UInt32 opts){ TECkit_Status result = kStatus_CompilationFailed; try { Compiler* cmp = new Compiler(txt, len, (opts & kCompilerOpts_FormMask), (opts & kCompilerOpts_Compress) != 0, (opts & kCompilerOpts_XML) != 0, errFunc, userData); cmp->GetCompiledTable(*outTable, *outLen); if (*outTable == 0) result = kStatus_CompilationFailed; else { cmp->DetachCompiledTable(); result = kStatus_NoError; } delete cmp; } catch (...) 
{ result = kStatus_Exception; } return result;}voidWINAPITECkit_DisposeCompiled(Byte* table){ if (table != 0) free(table);}char*WINAPITECkit_GetUnicodeName(UInt32 usv){ const CharName *c = &gUnicodeNames[0]; while (c->name != 0) if (c->usv == usv) return (char*)c->name; else ++c; return NULL;}char*WINAPITECkit_GetTECkitName(UInt32 usv){ static char buffer[256]; const char* name = TECkit_GetUnicodeName(usv); if (name == NULL) sprintf(buffer, "U+%04X", usv); else { char* cp = &buffer[0]; while (*name && (cp - buffer < 255)) { if ((*name < '0') || (*name > '9' && *name < 'A') || (*name > 'Z')) *cp++ = '_'; else *cp++ = *name | 0x20; ++name; } *cp = 0; } return buffer;}static intunicodeNameCompare(const char* uniName, const char* idStr, UInt32 len){ // idStr could be either a "real" unicode name or a teckit identifier // when this is used by the TECkit_GetUnicodeValue API while (*uniName || len != 0) { if (len == 0) return 1; char u = *uniName++; char i = *idStr++; --len; if ((i >= 'a') && (i <= 'z')) i &= ~0x20; if (u == i) continue; if ((u < '0') || (u > '9' && u < 'A') || (u > 'Z')) u = '_'; if (u == i) continue; return u < i ? 
-1 : 1; } return 0;}intWINAPITECkit_GetUnicodeValue(char* name){ const CharName *c = &gUnicodeNames[0]; int len = strlen(name); while (c->name != 0) if (unicodeNameCompare(c->name, name, len) == 0) return c->usv; else ++c; return -1;}static inline UInt8READ(const UInt8 p){ return p;}static inline UInt16READ(const UInt16 p){#ifdef WORDS_BIGENDIAN return p;#else return (p >> 8) + (p << 8);#endif}static inline UInt32READ(const UInt32 p){#ifdef WORDS_BIGENDIAN return p;#else return (p >> 24) + ((p >> 8) & 0x0000ff00) + ((p << 8) & 0x00ff0000) + (p << 24);#endif}template<class T>inline static voidWRITE(T& t, UInt32 v){ t = READ(T(v));}voidCompiler::appendToTable(string& s, const char* ptr, UInt32 len){#ifdef WORDS_BIGENDIAN s.append(ptr, len);#else ptr += len; while (len-- > 0) s.append(1, *--ptr);#endif}static inline boolisIDstart(char c){ return (c >= 'a' && c <= 'z') || (c >= 'A' && c <= 'Z') || c == '_';}static inline boolisIDcont(char c){ return isIDstart(c) || (c >= '0' && c <= '9');}static boolstrmatch(const char* str, const char* txt, UInt32 len){ while (*str || len != 0) { if (len == 0) return false; if ((*str++ | 0x20) != (*txt++ | 0x20)) return false; --len; } return true;}static const char*getClassName(const map<string,UInt32>& nameMap, UInt32 index){ for (map<string,UInt32>::const_iterator i = nameMap.begin(); i != nameMap.end(); ++i) { if (i->second == index) { return i->first.c_str(); } } return "[UNKNOWN]";}static const char*asHex(UInt32 val, short digits){ static char str[16]; sprintf(str, "%0*X", digits, val); return str;}static const char*asDec(UInt32 val){ static char str[16]; sprintf(str, "%d", val); return str;}voidCompiler::xmlOut(const char* s){ xmlRepresentation += s;}voidCompiler::xmlOut(const string& s){ xmlRepresentation += s;}voidCompiler::xmlOut(char c){ xmlRepresentation += c;}stringCompiler::getContextID(const vector<Item>& ctx, bool isUnicode){ string contextString = xmlString(ctx.begin(), ctx.end(), isUnicode); string contextID = 
currentPass.xmlContexts[contextString]; if (contextID.length() == 0) { contextID = isUnicode ? "uctx_" : "bctx_"; contextID += asDec(currentPass.xmlContexts.size()); currentPass.xmlContexts[contextString] = contextID; } return contextID;}stringCompiler::xmlString(vector<Item>::const_iterator b, vector<Item>::const_iterator e, bool isUnicode){ string rval; if (b == e) return rval; for (vector<Item>::const_iterator i = b; i != e; ++i) { switch (i->type) { case 0: rval += "<ch n=\""; rval += asHex(i->val, isUnicode ? 4 : 2); rval += "\""; break; case kMatchElem_Type_EOS: rval += "<eot"; break; case kMatchElem_Type_ANY: rval += "<any"; break; case kMatchElem_Type_BGroup: { vector<Item>::const_iterator j = i; int nesting = 0; bool alt = false; string groupStr; ++i; while (++j != e) { if (j->type == kMatchElem_Type_BGroup) ++nesting; else if (j->type == kMatchElem_Type_EGroup) { if (nesting == 0) { if (alt && i < j - 1) groupStr += "<group>\n"; groupStr += xmlString(i, j, isUnicode); if (alt && i < j - 1) groupStr += "</group>\n"; break; } else --nesting; } else if (j->type == kMatchElem_Type_OR && nesting == 0) { if (i < j - 1) groupStr += "<group>\n"; groupStr += xmlString(i, j, isUnicode); if (i < j - 1) groupStr += "</group>\n"; i = j + 1; alt = true; } } i = j; rval += "<group"; if (alt) rval += " alt=\"1\""; if ((i->repeatMin != 1) && (i->repeatMin != 255)) { rval += " min=\""; rval += asDec(i->repeatMin); rval += "\""; } if ((i->repeatMax != 1) && (i->repeatMax != 255)) { rval += " max=\""; rval += asDec(i->repeatMax); rval += "\""; } if (i->tag.length() > 0) { if (i->type != kMatchElem_Type_Copy) { rval += " id=\""; rval += i->tag; rval += "\""; } } rval += ">\n"; rval += groupStr; rval += "</group>\n"; continue; } break; case kMatchElem_Type_OR: rval += "<OR/>\n"; continue; break; case kMatchElem_Type_EGroup: rval += "<END-GROUP/>\n"; continue; break; case kMatchElem_Type_Class: { rval += "<class-ref name=\""; const map<string,UInt32>& classes = isUnicode ? 
currentPass.uniClassNames : currentPass.byteClassNames; rval += isUnicode ? "u_" : "b_"; rval += getClassName(classes, i->val); rval += "\""; } break; case kMatchElem_Type_Copy: rval += "<copy-ref id=\""; rval += i->tag; rval += "\""; break; default: rval += "<UNKNOWN type=\""; rval += asHex(i->type, 1); break; } if (i->negate) rval += " neg=\"1\"";
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -