📄 compiler.cpp

📁 Emdros is a text database middleware-layer aimed at storage and retrieval of "text plus information…"
💻 CPP
📖 第 1 页 / 共 5 页
字号:
		} else if (c < 0x800) {		bytesToWrite = 2;		} else if (c < 0x10000) {	bytesToWrite = 3;		} else if (c < 0x200000) {	bytesToWrite = 4;		} else {					bytesToWrite = 2;									c = 0x0000fffd;		};		rval.append((size_t)bytesToWrite, 0);		int index = rval.length();		switch (bytesToWrite) {	/* note: code falls through cases! */			case 4:	rval[--index] = (c | byteMark) & byteMask; c >>= 6;			case 3:	rval[--index] = (c | byteMark) & byteMask; c >>= 6;			case 2:	rval[--index] = (c | byteMark) & byteMask; c >>= 6;			case 1:	rval[--index] =  c | firstByteMark[bytesToWrite];		};	}	return rval;}voidCompiler::ReadNameString(UInt16 nameID){	if (ExpectToken(tok_String, "expected STRING after name keyword")) {		if (inputForm == kForm_Bytes) {			names[nameID].erase(names[nameID].begin(), names[nameID].end());			for (string32::const_iterator i = tok.strval.begin(); i != tok.strval.end(); ++i)				names[nameID].append(1, *i);		}		else			names[nameID] = asUTF8(tok.strval);		ExpectToken(tok_Newline, "junk at end of line");	}}voidCompiler::FinishPass(){	if (currentPass.passType == 0)		return;	if ((currentPass.passType & 0xFFFF0000) == (FOUR_CHAR_CODE('N','F','_','_') & 0xFFFF0000)) {		while (errorCount == 0) {			if (fwdTables.size() == 0)				lhsFlags |= kFlags_Unicode;			else {				if ((rhsFlags & kFlags_Unicode) == 0) {					Error("normalization only supported in Unicode space");					break;				}			}			rhsFlags |= kFlags_Unicode;			string	normTable((currentPass.passType & 0x0000FF00) == (FOUR_CHAR_CODE('_','_','C','_') & 0x0000FF00) 								? 
"NFC " : "NFD ");			if ((currentPass.passType & 0x000000FF) != 'r')				fwdTables.push_back(normTable);			if ((currentPass.passType & 0x000000FF) != 'f')				revTables.push_back(normTable);			if (generateXML) {				xmlOut("<pass lhs=\"unicode\" rhs=\"unicode\" line=\"");				xmlOut(asDec(currentPass.startingLine));				xmlOut("\">\n");				xmlOut("<normalize form=\"");				xmlOut(normTable[2]);				if ((currentPass.passType & 0x000000FF) == 'f')					xmlOut(" dir=\"fwd\"");				else if ((currentPass.passType & 0x000000FF) == 'r')					xmlOut(" dir=\"rev\"");				xmlOut("\">\n");				xmlOut("</pass>\n");			}			break;		}	}	else {		while (errorCount == 0) {			// not really a loop; just so we can use 'break' to exit early			bool	sourceUni = (currentPass.passType == kCode_UB) || (currentPass.passType == kCode_Unic);			bool	targetUni = (currentPass.passType == kCode_BU) || (currentPass.passType == kCode_Unic);			if (generateXML) {				// pass header				xmlOut("<pass lhs=\"");				xmlOut(sourceUni ? "unicode" : "bytes");				xmlOut("\" rhs=\"");				xmlOut(targetUni ? "unicode" : "bytes");				if (sourceUni != targetUni) {					xmlOut("\" lhsDefault=\"");					xmlOut(sourceUni ? asHex(currentPass.uniDefault, 4) : asHex(currentPass.byteDefault, 2));					xmlOut("\" rhsDefault=\"");					xmlOut(targetUni ? 
asHex(currentPass.uniDefault, 4) : asHex(currentPass.byteDefault, 2));				}				xmlOut("\" line=\"");				xmlOut(asDec(currentPass.startingLine));				xmlOut("\">\n");								// class definitions				if (currentPass.byteClassMembers.size() > 0 || currentPass.uniClassMembers.size() > 0) {					xmlOut("<classes>\n");					unsigned int i;					for (i = 0; i < currentPass.byteClassMembers.size(); ++i) {						xmlOut("<class size=\"bytes\" name=\"b_");						xmlOut(getClassName(currentPass.byteClassNames, i));						xmlOut("\" line=\"");						xmlOut(asDec(currentPass.byteClassLines[i]));						xmlOut("\">");						for (Class::const_iterator ci = currentPass.byteClassMembers[i].begin(); ci != currentPass.byteClassMembers[i].end(); ++ci) {							xmlOut(ci == currentPass.byteClassMembers[i].begin() ? "\n" : " ");							xmlOut(asHex(*ci, 2));						}						xmlOut("\n</class>\n");					}					for (i = 0; i < currentPass.uniClassMembers.size(); ++i) {						xmlOut("<class size=\"unicode\" name=\"u_");						xmlOut(getClassName(currentPass.uniClassNames, i));						xmlOut("\" line=\"");						xmlOut(asDec(currentPass.uniClassLines[i]));						xmlOut("\">");						for (Class::const_iterator ci = currentPass.uniClassMembers[i].begin(); ci != currentPass.uniClassMembers[i].end(); ++ci) {							xmlOut(ci == currentPass.uniClassMembers[i].begin() ? 
"\n" : " ");							xmlOut(asHex(*ci, 4));						}						xmlOut("\n</class>\n");					}					xmlOut("</classes>\n");				}								if (currentPass.xmlContexts.size() > 0) {					xmlOut("<contexts>\n");					for (map<string,string>::const_iterator i = currentPass.xmlContexts.begin();							i != currentPass.xmlContexts.end(); ++i) {						xmlOut("<context id=\"");						xmlOut(i->second);						xmlOut("\">");						xmlOut(i->first);						xmlOut("</context>\n");					}					xmlOut("</contexts>\n");				}								xmlOut("<assignments>\n");				for (vector<string>::const_iterator i = currentPass.xmlRules.begin();						i != currentPass.xmlRules.end(); ++i) {					xmlOut(*i);				}				xmlOut("</assignments>\n");								// end pass				xmlOut("</pass>\n");			}			if (fwdTables.size() == 0) {				if (sourceUni)					lhsFlags |= kFlags_Unicode;			}			else {				if (sourceUni != ((rhsFlags & kFlags_Unicode) != 0)) {					Error("code space mismatch");					break;				}			}			rhsFlags &= ~kFlags_Unicode;			if (targetUni)				rhsFlags |= kFlags_Unicode;			// deal with COPY on LHS, and set up class/copy replacement index fields			associateItems(currentPass.fwdRules, sourceUni, targetUni);			if (errorCount > 0)				break;			setGroupPointers(currentPass.fwdRules);			// sort rules by length (also propagates repeat counts from EGroup back to BGroup items)			sortRules(currentPass.fwdRules);			if (errorCount > 0)				break;			// build the forward table			fwdTables.push_back(string());			buildTable(currentPass.fwdRules, sourceUni, targetUni, fwdTables.back());			buildVars.clear();			if (errorCount > 0)				break;						// build the reverse table			associateItems(currentPass.revRules, targetUni, sourceUni);			if (errorCount > 0)				break;			setGroupPointers(currentPass.revRules);			sortRules(currentPass.revRules);			if (errorCount > 0)				break;			revTables.push_back(string());			buildTable(currentPass.revRules, targetUni, sourceUni, revTables.back());			buildVars.clear();			break;		}	}	currentPass.clear();	
currentPass.setLineNo(lineNumber);}voidCompiler::SkipSpaces(void){	while (textPtr < textEnd) {		currCh = getChar();		if (currCh != ' ' && currCh != '\t') {			ungetChar(currCh);			break;		}	}}Compiler::tokenTypeCompiler::IDlookup(const char* str, UInt32 len){	const Keyword	*k = &keywords[0];	while (k->keyword != 0)		if (strmatch(k->keyword, str, len)) {			tok.val = k->refCon;			return k->token;		}		else			++k;	// try for a macro	map<string,tokListT>::const_iterator	i = defines.find(string(str, len));	if (i != defines.end()) {		defIter = i->second.begin();		defEnd = i->second.end();		tok = *defIter;		defIter++;		return tok.type;	}	// didn't find the identifier as a keyword; try as a Unicode char name	// NOTE: the names are now sorted (by Unicode name), so we could use a binary	//  search here if anyone complains about compilation time when using names :)	const CharName	*c = &gUnicodeNames[0];	while (c->name != 0)		if (unicodeNameCompare(c->name, str, len) == 0) {			tok.val = c->usv;			return tok_USV;		}		else			++c;#ifdef __MWERKS__	tok.strval.clear();#else	tok.strval.erase(tok.strval.begin(), tok.strval.end());#endif	while (len-- > 0)		tok.strval.append(1, *str++);	return tok_Identifier;}UInt32Compiler::getChar(){	UInt32	rval = 0;	if (ungotten != kInvalidChar) {		rval = ungotten;		ungotten = kInvalidChar;		return rval;	}#define CHECK_AVAIL(x)				\	if (textPtr + (x) > textEnd) {	\			textPtr = textEnd;		\			return kInvalidChar;	\	}		switch (inputForm) {		case kForm_Bytes:			rval = *textPtr++;			break;					case kForm_UTF8:			{				UInt16 extraBytes = bytesFromUTF8[*textPtr];				CHECK_AVAIL(extraBytes + 1);				switch (extraBytes) {	// note: code falls through cases!					
case 5:	rval += *textPtr++; rval <<= 6;					case 4:	rval += *textPtr++; rval <<= 6;					case 3:	rval += *textPtr++; rval <<= 6;					case 2:	rval += *textPtr++; rval <<= 6;					case 1:	rval += *textPtr++; rval <<= 6;					case 0:	rval += *textPtr++;				};				rval -= offsetsFromUTF8[extraBytes];			}			break;		case kForm_UTF16BE:			CHECK_AVAIL(2);			rval = *textPtr++ << 8;			rval += *textPtr++;			if (rval >= kSurrogateHighStart && rval <= kSurrogateHighEnd) {				// check that 2 more bytes are available				CHECK_AVAIL(2);				UInt32	low = *textPtr++ << 8;				low += *textPtr++;				rval = ((rval - kSurrogateHighStart) << halfShift) + (low - kSurrogateLowStart) + halfBase;			}			break;		case kForm_UTF16LE:			CHECK_AVAIL(2);			rval = *textPtr++;			rval += *textPtr++ << 8;			if (rval >= kSurrogateHighStart && rval <= kSurrogateHighEnd) {				CHECK_AVAIL(2);				UInt32	low = *textPtr++;				low += *textPtr++ << 8;				rval = ((rval - kSurrogateHighStart) << halfShift) + (low - kSurrogateLowStart) + halfBase;			}			break;		case kForm_UTF32BE:			CHECK_AVAIL(4);			rval = *textPtr++ << 24;			rval += *textPtr++ << 16;			rval += *textPtr++ << 8;			rval += *textPtr++;			break;		case kForm_UTF32LE:			CHECK_AVAIL(4);			rval = *textPtr++;			rval += *textPtr++ << 8;			rval += *textPtr++ << 16;			rval += *textPtr++ << 24;			break;	}	return rval;}voidCompiler::ungetChar(UInt32 c){	ungotten = c;}boolCompiler::GetNextToken(){	UInt32	currCh;		if (defIter != defEnd) {		tok = *defIter;		defIter++;		return true;	}	if (textPtr == textEnd) {		++textPtr;		tok.type = tok_Newline;		return true;		}	if (textPtr >= textEnd)		return false;	while (true) {		SkipSpaces();				tokStart = textPtr;				if (textPtr == textEnd) {			++textPtr;			tok.type = tok_Newline;			++lineNumber;			return true;		}				if (textPtr > textEnd)			return false;				currCh = getChar();		switch (currCh) {			case '\r':				if (textPtr < textEnd) {					currCh = getChar();					if (currCh != '\n')						ungetChar(currCh);				}				tok.type = 
tok_Newline;				++lineNumber;				return true;				case '\n':				if (textPtr < textEnd) {					currCh = getChar();					if (currCh != '\r')						ungetChar(currCh);				}				tok.type = tok_Newline;				++lineNumber;				return true;				case '\\':				if (textPtr < textEnd) {					currCh = getChar();					if (currCh == '\r' || currCh == '\n') {						if (textPtr < textEnd) {							UInt32 nextCh = getChar();							if (!((currCh == '\r' && nextCh == '\n') || (currCh == '\n' && nextCh == '\r')))								ungetChar(nextCh);						}						++lineNumber;						continue;					}					ungetChar(currCh);				}				goto DEFAULT;				case '"':			case '\'':				{					UInt32	delimiter = currCh;#ifdef __MWERKS__					tok.strval.clear();#else					tok.strval.erase(tok.strval.begin(), tok.strval.end());#endif					while ((textPtr < textEnd) && ((currCh = getChar()) != delimiter) && (currCh != '\r') && (currCh != '\n'))						tok.strval.append(1, currCh);					tok.type = tok_String;					if (currCh == '\r' || currCh == '\n')						ungetChar(currCh);				}				return true;				case '^':			case '(':			case ')':			case '[':			case ']':			case '{':			case '}':			case ',':			case '+':			case '*':			case '?':			case '>':			case '#':			case '|':			case '/':			case '=':			case '@':				tok.type = (tokenType)currCh;				return true;				case '<':				tok.type = (tokenType)'<';				if (textPtr < textEnd)					if ((currCh = getChar()) == '>')						tok.type = tok_Map;					else						ungetChar(currCh);				return true;				case '.':				tok.type = (tokenType)'.';				if (textPtr < textEnd)					if ((currCh = getChar()) == '.')						tok.type = tok_Ellipsis;					else						ungetChar(currCh);				return true;						case '_':				if (textPtr < textEnd) {					currCh = getChar();					ungetChar(currCh);					if (isIDcont(currCh)) {						currCh = '_';						goto DEFAULT;					}				}				tok.type = (tokenType)'_';				return true;						case '0':				if (textPtr < textEnd) {					currCh = getChar();					if (currCh == 'x' || currCh == 'X') {						tok.type = 
tok_Number;						tok.val = 0;						while (textPtr < textEnd) {							currCh = getChar();							if (currCh >= '0' && currCh <= '9')								tok.val = tok.val * 16 + currCh - '0';							else if (currCh >= 'a' && currCh <= 'f')								tok.val = tok.val * 16 + currCh - 'a' + 10;							else if (currCh >= 'A' && currCh <= 'F')								tok.val = tok.val * 16 + currCh - 'A' + 10;							else {
💿 文件大小 7978 K
👤 上传用户 oujk123
📂 所属分类其他行业
🏷️ 相关标签

#text #middleware-layer #information #retrieval
⌨️ 快捷键说明

复制代码 Ctrl + C
搜索代码 Ctrl + F
全屏模式 F11
切换主题 Ctrl + Shift + D
显示快捷键 ?
增大字号 Ctrl + =
减小字号 Ctrl + -