📄 compiler.cpp

📁 Emdros is a text database middleware-layer aimed at storage and retrieval of "text plus information
💻 CPP
📖 第 1 页 / 共 5 页
字号:
上一页 1 2 3 4 5
								ungetChar(currCh);								break;							}						}						return true;					}					ungetChar(currCh);					currCh = '0';				}				// else fall through							case '1':			case '2':			case '3':			case '4':			case '5':			case '6':			case '7':			case '8':			case '9':				tok.type = tok_Number;				tok.val = currCh - '0';				while (textPtr < textEnd) {					currCh = getChar();					if (currCh >= '0' && currCh <= '9')						tok.val = tok.val * 10 + currCh - '0';					else {						ungetChar(currCh);						break;					}				}				return true;				case ';':                {                	bool	continuation = false;					while (textPtr < textEnd) {						continuation = (currCh == '\\');						currCh = getChar();						if (currCh == '\r' || currCh == '\n')							break;					}					if (textPtr < textEnd) {						UInt32	nextCh = getChar();						if (!((currCh == '\r' && nextCh == '\n') || (currCh == '\n' && nextCh == '\r')))							ungetChar(nextCh);					}					++lineNumber;					if (continuation)						continue;					else {						tok.type = tok_Newline;						return true;					}	            }									case 'U':				// check for U+xxxx USV				if (textPtr < textEnd) {					currCh = getChar();					if (currCh == '+') {						tok.type = tok_USV;						tok.val = 0;						int	digitCount = 0;						while (textPtr < textEnd) {							currCh = getChar();							if (currCh >= '0' && currCh <= '9')								tok.val = tok.val * 16 + currCh - '0';							else if (currCh >= 'a' && currCh <= 'f')								tok.val = tok.val * 16 + currCh - 'a' + 10;							else if (currCh >= 'A' && currCh <= 'F')								tok.val = tok.val * 16 + currCh - 'A' + 10;							else {								ungetChar(currCh);								break;							}							++digitCount;						}						if (digitCount < 4 || digitCount > 6) {							Error("Unicode value (U+xxxx) must have 4-6 hex digits");							tok.val = 0;						}						return true;					}					else						ungetChar(currCh);				}				currCh = 'U';				goto DEFAULT;										// read an identifier or some other 'unknown' character			default:			
DEFAULT:				if (isIDstart(currCh)) {					idBuffer[0] = currCh;					tok.val = 1;					while (textPtr < textEnd) {						currCh = getChar();						if (!isIDcont(currCh)) {							ungetChar(currCh);							break;						}						if (tok.val < 256)							idBuffer[tok.val++] = currCh;					}					tok.type = IDlookup(&idBuffer[0], tok.val);					return true;				}				tok.type = tok_Unknown;				tok.val = currCh;				return true;		}	}}boolCompiler::ExpectToken(tokenType type, const char* errMsg){	if (!GetNextToken() || tok.type != type) {		Error(errMsg);		return false;	}	return true;}voidCompiler::Error(const char* msg, const char* s, UInt32 line){	if (line == 0xffffffff)		line = lineNumber;	if (errorFunction == 0) {		cout << "Error: " << msg;		if (s != 0)			cout << ": \"" << s << '"';		cout << " at line " << line << endl;	}	else		(*errorFunction)(errFuncUserData, (char*)msg, (char*)s, line);	errorState = true;	++errorCount;}voidCompiler::StartDefaultPass(){	if ((currentPass.passType & 0xFFFF0000) == (FOUR_CHAR_CODE('N','F','_','_') & 0xFFFF0000)) {		Error("normalization pass cannot contain any other rules");		currentPass.passType = kCode_Unic;	}	if (currentPass.passType == 0) {		currentPass.clear();	// should already be clear!		
currentPass.passType = kCode_BU;		currentPass.setLineNo(lineNumber);	}}voidCompiler::AppendToRule(const Item& item){	StartDefaultPass();	switch (ruleState) {		case notInRule:			ruleState = inLHSString;			currentRule.setLineNo(lineNumber);		case inLHSString:			currentRule.lhsString.push_back(item);			break;		case inLHSPreContext:			currentRule.lhsPreContext.push_back(item);			break;		case inLHSPostContext:			currentRule.lhsPostContext.push_back(item);			break;		case inRHSString:			currentRule.rhsString.push_back(item);			break;		case inRHSPreContext:			currentRule.rhsPreContext.push_back(item);			break;		case inRHSPostContext:			currentRule.rhsPostContext.push_back(item);			break;	}}UInt32Compiler::charLimit(){	UInt32	limit;	switch (ruleState) {		case inRHSString:		case inRHSPreContext:		case inRHSPostContext:			limit = (currentPass.passType == kCode_BU || currentPass.passType == kCode_Unic ? 0x10ffff : 0xff);			break;		default:			limit = (currentPass.passType == kCode_UB || currentPass.passType == kCode_Unic ? 0x10ffff : 0xff);			break;	}	return limit;}voidCompiler::AppendLiteral(UInt32 val, bool negate){	StartDefaultPass();	if (val > charLimit()) {		Error("literal value out of range");		return;	}	Item	item;	item.type = 0;	item.negate = negate ? 1 : 0;	item.repeatMin = 0xff;	item.repeatMax = 0xff;	item.val = val;	AppendToRule(item);}voidCompiler::AppendUSV(UInt32 val, bool negate){	StartDefaultPass();	if (charLimit() == 0xff) {		Error("can't use Unicode character in byte encoding");		return;	}	AppendLiteral(val, negate);}voidCompiler::AppendSpecial(UInt8 type, bool negate){	Item	item;	item.type = type;	item.negate = negate ? 1 : 0;	item.repeatMin = 0xff;	item.repeatMax = 0xff;	item.val = 0;	item.start = item.next = item.after = item.index = 0xff;	AppendToRule(item);}voidCompiler::AppendClass(const string& className, bool negate){	StartDefaultPass();	Item	item;	item.type = kMatchElem_Type_Class;	item.negate = negate ? 
1 : 0;	item.repeatMin = 0xff;	item.repeatMax = 0xff;	item.val = 0;	const map<string,UInt32>*	classNames;	switch (ruleState) {		case inRHSString:		case inRHSPreContext:		case inRHSPostContext:			classNames = (currentPass.passType == kCode_Byte || currentPass.passType == kCode_UB)							? &currentPass.byteClassNames : &currentPass.uniClassNames;			break;		default:			classNames = (currentPass.passType == kCode_Byte || currentPass.passType == kCode_BU)							? &currentPass.byteClassNames : &currentPass.uniClassNames;			break;	}	map<string,UInt32>::const_iterator	i;	i = classNames->find(className);	if (i == classNames->end())		Error("undefined class", className.c_str());	else		item.val = i->second;	AppendToRule(item);}boolCompiler::tagExists(bool rhs, const string& tag){	if (rhs) {		if (   (findTag(tag, currentRule.rhsString) != -1)			|| (findTag(tag, currentRule.rhsPreContext) != -1)			|| (findTag(tag, currentRule.rhsPostContext) != -1))			return true;	}	else {		if (   (findTag(tag, currentRule.lhsString) != -1)			|| (findTag(tag, currentRule.lhsPreContext) != -1)			|| (findTag(tag, currentRule.lhsPostContext) != -1))			return true;	}	return false;}voidCompiler::AssignTag(const string& tag){	if (currentPass.passType == 0 || ruleState == notInRule) {		Error("item tag doesn't seem to be attached to a rule item", tag.c_str());		return;	}	Item*	item = NULL;	switch (ruleState) {		default:			Error("this can't happen (AssignTag)");			return;		case inLHSString:			if (tagExists(false, tag))				break;			item = &currentRule.lhsString.back();			break;		case inLHSPreContext:			if (tagExists(false, tag))				break;			item = &currentRule.lhsPreContext.back();			break;		case inLHSPostContext:			if (tagExists(false, tag))				break;			item = &currentRule.lhsPostContext.back();			break;		case inRHSString:			if (tagExists(true, tag))				break;			item = &currentRule.rhsString.back();			break;		case inRHSPreContext:			if (tagExists(true, tag))				break;			item = 
&currentRule.rhsPreContext.back();			break;		case inRHSPostContext:			if (tagExists(true, tag))				break;			item = &currentRule.rhsPostContext.back();			break;	}	if (item == NULL) {		Error("duplicate tag (ignored)", tag.c_str());		return;	}	if (item->tag.length() > 0) {		Error("rule item already has a tag", tag.c_str());		return;	}	switch (item->type) {		case 0:		case kMatchElem_Type_Class:		case kMatchElem_Type_EGroup:		case kMatchElem_Type_ANY:		case kMatchElem_Type_Copy:			item->tag = tag;			break;					default:			Error("invalid use of item tag", tag.c_str());			break;	}}voidCompiler::SetMinMax(int repeatMin, int repeatMax){	Item*	item = 0;	switch (ruleState) {		default:			Error("invalid use of repeat count");			break;		case inLHSString:			item = &currentRule.lhsString.back();			break;		case inLHSPreContext:			item = &currentRule.lhsPreContext.back();			break;		case inLHSPostContext:			item = &currentRule.lhsPostContext.back();			break;		case inRHSString:			item = &currentRule.rhsString.back();			break;		case inRHSPreContext:			item = &currentRule.rhsPreContext.back();			break;		case inRHSPostContext:			item = &currentRule.rhsPostContext.back();			break;	}	if (item) {		switch (item->type) {			case 0:			case kMatchElem_Type_Class:			case kMatchElem_Type_ANY:			case kMatchElem_Type_EGroup:				if (repeatMin > repeatMax || repeatMax < 1 || repeatMax > 15)					Error("invalid repeat counts (0-15 allowed)");				else if (item->repeatMin != 0xff)					Error("multiple repeat counts on item");				else {					item->repeatMin = repeatMin;					item->repeatMax = repeatMax;				}				break;			default:				Error("invalid use of repeat count");				break;		}	}}voidCompiler::setGroupPointers(vector<Item>::iterator b, vector<Item>::iterator e, int startIndex, bool isReversed){// set up the fwd and back pointers on bgroup/or/egroup// and propagate repeat counts from egroup to bgroup	vector<Item>::iterator	base = b;	vector<Item>::iterator	altStart = startIndex > 0 ? 
base - 1 : e;	bool altSeen = false;	while (b != e) {		if (b->repeatMin == 0xff)			b->repeatMin = 1;		if (b->repeatMax == 0xff)			b->repeatMax = 1;		switch (b->type) {			case 0:	// literal			case kMatchElem_Type_Class:			case kMatchElem_Type_ANY:			case kMatchElem_Type_EOS:				break;						case kMatchElem_Type_OR:				// if startIndex > 0, then initial altStart will be valid				if ((startIndex > 0 || altSeen) && (altStart->type == kMatchElem_Type_OR || altStart->type == kMatchElem_Type_BGroup))					altStart->next = startIndex + (b - base);				else {					Error("this can't happen (setGroupPointers 1)");					return;				}				altStart = b;				altStart->start = startIndex - 1;				altSeen = true;				break;			case kMatchElem_Type_EGroup:				Error("this can't happen (setGroupPointers 2)");				return;			case kMatchElem_Type_BGroup:				{					// need to find corresponding EGroup and copy repeat counts from there					// (or vice versa if this is reversed context)					vector<Item>::iterator subGroupStart = b++;					subGroupStart->next = 0;					int	nestingLevel = 0;					while (b->type != kMatchElem_Type_EGroup || nestingLevel > 0) {						if (b->type == kMatchElem_Type_BGroup)							++nestingLevel;						else if (b->type == kMatchElem_Type_EGroup)							--nestingLevel;						++b;					}					if (isReversed) {						b->repeatMin = subGroupStart->repeatMin;						b->repeatMax = subGroupStart->repeatMax;					}					else {						if (b->repeatMin == 0xff)							b->repeatMin = 1;						if (b->repeatMax == 0xff)							b->repeatMax = 1;						subGroupStart->repeatMin = b->repeatMin;						subGroupStart->repeatMax = b->repeatMax;					}					setGroupPointers(subGroupStart + 1, b, startIndex + (subGroupStart - base + 1), isReversed);					subGroupStart->after = startIndex + (b - base + 1);					b->start = startIndex + (subGroupStart - base);				}				break;		}		++b;	}	if (altSeen)		altStart->next = startIndex + (b - base);	// set NEXT pointer of last OR	if (startIndex > 0) {	// we were handling a group, so 
set pointers of EGroup		if (b->type == kMatchElem_Type_EGroup)			b->start = startIndex - 1;		else {			Error("this can't happen (setGroupPointers 3)");			return;		}	}}voidCompiler::setGroupPointers(vector<Rule>& rules){	for (vector<Rule>::iterator i = rules.begin(); i != rules.end(); ++i) {		setGroupPointers(i->matchStr.begin(), i->matchStr.end(), 0);		setGroupPointers(i->preContext.begin(), i->preContext.end(
上一页 1 2 3 4 5
💿 文件大小 7978 K
👤 上传用户 oujk123
📂 所属分类其他行业
🏷️ 相关标签

#text #middleware-layer #information #retrieval
⌨️ 快捷键说明

复制代码 Ctrl + C
搜索代码 Ctrl + F
全屏模式 F11
切换主题 Ctrl + Shift + D
显示快捷键 ?
增大字号 Ctrl + =
减小字号 Ctrl + -