📄 lexer.cpp

📁 Compiler Principles assignment / Compiler
💻 CPP
#include "lexer.h"
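
// lexer.cpp: a hand-written lexical analyzer for the compiler-construction
// assignment. File reads the source through a fixed-size buffer, Line holds
// one line of input, and Lexer turns the character stream into Tokens
// (punctuation, directives, character/string constants, numeric constants,
// identifiers and keywords).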

Token::Token()
{
	arg = 0;
	type = TOK_BAD;
}

Line::Line()
{
	_lineNum = 1;
	_eof = false;
	_fileName = NULL;
}

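// File wraps a stdio stream: the constructor opens the source file (and
// exits if that fails), after which GetLine() hands out whole lines and
// GetChar()/FillBuffer() do the underlying buffered reading.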
File::File(const char* fileName)
{
	_curPos = 0;
	_curLine = 0;
	_numElems = 0;
	strcpy(_fileName, fileName);
	_file = fopen(fileName, "r");

	if (_file == NULL)
	{
		printf("Source file open error: %s\n", fileName);
		exit(0);
	}
}

void File::GetLine(Line& line)
{
	int i = 0;
	++_curLine;
	char ch = GetChar();

	if (ch == EOF)
	{
		line._eof = true;
	}

	while ((ch != '\n') && (ch != EOF))
	{
		line._buffer[i++] = ch;
		ch = GetChar();

		if (ch == EOF)
		{
			line._eof = true;
		}
	}

	line._buffer[i] = '\0';
	line._lineNum = _curLine;
	line._fileName = _fileName;
}

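// GetChar() returns the next character from the internal buffer, refilling
// it from disk whenever the buffer is exhausted or has never been filled.
// FillBuffer() marks the end of the input with an EOF sentinel.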
char File::GetChar()
{
	if (_curPos == BUFFER_SIZE || _numElems == 0)
	{
		FillBuffer();
	}

	return _buffer[_curPos++];
}


void File::FillBuffer()
{
	int cb = fread(_buffer, sizeof(char), BUFFER_SIZE, _file);

	if (cb < BUFFER_SIZE)
	{
		if (!feof(_file))
		{
			printf("Source file reading error: %s\n", _fileName);
			exit(0);
		}
		else
		{
			// Short read at end of file: remember how many characters are
			// valid and terminate the buffer with an EOF sentinel.
			_numElems = cb;
			_buffer[cb] = EOF;
		}
	}
	else
	{
		_numElems = BUFFER_SIZE;
	}
	_curPos = 0;
}

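// The Lexer constructor registers the built-in type names (s8..s64, u8..u64,
// f32, f64, void) in the keyword table consulted by ProcessIdentifier(), and
// records the known directive names.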
Lexer::Lexer(File& file, Line& line)
:_curFile(file), _curLine(line), _curPos(0), _lastTokPos(0)
{
	MapBucket b;
	b._str = "s8";
	b._index = Int8;
	_keywords.Insert(b);
	b._str = "s16";
	b._index = Int16;
	_keywords.Insert(b);
	b._str = "s32";
	b._index = Int32;
	_keywords.Insert(b);
	b._str = "s64";
	b._index = Int64;
	_keywords.Insert(b);
	b._str = "u8";
	b._index = UInt8;
	_keywords.Insert(b);
	b._str = "u16";
	b._index = UInt16;
	_keywords.Insert(b);
	b._str = "u32";
	b._index = UInt32;
	_keywords.Insert(b);
	b._str = "u64";
	b._index = UInt64;
	_keywords.Insert(b);
	b._str = "f32";
	b._index = Float32;
	_keywords.Insert(b);
	b._str = "f64";
	b._index = Float64;
	_keywords.Insert(b);
	b._str = "void";
	b._index = Void;
	_keywords.Insert(b);
	_directives.Insert("entry");
}


void Lexer::Forward()
{
	++_curPos;
}

void Lexer::Back()
{
	--_curPos;
}

char Lexer::GetNextChar()
{
	return _curLine._buffer[_curPos++];
}

bool Lexer::Match(Token& tok, TokType type)
{
	return tok.type == type;
}


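// GetNextToken() is the lexer's main entry point. It skips whitespace and
// dispatches on the first character of the next token: punctuation is
// returned directly, '.' starts a directive, quotes start character/string
// constants, letters and '_' start an identifier, and digits or a sign start
// a numeric constant. At end of line it fetches the next line from the file,
// or returns TOK_EOF once the whole file has been read.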
void Lexer::GetNextToken(Token& tok)
{
	char ch = SkipWhiteSpace();
	_lastTokPos = _curPos;

	if (ch == '(')
	{
		tok.text[0] = ch;
		tok.text[1] = '\0';
		tok.type = TOK_LP_S;
	}
	else if (ch == ')')
	{
		tok.text[0] = ch;
		tok.text[1] = '\0';
		tok.type = TOK_RP_S;
	}
	else if (ch == '[')
	{
		tok.text[0] = ch;
		tok.text[1] = '\0';
		tok.type = TOK_LP_M;
	}
	else if (ch == ']')
	{
		tok.text[0] = ch;
		tok.text[1] = '\0';
		tok.type = TOK_RP_M;
	}
	else if (ch == '{')
	{
		tok.text[0] = ch;
		tok.text[1] = '\0';
		tok.type = TOK_LP_B;
	}
	else if (ch == '}')
	{
		tok.text[0] = ch;
		tok.text[1] = '\0';
		tok.type = TOK_RP_B;
	}
	else if (ch == '.')
	{
		ProcessDirective(tok);
	}
	else if (ch == '\'')
	{
		ProcessCharConst(tok);
	}
	else if (ch =='\"')
	{
		ProcessStrConst(tok);
	}
	else if ( ((ch >= 'a') && (ch <= 'z')) ||
		((ch >= 'A') && (ch <= 'Z')) || (ch == '_'))
	{
		ProcessIdentifier(tok);
	}
	else if ( ((ch >= '0') && (ch <= '9')) ||
		(ch == '-') || (ch == '+') )
	{
		ProcessNumConst(tok);
	}
	else if (ch == ',')
	{
		tok.type = TOK_COMMA;
		tok.text[0] = ch;
		tok.text[1] = '\0';
	}
	else if (ch == ';')
	{
		tok.text[0] = ';';
		tok.text[1] = '\0';
		tok.type = TOK_SEMICOLON;
	}
	else if (ch == '\0')
	{
		if (_curLine._eof)
		{
			tok.type = TOK_EOF;
			strcpy(tok.text, "EOF");
		}
		else
		{
			_curFile.GetLine(_curLine);
			_curPos = 0;
			GetNextToken(tok);
		}
	}
	else
	{
		tok.text[0] = ch;
		tok.text[1] = '\0';
		tok.type = TOK_BAD;
		OutputError(_curLine, ER_ELF0006);
	}
}


char Lexer::SkipWhiteSpace()
{
	char ch = GetNextChar();

	while (ch == ' ' || ch == '\n' || ch == '\t' || ch == '\r')
	{
		ch = GetNextChar();
	}
	return ch;
}

void Lexer::PrintToken(Token& tok)
{
	switch(tok.type) 
	{
	case TOK_BAD:
		printf("token is bad: %s\n\n", tok.text);
		break;
	case TOK_IDENTIFIER:
		printf("Identifier: %s\n", tok.text);
		break;
	case TOK_CHAR_CONST:
		printf("Char const: %c\n", tok.arg);
		break;
	case TOK_NOMORE:
		printf("No more\n");
		break;
	case TOK_INT_CONST:
		printf("int const: %s, arg %d\n", tok.text, tok.arg);
		break;
	case TOK_FLT_CONST:
		printf("float const: %s, arg %lf\n", tok.text, (*(double*)(&tok.arg)));
		break;
	case TOK_COMMA:
		printf("comma\n");
		break;
	case TOK_STR_CONST:
		printf("string: %s\n", tok.text);
		break;
	case TOK_KEYWORD:
		printf("keyword: %s\n", tok.text);
		break;
	case TOK_DIRECTIVE:
		printf("directive: %s\n", tok.text);
		break;
	case TOK_SEMICOLON:
		printf("semicolon\n");
		break;
	case TOK_EOF:
		printf("EOF\n");
		break;
	default:
		printf("error: unhandled token type\n");
	}
}

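// ProcessCharConst() is entered after the opening single quote. It accepts
// one printable character (ASCII 32..126) followed by the closing quote;
// anything else produces TOK_BAD.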
void Lexer::ProcessCharConst(Token& tok)
{
	char ch, next;
	ch = GetNextChar();

	if (ch >= 32 && ch <= 126)
	{
		tok.type = TOK_CHAR_CONST;
		tok.text[0] = ch;
		tok.text[1] = '\0';
		tok.arg = ch;
		next = GetNextChar();

		if (next != '\'')
		{
			tok.type = TOK_BAD;
			tok.text[0] = '\'';
			tok.text[1] = ch;
			tok.text[2] = next;
			tok.text[3] = '\0';
		}
	}
	else
	{
		// Not a printable character: report a bad token.
		tok.type = TOK_BAD;
		tok.text[0] = '\'';
		tok.text[1] = ch;
		tok.text[2] = '\0';
	}
}

// ProcessDirective() is entered after a leading '.' and collects the letters
// that follow it, e.g. ".entry".
void Lexer::ProcessDirective(Token& tok)
{
	int i = 0;
	tok.type = TOK_DIRECTIVE;
	char ch = GetNextChar();
	while ( ((ch >= 'a') && (ch <= 'z')) ||
		((ch >= 'A') && (ch <= 'Z')) )
	{
		tok.text[i++] = ch;
		ch = GetNextChar();
	}
	Back();		// push back the first character that is not part of the name
	tok.text[i] = '\0';

	if (i == 0)
	{
		// A '.' with no name after it is not a valid directive.
		tok.type = TOK_BAD;
		OutputError(_curLine, ER_ELF0005);
	}
}

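// ProcessStrConst() is entered after the opening double quote and copies
// characters up to the closing quote, translating the escapes \n, \t and \r.
// Hitting the end of the line before a closing quote produces TOK_BAD.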
void Lexer::ProcessStrConst(Token& tok)
{
	int i = 0;
	char ch = GetNextChar();

	while (ch != '\"')
	{
		if (ch == '\\')
		{
			switch(GetNextChar())
			{
			case 'n':
				ch = '\n';
				break;
			case 't':
				ch = '\t';
				break;
			case 'r':
				ch = '\r';
				break;
			default:
				printf("Error: String not recognized\n");
				exit(0);
			}
		}
		else if (ch == '\0')
		{
			// End of line reached before the closing quote.
			tok.type = TOK_BAD;
			tok.text[i] = '\0';
			return;
		}
		tok.text[i++] = ch;
		ch = GetNextChar();
	}
	tok.type = TOK_STR_CONST;
	tok.text[i] = '\0';
}

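// ProcessNumConst() recognizes signed integer and floating-point literals:
//
//     num -> (+|-)? digits ('.' digits)? ((e|E) (+|-)? digits)?
//
// It is entered with the sign or first digit already consumed, so it first
// steps back one character. Integers are converted with _atoi64(); floats
// are converted with atof() and their bit pattern is stored in tok.arg.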
void Lexer::ProcessNumConst(Token& tok)
{
	int i = 0;
	Back();
	char ch = GetNextChar();

	if (ch == '+' || ch == '-')
	{//num -> (+|-)?digits;
		tok.text[i++] = ch;
		ch = GetNextChar();
	}

	if (ch >= '0' && ch <= '9')
	{
		if (ch == '0')
		{
			ch = GetNextChar();
			if (ch == '.')
			{//0.digits
				ch = '0';
				Back();
			}
			else
			{//a single zero
				tok.text[i++] = '0';
				tok.text[i] = '\0';
				tok.type = TOK_INT_CONST;
				tok.arg = 0;
				Back();		// push back the character after the zero
				return;
			}
		}

		while (ch >='0' && ch <= '9')
		{//digits -> 0|1|2....9;
			tok.text[i++] = ch;
			ch = GetNextChar();
		}

		if (ch == '.')
		{
			tok.text[i++] = '.';
			ch = GetNextChar();

			if (ch >= '0' && ch <= '9')
			{//num -> (+|-)?(digits)*.(digits)*
				while (ch >= '0' && ch <= '9')
				{
					tok.text[i++] = ch;
					ch = GetNextChar();
				}
			}
			else
			{//(+|-)?(digits)*.x (x not digits)
				tok.text[i++] = '\0';
				tok.type = TOK_BAD;
				Back();
				return;
			}

			if (ch == 'e' || ch == 'E')
			{//(+|-)?(digits)*.(digits)*(e|E)?
				tok.text[i++] = ch;
				ch = GetNextChar();

				if (ch == '+' || ch == '-')
				{//(+|-)?(digits)*.(digits)*(e|E)?(+|-)?
					tok.text[i++] = ch;
					ch = GetNextChar();
				}

				if (ch >= '0' && ch <= '9')
				{//(+|-)?(digits)*.(digits)*(e|E)?(+|-)?(digits)*
					while (ch >= '0' && ch <= '9')
					{
						tok.text[i++] = ch;
						ch = GetNextChar();
					}

					tok.text[i] = '\0';
					tok.type = TOK_FLT_CONST;
					double n = atof(tok.text);
					tok.arg = (*(S64*)(&n));
					Back();
					CheckNumeric(tok);
					return;
				}
				else
				{//(+|-)?(digits)*.(digits)*(e|E)?(+|-)?x (x not a digit)
					tok.text[i] = '\0';
					tok.type = TOK_BAD;
					Back();
				}
			}
			else
			{//(+|-)?(digits)*.(digits)*
				tok.text[i++] = '\0';
				tok.type = TOK_FLT_CONST;
				double n = atof(tok.text);
				tok.arg = (*(S64*)(&n));
				Back();
				CheckNumeric(tok);
				return;
			}
		}
		else if (ch == 'e' || ch == 'E')
		{//(+|-)?(digits)*(e|E)?
			tok.text[i++] = ch;
			ch = GetNextChar();

			if (ch == '+' || ch == '-')
			{//(+|-)?(digits)*(e|E)?(+|-)?
				tok.text[i++] = ch;
				ch = GetNextChar();
			}

			if (ch >= '0' && ch <= '9')
			{//(+|-)?(digits)*(e|E)?(+|-)?(digits)*
				while (ch >= '0' && ch <= '9')
				{

					tok.text[i++] = ch;
					ch = GetNextChar();
				}
				tok.text[i] = '\0';
				tok.type = TOK_FLT_CONST;
				double n = atof(tok.text);
				tok.arg = (*(S64*)(&n));
				Back();
				CheckNumeric(tok);
				return;
			}
			else
			{
				tok.text[i] = '\0';
				tok.type = TOK_BAD;
				Back();
				return;
			}
		}
		else
		{//(+|-)?(digits)*
			tok.text[i] = '\0';
			tok.type = TOK_INT_CONST;
			tok.arg = _atoi64(tok.text);
			Back();
			CheckNumeric(tok);
			return;
		}
	}
	else
	{
		tok.text[i] = '\0';
		tok.type = TOK_BAD;
		Back();
	}
}

void Lexer::CheckNumeric(Token& tok)
{
	// Enforce an upper bound on the textual length of a numeric literal.
	int len = strlen(tok.text);

	if (len > 11)
	{
		tok.type = TOK_BAD;
		strcpy(tok.text, "Numeric literal too long");
	}
}



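// ProcessIdentifier() collects letters, digits, '_' and '.' into a name and
// then looks it up in the keyword table; a hit turns the token into
// TOK_KEYWORD with the keyword's index in tok.arg.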
void Lexer::ProcessIdentifier(Token& tok)
{
	tok.type = TOK_IDENTIFIER;
	int i = 0;
	Back();
	char ch = GetNextChar();

	while ( ((ch >= 'a') && (ch <= 'z')) ||
		((ch >= 'A') && (ch <= 'Z')) || 
		(ch == '.') ||(ch == '_') || 
		((ch <= '9') && (ch >= '0')) )
	{	
		tok.text[i++] = ch;
		ch = GetNextChar();
	}

	Back();		// push back the first character that is not part of the identifier
	tok.text[i] = '\0';

	MapBucket b = _keywords.Exsits(MapBucket(tok.text));
	if (b._index != -1)
	{
		tok.type = TOK_KEYWORD;
		tok.arg = b._index;
	}
}
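
// A minimal, hypothetical driver for this lexer (not part of the original
// assignment). It assumes lexer.h declares File, Line, Lexer, Token and
// TOK_EOF exactly as they are used above; compile with -DLEXER_DEMO_MAIN to
// include it.
#ifdef LEXER_DEMO_MAIN
int main(int argc, char* argv[])
{
	if (argc < 2)
	{
		printf("usage: %s <source-file>\n", argv[0]);
		return 1;
	}

	File file(argv[1]);			// exits if the file cannot be opened
	Line line;
	file.GetLine(line);			// prime the first line before lexing
	Lexer lexer(file, line);

	Token tok;
	do
	{
		lexer.GetNextToken(tok);
		lexer.PrintToken(tok);
	} while (tok.type != TOK_EOF);

	return 0;
}
#endif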
