📄 lex.c
字号:
/*
 * A lexical analyzer for the C language.
 * John Viega
 *
 * Jul 28, 1999
 *
 * Lexical Analyzer based on the reference manual in K+R, 2nd Edition.
 * Should handle all ANSI C just fine. I think this also happens to
 * scan all valid ANSI C++.
 *
 * This program is a big state machine. Often we will jump from
 * state to state with a goto. IMHO, the gotos aren't a big deal.
 * However, this file remains pretty unreadable for other reasons.
 * I'm basically implementing a fully compliant ANSI C(++) lexer
 * (minus some non-important stuff) in a single pass. Usually it's
 * done in multiple passes, probably for clarity's sake...
 * I was going for minimizing time of implementation, myself.
 * Multiple passes would have taken me a lot longer to get right wrt
 * line numbers in some situations.
 * As a result, there are some gross things such as checking for a
 * backslash followed by a newline EVERYWHERE.
 *
 * What isn't done:
 * <><><> Necessary for "correct" functionality for this app <><><>
 *
 * - Trigraphs need to be handled.
 *     ??=   #
 *     ??/   \ (backslash)
 *     ??'   ^
 *     ??(   [
 *     ??)   ]
 *     ??!   |
 *     ??<   {
 *     ??>   }
 *     ??-   ~
 *     ???   supposedly goes to ?
 * - Trigraphs are supposed to work in a string; digraphs do not.
 * - I sometimes make an assumption that EOF is going to come after a \n.
 *
 * <><><> Desirable, but not necessary <><><>
 * - Wide strings are processed, but treated like regular strings.
 * - Keywords are treated as identifiers. That's OK for my purposes, tho.
 * - See also "TODO:" items in place.
 *
 *
 * Jan 25, 2000:
 * Apparently, gcc allows $ in identifiers. Might as well recognize it.
 *
 * Feb 1, 2000:
 * Go ahead and treat \r as whitespace in the same context as \t, \n, etc.
*/#include "lex.H"#include "fatal.H"#include "config.H"#include <ctype.h> // For isalanum(x)#include <stdio.h>#define STD_TOKENS 0#define PREPROC_TOKENS 1Lex::~Lex(){ if(free_input) delete[] input; if(str) delete[] str; if(token_box) delete token_box; if(comment_box) delete comment_box;}Lex::Lex(FILE *f, char *srcid, int cpp){ return_on_error = 0; free_input = 1; const size_t file_incr = 1024*10; // alloc 10K at a time. char *buf = new char[file_incr]; if(!buf) OutOfMemory(); char *buf_pos = buf; long size = 0; while(1) { size_t s = fread(buf_pos, sizeof(char), file_incr, f); size += s; if(s != file_incr) { if(feof(f) > 0) { Init(buf, size, srcid, 1, cpp); return; } else { fprintf(stderr, "%s" NEWLINE, strerror(ferror(f))); fprintf(stderr, "End of file not reached. Progressing anyway." NEWLINE); Init(buf, size, srcid, 1, cpp); return; } } char *tmp = new char[file_incr + size]; if(!tmp) OutOfMemory(); buf_pos = &tmp[size]; memcpy(tmp, buf, size); delete[] buf; buf = tmp; }}Lex::Lex(char* s, long len, char *srcid, unsigned int l, int err, int cpp){ free_input = 0; return_on_error = err; Init(s, len, srcid, l, cpp);}void Lex::Init(char *s, long len, char *srcid, int l, int cpp){ cpp_mode = cpp; input = s; input_size = len; source_id = srcid; token_box = new TokenContainer(); if(!token_box) OutOfMemory(); comment_box = new TokenContainer(); if(!comment_box) OutOfMemory(); str_pos = 0; str_len = 0; pos = 0; lineno_offset = 0; comment_lineno_offset = 0; str = NULL; lineno = l; Scan();}int Lex::LexCComment(){ int t; StartCComment(); while (1) { t = GetChar(); switch(t) { case EOF: return 0; case '\n': AddCharToComment(t); lineno++; comment_lineno_offset++; lineno_offset++; continue; case '*': if ((t = GetChar()) == '/') { EndComment(); return 1; } else { AddCharToComment('*'); UngetChar(t); continue; } default: AddCharToComment(t); continue; } }}void Lex::LexCPPComment(){ int t; StartCPPComment(); while(1) { switch (t=GetChar()) { case EOF: return; case '\\': 
switch(t=GetChar()) { case '\n': lineno++; comment_lineno_offset++; lineno_offset++; continue; default: UngetChar(t); AddCharToComment('\\'); continue; } case '\n': EndComment(); lineno++; /* Do this for the following: * x = x + // foo * #if 1 * 2; * #endif */ return; default: AddCharToComment(t); continue; } }}void Lex::StartHexChr(char c){ if(isdigit(c)) { chr_val = c - '0'; } else if(islower(c)) { chr_val = c - 'a' + 10; } else { chr_val = c - 'A' + 10; }}void Lex::AddHexChr(char c){ if(isdigit(c)) { chr_val = (chr_val << 4) + (c - '0'); } else if(islower(c)) { chr_val = (chr_val << 4) + (c - 'a' + 10); } else { chr_val = (chr_val << 4) + (c - 'A' + 10); }}void Lex::EndHexChr(){ GenChr(chr_val);}void Lex::StartOctChr(char c){ chr_val = c - '0';}void Lex::AddOctChr(char c){ chr_val = (chr_val << 3) + (c - '0');}void Lex::EndOctChr(){ GenChr(chr_val);}void Lex::StartIdentifier(char ch){ AddCharToStr(ch);}void Lex::ContinueIdentifier(char ch){ AddCharToStr(ch);}void Lex::EndIdentifier(){ str[str_pos++] = '\0'; char *tmp = new char[str_pos]; if(!tmp) OutOfMemory(); strncpy(tmp, str, str_pos); IdTok *tok = new IdTok(tmp, str_pos, lineno-lineno_offset, lineno); if(!tok) OutOfMemory(); token_box->Add(tok); lineno_offset = 0; str_pos = 0;}void Lex::StartHexNum(){ real = 0; num_val = 0; looks_octal = 0; unsigned_flag = 0; long_flag = 0;}void Lex::AddHexDigit(char c){ if(isdigit(c)) { num_val = (num_val << 4) + (c - '0'); } else if(islower(c)) { num_val = (num_val << 4) + (c - 'a' + 10); } else { num_val = (num_val << 4) + (c - 'A' + 10); }}void Lex::EndNum(){ if(!real) { IntegerTok *tok = new IntegerTok( looks_octal ? oct_val : num_val, unsigned_flag, long_flag, lineno-lineno_offset); if(!tok) OutOfMemory(); token_box->Add(tok); lineno_offset = 0; } else { RealTok *tok = new RealTok(num_val, mant_val, exp_neg_flag ? -exp : exp, float_flag ? FLOAT : (long_flag ? 
LONG_DOUBLE : DOUBLE), lineno - lineno_offset); if(!tok) OutOfMemory(); token_box->Add(tok); lineno_offset = 0; }}void Lex::StartBase10OrLowerNum(char c){ real = 0; mant_val = 0; exp = 0; unsigned_flag = 0; long_flag = 0; float_flag = 0; exp_neg_flag = 0; num_val = 0; switch(c) { case '.': real = 1; return; case '0': looks_octal = 1; oct_val = 0; return; default: looks_octal = 0; num_val = c - '0'; return; }}void Lex::BeginExponent(char c){ switch(c) { case '+': return; case '-': exp_neg_flag = 1; return; default: exp = c - '0'; }}void Lex::AddExponent(char c){ exp = exp * 10 + (c - '0');}void Lex::AddOctDigit(char c) { // In case it turns out to be a float. num_val = (num_val*10) + (c - '0'); oct_val = (oct_val << 3) + (c - '0'); }void Lex::AddDecDigit(char c){ long int *which_val; if(c == '.') { real = 1; return; } if(real == 0) { which_val = &num_val; } else { which_val = &mant_val; } *which_val = *which_val * 10 + (c - '0');}void Lex::MakeLong(){ long_flag = 1;}void Lex::MakeUnsigned(){ unsigned_flag = 1;}void Lex::MakeFloat() { float_flag = 1;}void Lex::GenChr(long c){ CharTok *tok = new CharTok(c, lineno-lineno_offset); if(!tok) OutOfMemory(); token_box->Add(tok); lineno_offset = 0;}void Lex::AddCharToComment(char c){ AddCharToStr(c);}void Lex::LexPreprocNumber(char c){ AddCharToStr(c); while(1) { c = GetChar(); if(!isalnum(c) && (c != '.') && (c != '_') && (c != '\\')) { UngetChar(c); finish_ppnum: if(!str_pos && !str) { str = new char[1]; if(!str) OutOfMemory(); str[0] = 0; } str[str_pos++] = 0; char *tmp = new char[str_pos]; if(!tmp)OutOfMemory(); strncpy(tmp, str, str_pos); PreprocNumTok *tok = new PreprocNumTok(tmp, str_pos-1, lineno-lineno_offset); if(!tok) OutOfMemory(); token_box->Add(tok); lineno_offset = str_pos = 0; return; } switch(c) { case '\\': switch((c = GetChar())) { case '\n': lineno++; lineno_offset++; continue; default: UngetChar(c); UngetChar('\\'); goto finish_ppnum; } case 'e': case 'E': AddCharToStr(c); switch((c = GetChar())) { case 
'+': case '-': AddCharToStr(c); continue; default: UngetChar(c); continue; } default: AddCharToStr(c); continue; } }}void Lex::AddCharToStr(char c){ if(str_pos == str_len) { char *tmp = new char[str_len + BUFFER_SIZE]; if(!tmp) OutOfMemory(); memcpy(tmp, str, str_pos); str_len += BUFFER_SIZE; if(str) delete[] str; str = tmp; } str[str_pos++] = c;}void Lex::EndStr(){ if(!str_pos && !str) { str = new char[1]; if(!str) OutOfMemory(); str[0] = 0; } str[str_pos++] = 0; char *tmp = new char[str_pos]; if(!tmp) OutOfMemory(); strncpy(tmp, str, str_pos); StringTok *tok = new StringTok(tmp, str_pos-1, lineno - lineno_offset); if(!tok) OutOfMemory(); token_box->Add(tok); lineno_offset = 0; str_pos = 0;}void Lex::EndComment(){ if(!str_pos && !str) { str = new char[1]; if(!str) OutOfMemory(); str[0] = 0; } str[str_pos++] = 0; char *tmp = new char[str_pos]; if(!tmp) OutOfMemory(); strncpy(tmp, str, str_pos); CommentTok *tok = new CommentTok(tmp, str_pos, lineno-lineno_offset, cpp_comment, token_box->GetCurrentSize(), lineno, 0); if(!tok) OutOfMemory(); token_box->Add(tok); // In the comment box, we don't free the string, so we pass the 1 param // to say this. tok = new CommentTok(tmp, str_pos, lineno-lineno_offset, cpp_comment, token_box->GetCurrentSize(), lineno, 1); comment_box->Add(tok); comment_lineno_offset = 0; str_pos = 0;}void Lex::GenOp(char *s){ OperatorTok *tok = new OperatorTok(s, lineno - lineno_offset); if(!tok) OutOfMemory(); token_box->Add(tok); lineno_offset = 0;}int Lex::GetChar(){ if(input_size <= pos) { return EOF; } return input[pos++];}void Lex::UngetChar(int c){ if(c == EOF) return; if(pos <= 0) { return; } input[--pos] = c;}// Return 1 if we found junk, 0 otherwise.int Lex::LexPreprocessorStuff() { int t; while(1) { switch(t = GetChar()) { case '/': try_again: switch(t = GetChar()) { case '\\': switch(t = GetChar()) { case '\n': lineno++; // Don't lineno_offset++; not in a token yet. 
goto try_again; default: UngetChar(t); UngetChar('\\'); UngetChar('/'); return 0; } case '/': LexCPPComment(); continue; case '*': if(!LexCComment()) { // TODO: Make sure this is ok behavior. if(return_on_error) return 0; fprintf(stderr, "%s: Error: Unterminated comment." NEWLINE, source_id); exit(0); } else { continue; } default: UngetChar(t); UngetChar('/'); return 0; } case '\\': switch(t = GetChar()) { case '\n': lineno++; // Don't lineno_offset++; not in a token yet. continue; default: UngetChar(t); UngetChar('\\'); return 0; } case '#': goto remove_directive; case ' ': case '\t': case '\v': case '\r': case '\f': continue; default: UngetChar(t); return 0; } } remove_directive: Token *tok = new PreprocStartToken(lineno-lineno_offset); if(!tok) OutOfMemory(); token_box->Add(tok); lineno_offset = 0; int old_return_on_error = return_on_error; return_on_error = 1; ScanLine(PREPROC_TOKENS); return_on_error = old_return_on_error; // Preprocessor guy ended on the previous line, since ScanLine // bumped it up by 1. 
tok = new PreprocEndToken(lineno-lineno_offset-1); if(!tok) OutOfMemory(); token_box->Add(tok); lineno_offset = 0; return 1;}void Lex::Scan() { do { while(LexPreprocessorStuff()); }while(ScanLine(STD_TOKENS));}int Lex::ScanLine(int preproc){ int t, t2, saved_char; next: t = GetChar(); switch(t){ case EOF: return 0; case '\n': lineno++; return 1; case '\t': case '\v': case '\r': case '\f': case ' ': goto next; case ';': GenOp(";"); goto next; case '/': slash_start: switch(t = GetChar()) { case '\\': switch(t = GetChar()) { case '\n': lineno++; lineno_offset++; goto slash_start; default: UngetChar(t); UngetChar('\\'); GenOp("/"); goto next; } case '*': if(!LexCComment()) goto unterminatedCommentError; else { lineno_offset = 0; goto next; } case '/': LexCPPComment(); lineno_offset = 0; return 1; case '=': GenOp("/="); goto next; default: UngetChar(t); GenOp("/"); goto next; } case '-': minus_start: switch(t = GetChar()) { case '\\': switch(t = GetChar()) { case '\n': lineno++; lineno_offset++; goto minus_start; default: UngetChar(t); UngetChar('\\'); GenOp("-"); goto next; } case '-': GenOp("--"); goto next; case '>': GenOp("->"); goto next; case '=': GenOp("-="); goto next; default: UngetChar(t); GenOp("-"); goto next; } case '+': plus_start: switch(t = GetChar()) { case '\\': switch(t = GetChar()) { case '\n': lineno++; lineno_offset++; goto plus_start;
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -