📄 scanner.cc
字号:
////////////////////////////////////////////////////////////////////////////////// scanner.cc//#include <strstream>#include <ctype.h>#include "tokens.h"#include "scanner.h"#include "scantab.h"static string_number_t string_hash_table[HASH_TABLE_SIZE];////////// Forward references for static functions://static void insert_hash_table(string_number_t &loc);////////// Establishes initial values for static variables.// Does one sanity check and returns the number of tokens so the// caller can do another sanity check.//int init_scanner(void){ int offset; short hash; char * sp; // Initialize the scanner token hash table. for (int i = 0; i < HASH_TABLE_SIZE; i++) string_hash_table[i] = -1; for (offset = 0; st_string_space[offset]; offset += strlen(sp) + 1) { hash = offset; insert_hash_table(hash); sp = st_string_space + offset; } assert(MAJ_IDENT == st_id_major_num); return st_num_terminals; // return so can check with parser}////////// Inserts a character into the next available slot in StringSpace.// Checks against overflow. Converts all lower-case characters to upper// case, on the assumption that case is insignificant.//static void insert_string_space(char ch){ if (st_used_strings == MAX_STRING_SPACE) { ostrstream ost; ost << "Out of room in scanner string storage area: " << st_used_strings << " of " << MAX_STRING_SPACE; die_compiler(ost.str()); } st_string_space[st_used_strings++] = toupper(ch);}////////// Computes a hash function of the input string by modulo division. Treats// character string as a very long integer, and divides one 'place' at a time.//static hash_index_t hash(string_number_t location){ hash_index_t remainder = 1; while (st_string_space[location] != '\0') { remainder = (((remainder-1) * MAX_CHAR_CODE + st_string_space[location]) % (HASH_TABLE_SIZE)) + 1; location++; } return remainder;}////////// Compares two strings and determines if they are the same.//static bool equal_strings(string_number_t s1, string_number_t s2){ for (; st_string_space[s1] == st_string_space[s2]; s1++, s2++) { if (st_string_space[s1] == '\0') { return true; } } return false;}////////// Put token into the hash table. Guards against hash table overflow.// loc comes in with the index in st_string_space of the token to be inserted.// This token MUST BE the last valid thing currently in st_string_space.// If a token is a copy of one already found, the new version is removed, by// decrementing st_used_strings. In that event, parameter loc is modified// to index of the existing version.//static void insert_hash_table(string_number_t &loc){ hash_index_t hash_value; hash_index_t bucket; hash_value = hash(loc); bucket = hash_value; for (;;) { if (string_hash_table[bucket - 1] == -1) { string_hash_table[bucket-1] = loc; break; } else { // this bucket is used -- by the same token? if (equal_strings(loc, string_hash_table[bucket - 1])) { st_used_strings = loc; // discard duplicate loc = string_hash_table[bucket - 1]; // return existing copy break; } else // we have a collision and must re-hash { bucket = (bucket + hash_value) % HASH_TABLE_SIZE; if (bucket == hash_value) { // we have looked at every bucket ostrstream ost; ost << "Out of room in scanner hash table: " << bucket << "==" << hash_value; die_compiler(ost.str()); } } } }}////////// Allows external routines to insert identifiers into StringSpace and// the HashTable. Used to set up pre-defined objects.//string_number_t define_ident(const char *ident){ string_number_t start; start = st_used_strings; for (int i = 0; ident[i]; i++) insert_string_space(ident[i]); insert_string_space('\0'); insert_hash_table(start); return start;}////////// Copes with 'illegal' states in the scanner pt_table// caused by invalid input or insufficient look-ahead.// Returns a new value for the state vector, to replace// the old value, which was 'illegal.'//// state_vector_t is a new value for the state vector,// designed to allow (nearly) normal continuation of the algorithm.//// If the issue_error was the result of invalid input, the// illegal token will be removed from StringSpace.// If the issue_error was due to the inability of the tables// to provide for look-ahead, the character mistakenly// consumed is returned to the input buffer.//// Note that this procedure works by means of fooling the// rest of the scanner into thinking that it had a valid// terminal state all along.//static int error_chars;static int nonwhite_since_t;static void handle_table_error(state_vector_t *state_vector, scanner_state_t prev_terminal, char current_char, int chars_seen, int chars_appended){ if (chars_seen == 0 // ie. current state is st_start_state && current_char == FILE_END) { // we need to return End-Of-File to whoever called the scanner // note that an issue_error can never occur in a terminal state state_vector->action = HALT_NO_APPEND; state_vector->major = st_end_of_file; state_vector->minor = 0; } else { if ((prev_terminal >= 0)&&(prev_terminal != st_start_state)) { // we have had a terminal state in this token unread_chars(chars_seen); state_vector->action = HALT_REUSE; st_used_strings = st_used_strings - chars_appended; state_vector->major = st_state_info[(int)prev_terminal].default_major; state_vector->minor = st_state_info[(int)prev_terminal].default_minor; } else // we have a lexical issue_error { error_chars = error_chars + nonwhite_since_t + 1; st_used_strings = st_used_strings - chars_appended; state_vector->action = HALT_NO_APPEND; state_vector->major = MAJ_SPACE; state_vector->minor = 0; } }}////////// Looks up an identifier in the st_reserver_words array, and returns// its major and minor token numbers. Does nothing at all if identifier// is NOT a reserve word. Uses a simple binary search.//static void find_reserve_word(string_number_t location, major_token_t& major_num, major_token_t& minor_num){ int low; int middle; int high; string_number_t current_space_point; low = 0; high = st_num_reserved_words - 1; if (location > st_max_reserve_location) { return; } for (;;) { middle = (low + high) / 2; current_space_point = st_reserved_words[middle].text_handle; if (location == current_space_point) // we are done break; // leave loop else if (location < current_space_point) high = middle - 1; // search lower half else low = middle + 1; // search upper half } major_num = st_reserved_words[middle].major; minor_num = st_reserved_words[middle].minor;}////////// Lexical Scanner:// Retrieves next token from input file and returns descriptive information.// Never returns white space. Returns an actual token for end-of-file, one// larger than largest real token.//major_token_t token_get(token_attrib_t& attr){ string_number_t token_start; scanner_state_t new_state; scanner_state_t prev_terminal; char_class_t next_class; int chars_seen; int chars_appended; char next_char; state_vector_t state_vector; scanner_act_t next_action; bool exist_lower_case; error_chars = 0; do { token_start = st_used_strings; exist_lower_case = false; prev_terminal = -1; // in case we bomb on early char chars_seen = 0; nonwhite_since_t = 0; chars_appended = 0; new_state = st_start_state; next_char = read_char(); attr.location = get_source_location(); do { next_class = st_char_class[next_char]; state_vector = st_state_matrix[new_state][next_class]; if (state_vector.action == ERROR) handle_table_error(&state_vector, prev_terminal, next_char, chars_seen, chars_appended); next_action = state_vector.action; if (next_action == MOVE_APPEND || next_action == MOVE_NO_APPEND) { if (st_state_info[(int)new_state].terminal) { prev_terminal = new_state; chars_seen = 0; nonwhite_since_t = 0; chars_appended = 0; } new_state = state_vector.next_state; chars_seen++; if (next_char != ' ' && next_char != '\t' && next_char != '\n') nonwhite_since_t = nonwhite_since_t + 1; if (next_action == MOVE_APPEND) { if (next_char >= 'a' && next_char <= 'z') exist_lower_case = true; insert_string_space(next_char); chars_appended++; } next_char = read_char(); } } while (next_action == MOVE_APPEND || next_action == MOVE_NO_APPEND); if (next_action == HALT_APPEND) { if (next_char >= 'a' && next_char <= 'z') exist_lower_case = true; insert_string_space(next_char); } else { if (next_action == HALT_REUSE) { // return next_char unread_chars(1); } } attr.major = state_vector.major; attr.minor = state_vector.minor; insert_string_space('\0'); // separates tokens if (token_start != st_used_strings) { // we didn't toss the whole thing insert_hash_table(token_start); attr.text_handle = token_start; } else { attr.text_handle = 0; } if (attr.major == st_id_major_num && attr.minor == st_id_minor_num) find_reserve_word(attr.text_handle, attr.major, attr.minor); } while (attr.major == MAJ_SPACE); if (error_chars > 0 && !scanner_peeking) { done_correcting(); if (error_chars == 1) { issue_warning(attr.location, "Invalid token. 1 non-white character ignored."); } else { ostrstream ost; ost << "Invalid token. " << error_chars << " non-white characters ignored."; issue_warning(attr.location, ost.str()); } } return attr.major;}////////// Pretend an inserted token was really scanned. Pick an appropriate minor// number for those tokens that have them.//token_attrib_t token_fake(major_token_t tok){ token_attrib_t rtn; rtn.major = tok; rtn.text_handle = 0; rtn.location = get_source_location(); // close enough switch (tok) { case MAJ_ADDOP: rtn.minor = MIN_PLUS; break; case MAJ_MULOP: rtn.minor = MIN_TIMES; break; case MAJ_RELOP: rtn.minor = MIN_NE; break; case MAJ_EQ: rtn.minor = MIN_EQ; break; default: rtn.minor = 0; break; } return rtn;}////////// Returns a pointer to the character string for a token attribute.//const char * token_attrib_t::get_text(void) const{ return token_chars(text_handle);}////////// Return (non-tossed) text of a token//string token_text(string_number_t where){ if (where) { // not a faked token return string(st_string_space + where); } return string("");}////////// Return a character pointer to the string's space.//const char *token_chars(string_number_t where){ if (where != 0) { // not a faked token return st_string_space + where; } return "";}// End of File
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -