📄 ngrm_read_0.cc
字号:
// file: ngrm_read_0.cc//// system include files//#include <string.h>#include <ctype.h>// isip include files//#include "ngram.h"#include "ngram_constants.h"#include <link_list.h>#include <link_node.h>static int compare_nodes_cc(const void* first, const void* second) { return Ngram::ngnode_compare_cc((Ngram_node**)first, (Ngram_node**)second);}// method: read_ngram_cc//// arguments:// FILE* fp_a: (input) input ngram file// Hash_table* lexicon: (input) the lexicon hash table//// return: a logical_1 indicating status//// this method reads in the ngram from a file//logical_1 Ngram::read_ngram_cc(FILE* fp_a, Hash_table* lexicon_a) { // define some local variables // Memory_manager* mmgr = Link_list::get_manager_cc(); Link_list tmp_symbol_list; char_1* buffer = new char_1[ISIP_MAX_STRING_LENGTH]; char_1* bufpos = (char_1*)NULL; char_1* delimiter = NGRAM_DELIMITER; logical_1 flag = ISIP_TRUE; int_4 index = (int_4)0; int_4 num_grams = (int_4)0; Word* word = (Word*)NULL; Ngram_node** ngnode = (Ngram_node**)NULL; Hash_cell* hcell = (Hash_cell*)NULL; float_4 backoff = (float_4)0; float_4 lmscore = NGRAM_NODE_DEFAULT_SCORE; // define the ngram lists // if (ngram_lists_d == (Ngram_list**)NULL) { ngram_lists_d = new Ngram_list*[ngram_order_d]; } for (int_4 i = 0; i < ngram_order_d; i++) { ngram_lists_d[i] = (Ngram_list*)NULL; } // initialize numbers // int_4* num_read = new int_4[ngram_order_d]; for (int_4 i = 0; i < ngram_order_d; i++) { num_read[i] = (int_4)0; } // we keep track of the current ngram prefix. since the input is sorted, // this makes it easier to know when to trigger events in the read // process // Word** curr_prefix = new Word*[ngram_order_d]; memset(curr_prefix, 0, ngram_order_d * sizeof(Word*)); logical_1 prefix_changed = ISIP_FALSE; int_4 prefix_items = 0; Ngram_node** prefix_node = (Ngram_node**)NULL; // read the ngram information such as the number of unigrams, // bigrams etc // while (fscanf(fp_a, "%s", buffer) != EOF) { // ignore comment lines // if (buffer[0] == (char_1)'#') { // do nothing // fscanf(fp_a, "%[^\n]", buffer); fscanf(fp_a, "%[\n]", buffer); } // read data // else if (buffer[0] == (char_1)'\\') { // advance pointer // bufpos = buffer; bufpos++; // read the ngram counts // if (strcmp((char*)bufpos, (char*)NGRAM_DATA_TAG) == 0) { // prepare to read the next line // fscanf(fp_a, "%[^\n]", buffer); fscanf(fp_a, "%[\n]", buffer); // read the ngram counts // for (int_4 i = 0; i < ngram_order_d; i++) { fscanf(fp_a, "%[^\n]", buffer); bufpos = (char_1*)strtok((char*)buffer, ISIP_STRING_SPACE); while (isspace((char)(*bufpos))) { bufpos++; } bufpos = (char_1*)strtok((char*)NULL, (char*)delimiter); index = (int_4)atoi((char*)bufpos) - (int_4)1; bufpos = (char_1*)strtok((char*)NULL, (char*)delimiter); num_grams = (int_4)atoi((char*)bufpos); fscanf(fp_a, "%[\n]", buffer); // set up the corresponding ngram list // ngram_lists_d[index] = new Ngram_list(num_grams); } } // end if data // read lm parameters if specified // else if (strcmp((char*)bufpos, (char*)NGRAM_LM_TAG) == 0) { // prepare to read the next line // fscanf(fp_a, "%[^\n]", buffer); fscanf(fp_a, "%[\n]", buffer); // read the lm scale // fscanf(fp_a, "%[^\n]", buffer); bufpos = (char_1*)strtok((char*)buffer, (char*)delimiter); bufpos = (char_1*)strtok((char*)NULL, (char*)delimiter); lmscale_d = (float_4)atof((char*)bufpos); fscanf(fp_a, "%[\n]", buffer); // read the word penalty // fscanf(fp_a, "%[^\n]", buffer); bufpos = (char_1*)strtok((char*)buffer, (char*)delimiter); bufpos = (char_1*)strtok((char*)NULL, (char*)delimiter); wdpenalty_d = (float_4)atof((char*)bufpos); fscanf(fp_a, "%[\n]", buffer); } // otherwise this is an ngram list // else { // we may need to clean up after an ngram read // // see if the prefix has changed. if so, then dump the current ngram // data // // if there are items for this prefix then add them // if (prefix_items > 0) { // sort the items in the list // rearrange_nodes_cc(index, prefix_items, tmp_symbol_list, prefix_node, num_read); } // reset variables // prefix_items = 0; // reset the prefix node // prefix_node = (Ngram_node**)NULL; // clear the prefix // memset(curr_prefix, 0, ngram_order_d * sizeof(Ngram_node*)); prefix_changed = ISIP_TRUE; // get the order of the current ngram list // bufpos = (char_1*)strtok((char*)buffer, "-"); bufpos = buffer; bufpos++; index = (int_4)atoi((char*)bufpos) - 1; // prepare to read the next line // fscanf(fp_a, "%[^\n]", buffer); fscanf(fp_a, "%[\n]", buffer); } } // end else if char is "\" // otherwise this is a data line // else { // break flag - if the particular ngram is not valid then we just skip // that line // logical_1 break_flag = ISIP_FALSE; // reset flag // flag = ISIP_TRUE; // set the lm score // lmscore = (float_4)atof((char*)buffer); // as the score is in log10 form, convert it to log_e // lmscore *= NGRAM_LOGTEN; // reset the ngram node // ngnode = (Ngram_node**)NULL; // check if there are any history words and read them // for (int_4 i = 0; i < index; i++) { // read a word // fscanf(fp_a, "%s", buffer); // get the corresponding word from the lexicon // hcell = lexicon_a->hash_lookup_cc(buffer); if (hcell == (Hash_cell*)NULL) { flag = ISIP_FALSE; fprintf(stdout, "%s::read_ngram_cc : word %s not in lexicon\n", NGRAM_CLASS_NAME, buffer); } else { word = (Word*)(hcell->get_item_cc()); } // check the prefix // if (word != curr_prefix[i]) { curr_prefix[i] = word; prefix_changed = ISIP_TRUE; } // get the ngram node corresponding to this word at this // level // ngram_lists_d[i]->get_node_cc(word, ngnode); // if the ngram lookup failed then break out and do not process this // ngram (it is invalid) // if (ngnode == (Ngram_node**)NULL) { break_flag = ISIP_TRUE; break; } } // end for loop // see if the prefix has changed. if so, then dump the current ngram // data // if (prefix_changed == ISIP_TRUE) { // if there are items for this prefix then add them // if (prefix_items > 0) { // sort the items in the list // rearrange_nodes_cc(index, prefix_items, tmp_symbol_list, prefix_node, num_read); } // reset variables // prefix_items = 0; prefix_changed = ISIP_FALSE; // reset the prefix node // prefix_node = ngnode; } if (break_flag != ISIP_TRUE) { // read the current word // fscanf(fp_a, "%s", buffer); // get the corresponding word from the lexicon // hcell = lexicon_a->hash_lookup_cc(buffer); if (hcell == (Hash_cell*)NULL) { flag = ISIP_FALSE; fprintf(stdout, "%s::read_ngram_cc : word %s not in lexicon\n", NGRAM_CLASS_NAME, buffer); } else { word = (Word*)(hcell->get_item_cc()); } // read the backoff score if any // backoff = (float_4)0; fscanf(fp_a, "%[^\n]", buffer); bufpos = (char_1*)strtok((char*)buffer, ISIP_STRING_SPACE); if (bufpos != (char_1*)NULL) { while (isspace((char)(*bufpos))) { bufpos++; } backoff = (float_4)atof((char*)bufpos); // as the score is in log10 form, convert it to log_e // backoff *= NGRAM_LOGTEN; } // if the current word exists in the lexicon // if (flag == ISIP_TRUE) { // create an n-gram node // Ngram_node* new_ngram = mmgr->new_ngram_cc(); new_ngram->set_word_cc(word); new_ngram->set_lmscore_cc(lmscore); new_ngram->set_backoff_cc(backoff); // put the node in the list // Link_node* new_node = mmgr->new_node_cc(); new_node->set_item_cc(new_ngram); tmp_symbol_list.push_cc(new_node); // increment the number of items for this prefix // prefix_items++; } // prepare to read next line // fscanf(fp_a, "%[\n]", buffer); } else { // read to the end of line // fscanf(fp_a, "%[^\n]", buffer); fscanf(fp_a, "%[\n]", buffer); } } // end else this is data line } // end while // we may need to clean up after an ngram read // // see if the prefix has changed. if so, then dump the current ngram // data // // if there are items for this prefix then add them // if (prefix_items > 0) { // sort the items in the list // rearrange_nodes_cc(index, prefix_items, tmp_symbol_list, prefix_node, num_read); } // reset variables // prefix_items = 0; // reset the prefix node // prefix_node = (Ngram_node**)NULL; // clear the prefix // memset(curr_prefix, 0, ngram_order_d * sizeof(Ngram_node*)); prefix_changed = ISIP_TRUE; // confirm counts and adjust memory in ngram list if needed // for (int_4 i = 0; i < ngram_order_d; i++) { // if the read count is larger flag error if (num_read[i] > ngram_lists_d[i]->get_num_cc()) { error_handler_cc((char_1*)"read_ngram_cc", (char_1*)"Mismatch in number of ngrams read."); } // otherwise remove space out of vocabulary words // else if (num_read[i] < ngram_lists_d[i]->get_num_cc()) { ngram_lists_d[i]->adjust_num_cc(num_read[i]); } } // free memory // delete [] num_read; delete [] buffer; delete [] curr_prefix; // exit gracefully // return ISIP_TRUE;}// method: rearrange_nodes_cc//// arguments:// int_4 index: (input) the list to add the item to// int_4 prefix_items: (input) number of items in the list// Link_list& tmp_symbol_list: (input/output) nodes to add// Ngram_node** prefix_node: (input) node to attach the list of nodes to.// int_4* num_read_a; (input) number of items read at each ngram level//// return: an int_4 indicating the number of items transferred//// this method sorts the items in the list by Word ID and then attaches them// to the prefix node in the ngram list//int_4 Ngram::rearrange_nodes_cc(int_4 index_a, int_4 prefix_items_a, Link_list& tmp_symbol_list_a, Ngram_node** prefix_node_a, int_4* num_read_a) { // declare local variables // Memory_manager* mmgr = Link_list::get_manager_cc(); Ngram_node** ngnode_array = new Ngram_node*[prefix_items_a]; int_4 loc = 0; Link_node* tmp_node = (Link_node*)NULL; // remove the items from the list and put them in an array so we // can sort them // while ((tmp_node = tmp_symbol_list_a.pop_cc()) != (Link_node*)NULL) { // insert the item in the array // ngnode_array[loc++] = (Ngram_node*)tmp_node->get_item_cc(); // delete the node // mmgr->delete_cc(tmp_node); } // sort the array // qsort(ngnode_array, prefix_items_a, sizeof(Ngram_node*), compare_nodes_cc); // add the nodes to the list one by one // for (int_4 jj = 0; jj < prefix_items_a; jj++) { Ngram_node* tmp_node = ngnode_array[jj]; // add each node to the list // ngram_lists_d[index_a]->add_node_cc(num_read_a[index_a], tmp_node->get_word_cc(), prefix_node_a, tmp_node->get_lmscore_cc(), tmp_node->get_backoff_cc()); // delete the ngram node // mmgr->delete_cc(tmp_node); // increment count // num_read_a[index_a]++; } // clean up memory // delete [] ngnode_array; // set the number of next grams for the prefix node // if (prefix_node_a != (Ngram_node**)NULL) { (*prefix_node_a)->set_num_next_nodes_cc(prefix_items_a); } // exit gracefully // return ISIP_TRUE;}
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -