📄 ngmp_05.cc
字号:
// file: $isip/class/mmedia/NGramParser/ngmp_05.cc// version: $Id: ngmp_05.cc,v 1.6 2002/07/10 16:23:24 alphonso Exp $//// isip include files//#include "NGramParser.h"#include <Sof.h>// method: load//// arguments:// Sof& sof: (input) input ngram source file// HashTable<Long, NGramNode>& gram_hash: (output) output gram hash table// const Vector<String>& symbol_table: (input) symbol table mapping index// const long& tag: (input) sof object instance name// const String& name: (input) sof object instance name//// return: a boolean indicating status//// this method reads in the ngram from a file with various format//boolean NGramParser::load(Sof& sof_a, HashTable<Long, NGramNode>& gram_hash_a, const Vector<String>& symbol_table_a, const long& tag_a, const String& name_a) { // define some local variables // String buffer, buf; Char chr; VectorLong gram_size(order_d); // the size of each gram VectorLong num_gram_read(order_d); // the size of each gram Vector<String> symbol(1); // N-gram symbols const Long *symbol_index; // indices of N-gram symbols SingleLinkedList<NGramNode> symbol_list; // the list of ngram nodes NGramNode ngrm_node; NGramNode* prefix_node = NULL; NGramNode* history_node = NULL; SysString delimiter(L" \t="); HashTable<Long, NGramNode>* curr_hash = &gram_hash_a; HashTable<String, Long> symbol_hash; long gram_num; long pos, index = 0; boolean flag = true; float backoff = NGramNode::DEF_BACKOFF; float lmscore = NGramNode::DEF_LM_SCORE; long tmp_long; long curr_order = -1; // current order of grammar reading: 0, 1, 2, .. // debug variables // String value, output, empty_str; // initialize numbers // num_gram_read.assign((long)0); // get hash table which maps String to Long // long len = symbol_table_a.length(); Long symbol_id; symbol_hash.setCapacity(len); for (long i = 0; i < len; i++) { symbol_id = i; symbol_hash.insert(symbol_table_a(i), &symbol_id); } // print debugging information // if (debug_level_d >= Integral::DETAILED) { symbol_hash.debug(L"symbol_hash"); } // we keep track of the current ngram prefix. since the input is sorted, // this makes it easier to know when to trigger events in the read // process // VectorLong curr_prefix(order_d); boolean prefix_changed = false; long prefix_items = 0; // read the instance of the object from the Sof file // if (!sof_a.find(name_a, tag_a)) { return false; } // read the first format name // sof_a.gets(buffer, Sof::BUFFER_SIZE); // print debugging information // if (debug_level_d >= Integral::DETAILED) { buffer.debug(L"buffer"); } // format: NGRAM_ARPA // if (buffer.eq(L"format = \"NGRAM_ARPA\";")) { // read the ngram information such as the number of unigrams, // bigrams etc // while (sof_a.gets(buffer, Sof::BUFFER_SIZE)) { // print debugging information // if (debug_level_d >= Integral::DETAILED) { buffer.debug(L"buffer"); } // get the first char // chr = buffer(0); // print debugging information // if (debug_level_d >= Integral::DETAILED) { chr.debug(L"chr"); } // ignore comment lines if the first char is '#' // if (chr.eq(L'#') || buffer.length() == 0) { // do nothing // continue; } // read "\data\" token // else if (chr.eq(L'\\')) { // read the ngram counts // if (buffer.compare(L"\\data\\") == Integral::EQUAL) { // read the ngram counts // for (long i = 0; i < order_d; i++) { sof_a.gets(buffer, Sof::BUFFER_SIZE); buffer.trimLeft(); // print debugging information // if (debug_level_d >= Integral::DETAILED) { buffer.debug(L"buffer"); } // get "ngram" token // pos = 0; buffer.tokenize(buf, pos, delimiter); // print debugging information // if (debug_level_d >= Integral::DETAILED) { buf.debug(L"buf"); } if (buf.ne(L"ngram")) { buf.debug(L"TAG(ngram)"); return Error::handle(name(), L"load", ERR_TAG, __FILE__, __LINE__); } // get the number of the gram // buffer.tokenize(buf, pos, delimiter); // print debugging information // if (debug_level_d >= Integral::DETAILED) { buf.debug(L"buf"); } buf.get(gram_num); if (gram_num > order_d) { value.assign(gram_num); output.debugStr(name(), empty_str, L"gram_num", value); return Error::handle(name(), L"load", ERR_ORDER, __FILE__, __LINE__); } index = gram_num - 1; // get the size of the gram // pos++; buffer.deleteRange(0, pos); buffer.get(tmp_long); gram_size(index) = tmp_long; } // initilize the hash table for unigram // gram_hash_a.setCapacity((long)gram_size(0)); } // end if data // read end information // else if (buffer.compare(L"\\end\\")) { break; } // otherwise this is an ngram list: "n-gram" // else { // we may need to clean up after an ngram read // // see if the prefix has changed. if so, then dump the current ngram // data // // if there are items for this prefix then add them // if (prefix_items > 0) { // store the items in the list to hash table // num_gram_read(curr_order) += symbol_list.length(); storeToHash(gram_hash_a, symbol_list, prefix_node); } // reset variables // prefix_items = 0; // reset the prefix node // prefix_node = (NGramNode*)NULL; // clear the prefix // curr_prefix.setLength(gram_num); curr_prefix.clear(Integral::RETAIN); prefix_changed = true; curr_prefix.assign(-1); // print debugging information // if (debug_level_d >= Integral::DETAILED) { buffer.debug(L"buffer"); } // get the order of the current ngram list // curr_order++; buffer.deleteRange(0, 1); buffer.get(gram_num); if ((gram_num > order_d) && (gram_num == curr_order - 1)) { value.assign(gram_num); output.debugStr(name(), empty_str, L"gram_num", value); return Error::handle(name(), L"load", ERR_ORDER, __FILE__, __LINE__); } index = gram_num - 1; } } // end else if char is "\" // otherwise this is a data line // else { // break flag - if the particular ngram is not valid then we just skip // that line // boolean break_flag = false; // reset flag // flag = true; // print debugging information // if (debug_level_d >= Integral::DETAILED) { buffer.debug(L"buffer"); } // set the lm score // pos = 0; buffer.tokenize(buf, pos, delimiter); // print debugging information // if (debug_level_d >= Integral::DETAILED) { buf.debug(L"buf"); } buf.get(lmscore); buffer.deleteRange(0, pos); buffer.trimLeft(); // as the score is in log10 form, convert it to log_e // lmscore *= Integral::LN10; // reset the current hash table pointer // curr_hash = &gram_hash_a; // check if there are any history words and read them // for (long i = 0; i < index; i++) { // print debugging information // if (debug_level_d >= Integral::DETAILED) { buffer.debug(L"buffer"); } // read a word // pos = 0; buffer.tokenize(buf, pos, delimiter); // print debugging information // if (debug_level_d >= Integral::DETAILED) { buf.debug(L"buf"); } buffer.deleteRange(0, pos); buffer.trimLeft(); // get the corresponding word from the lexicon // symbol_index = symbol_hash.get(buf); if (symbol_index == NULL) { flag = true; buf.debug(L"err_symbol"); return Error::handle(name(), L"load", ERR_SYMBOL, __FILE__, __LINE__, Error::WARNING); } // check the prefix // if (*symbol_index != curr_prefix(i)) { curr_prefix(i) = *symbol_index; prefix_changed = true; } // get the ngram node corresponding to this word at this // level // if (curr_hash != NULL) { history_node = curr_hash->get(*symbol_index); curr_hash = history_node->getNextGram(); } else { return Error::handle(name(), L"load", ERR_HASH, __FILE__, __LINE__); } // if the ngram lookup failed then break out and do not process this // ngram (it is invalid) // if (history_node == (NGramNode*)NULL) { break_flag = true; break; } } // end for loop // see if the prefix has changed. if so, then dump the current ngram // data // if (prefix_changed) { // if there are items for this prefix then add them // if (prefix_items > 0) { // store the items in the list to hash table // num_gram_read(curr_order) += symbol_list.length(); storeToHash(gram_hash_a, symbol_list, prefix_node); } // reset variables // prefix_items = 0; prefix_changed = false; // reset the prefix node //
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -