📄 ngmp_05.cc

📁 这是一个从音频信号里提取特征参量的程序
💻 CC
📖 第 1 页 / 共 2 页
字号:
12 下一页
// file: $isip/class/mmedia/NGramParser/ngmp_05.cc// version: $Id: ngmp_05.cc,v 1.6 2002/07/10 16:23:24 alphonso Exp $//// isip include files//#include "NGramParser.h"#include <Sof.h>// method: load//// arguments://  Sof& sof: (input) input ngram source file//  HashTable<Long, NGramNode>& gram_hash: (output) output gram hash table//  const Vector<String>& symbol_table: (input) symbol table mapping index//  const long& tag: (input) sof object instance name//  const String& name: (input) sof object instance name//// return: a boolean indicating status//// this method reads in the ngram from a file with various format//boolean NGramParser::load(Sof& sof_a, HashTable<Long, NGramNode>& gram_hash_a,			  const Vector<String>& symbol_table_a,			  const long& tag_a, const String& name_a) {  // define some local variables  //  String buffer, buf;  Char chr;  VectorLong gram_size(order_d);   // the size of each gram  VectorLong num_gram_read(order_d);   // the size of each gram  Vector<String> symbol(1);  // N-gram symbols  const Long *symbol_index;      // indices of N-gram symbols  SingleLinkedList<NGramNode> symbol_list;   // the list of ngram nodes  NGramNode ngrm_node;  NGramNode* prefix_node = NULL;  NGramNode* history_node = NULL;  SysString delimiter(L" \t=");  HashTable<Long, NGramNode>* curr_hash = &gram_hash_a;  HashTable<String, Long> symbol_hash;    long gram_num;  long pos, index = 0;  boolean flag = true;  float backoff = NGramNode::DEF_BACKOFF;  float lmscore = NGramNode::DEF_LM_SCORE;  long tmp_long;  long curr_order = -1;   // current order of grammar reading: 0, 1, 2, ..  // debug variables  //  String value, output, empty_str;  // initialize numbers  //  num_gram_read.assign((long)0);    // get hash table which maps String to Long  //  long len = symbol_table_a.length();  Long symbol_id;  symbol_hash.setCapacity(len);  for (long i = 0; i < len; i++) {    symbol_id = i;    symbol_hash.insert(symbol_table_a(i), &symbol_id);  }   // print debugging information  //  if (debug_level_d >= Integral::DETAILED) {    symbol_hash.debug(L"symbol_hash");  }    // we keep track of the current ngram prefix. since the input is sorted,  // this makes it easier to know when to trigger events in the read  // process  //  VectorLong curr_prefix(order_d);  boolean prefix_changed = false;  long prefix_items = 0;    // read the instance of the object from the Sof file  //  if (!sof_a.find(name_a, tag_a)) {    return false;  }  // read the first format name  //  sof_a.gets(buffer, Sof::BUFFER_SIZE);  // print debugging information  //  if (debug_level_d >= Integral::DETAILED) {    buffer.debug(L"buffer");  }    //  format: NGRAM_ARPA  //  if (buffer.eq(L"format = \"NGRAM_ARPA\";")) {        // read the ngram information such as the number of unigrams,    // bigrams etc    //    while (sof_a.gets(buffer, Sof::BUFFER_SIZE)) {      // print debugging information      //      if (debug_level_d >= Integral::DETAILED) {	buffer.debug(L"buffer");      }        // get the first char      //      chr = buffer(0);      // print debugging information      //      if (debug_level_d >= Integral::DETAILED) {	chr.debug(L"chr");      }        // ignore comment lines if the first char is '#'      //      if (chr.eq(L'#') || buffer.length() == 0) {		// do nothing	//	continue;      }            // read "\data\" token      //      else if (chr.eq(L'\\')) {		// read the ngram counts	//	if (buffer.compare(L"\\data\\") == Integral::EQUAL) {	  	  // read the ngram counts	  //	  for (long i = 0; i < order_d; i++) {	    sof_a.gets(buffer, Sof::BUFFER_SIZE);	    buffer.trimLeft();	    // print debugging information	    //	    if (debug_level_d >= Integral::DETAILED) {	      buffer.debug(L"buffer");	    }  	    // get "ngram" token	    //	    pos = 0;	    buffer.tokenize(buf, pos, delimiter);	    // print debugging information	    //	    if (debug_level_d >= Integral::DETAILED) {	      buf.debug(L"buf");	    }	    	    if (buf.ne(L"ngram")) {	      buf.debug(L"TAG(ngram)");	      return Error::handle(name(), L"load", ERR_TAG,				   __FILE__, __LINE__);	    }  	    // get the number of the gram	    //	    buffer.tokenize(buf, pos, delimiter);	    // print debugging information	    //	    if (debug_level_d >= Integral::DETAILED) {	      buf.debug(L"buf");	    }	    	    buf.get(gram_num);	    if (gram_num > order_d) {	      value.assign(gram_num);	      output.debugStr(name(), empty_str, L"gram_num", value);	      return Error::handle(name(), L"load", ERR_ORDER,				   __FILE__, __LINE__);	    }	    index = gram_num - 1;	    	    // get the size of the gram	    //	    pos++;	    buffer.deleteRange(0, pos);	    buffer.get(tmp_long);	    gram_size(index) = tmp_long;	  }	  	  // initilize the hash table for unigram	  //	  gram_hash_a.setCapacity((long)gram_size(0));	} // end if data	// read end information	//	else if (buffer.compare(L"\\end\\")) {	  break;	}		// otherwise this is an ngram list: "n-gram"	//	else {	  	  // we may need to clean up after an ngram read	  //	  // see if the prefix has changed. if so, then dump the current ngram	  // data	  //	  // if there are items for this prefix then add them	  //	  if (prefix_items > 0) {	    	    // store the items in the list to hash table	    //	    num_gram_read(curr_order) += symbol_list.length();	    storeToHash(gram_hash_a, symbol_list, prefix_node);	  }	  	  // reset variables	  //	  prefix_items = 0;	  	  // reset the prefix node	  //	  prefix_node = (NGramNode*)NULL;	  	  // clear the prefix	  //	  curr_prefix.setLength(gram_num);	  curr_prefix.clear(Integral::RETAIN);	  prefix_changed = true;	  curr_prefix.assign(-1);	  // print debugging information	  //	  if (debug_level_d >= Integral::DETAILED) {	    buffer.debug(L"buffer");	  }	  	  // get the order of the current ngram list	  //	  curr_order++;	  buffer.deleteRange(0, 1);	  buffer.get(gram_num);	  if ((gram_num > order_d) && (gram_num == curr_order - 1)) {	    value.assign(gram_num);	    output.debugStr(name(), empty_str, L"gram_num", value);	    return Error::handle(name(), L"load", ERR_ORDER,				 __FILE__, __LINE__);	  }	  index = gram_num - 1;	}      } // end else if char is "\"            // otherwise this is a data line      //      else {	// break flag - if the particular ngram is not valid then we just skip	// that line	//	boolean break_flag = false;      	// reset flag	//	flag = true;	// print debugging information	//	if (debug_level_d >= Integral::DETAILED) {	  buffer.debug(L"buffer");	}		// set the lm score	//	pos = 0;	buffer.tokenize(buf, pos, delimiter);	// print debugging information	//	if (debug_level_d >= Integral::DETAILED) {	  buf.debug(L"buf");	}	    	buf.get(lmscore);	buffer.deleteRange(0, pos);	buffer.trimLeft();		// as the score is in log10 form, convert it to log_e	//	lmscore *= Integral::LN10;		// reset the current hash table pointer	//	curr_hash = &gram_hash_a;      	// check if there are any history words and read them	//	for (long i = 0; i < index; i++) {	  // print debugging information	  //	  if (debug_level_d >= Integral::DETAILED) {	    buffer.debug(L"buffer");	  }	    	  // read a word	  //	  pos = 0;	  buffer.tokenize(buf, pos, delimiter);	  // print debugging information	  //	  if (debug_level_d >= Integral::DETAILED) {	    buf.debug(L"buf");	  }	    	  buffer.deleteRange(0, pos);	  buffer.trimLeft();	  	  // get the corresponding word from the lexicon	  //	  symbol_index = symbol_hash.get(buf);	  if (symbol_index == NULL) {	    flag = true;	    buf.debug(L"err_symbol");	    return Error::handle(name(), L"load", ERR_SYMBOL,				 __FILE__, __LINE__, Error::WARNING);	  }	  	  // check the prefix	  //	  if (*symbol_index != curr_prefix(i)) {	    curr_prefix(i) = *symbol_index;	    prefix_changed = true;	  }	  	  // get the ngram node corresponding to this word at this	  // level	  //	  if (curr_hash != NULL) {	    history_node = curr_hash->get(*symbol_index);	    curr_hash = history_node->getNextGram();	  }	  else {	    return Error::handle(name(), L"load", ERR_HASH,				 __FILE__, __LINE__);	  }	  	  // if the ngram lookup failed then break out and do not process this	  // ngram (it is invalid)	  //	  if (history_node == (NGramNode*)NULL) {	    break_flag = true;	    break;	  }	} // end for loop	// see if the prefix has changed. if so, then dump the current ngram	// data	//	if (prefix_changed) {	  	  // if there are items for this prefix then add them	  //	  if (prefix_items > 0) {	    // store the items in the list to hash table	    //	    num_gram_read(curr_order) += symbol_list.length();	    storeToHash(gram_hash_a, symbol_list, prefix_node);	  }	  	  // reset variables	  //	  prefix_items = 0;	  prefix_changed = false;		  // reset the prefix node	  //
12 下一页
⌨️ 快捷键说明

复制代码 Ctrl + C
搜索代码 Ctrl + F
全屏模式 F11
切换主题 Ctrl + Shift + D
显示快捷键 ?
增大字号 Ctrl + =
减小字号 Ctrl + -