📄 ngrm_read_0.cc

📁 这是处理语音信号的程序
💻 CC
字号:
// file: ngrm_read_0.cc//// system include files//#include <string.h>#include <ctype.h>// isip include files//#include "ngram.h"#include "ngram_constants.h"#include <link_list.h>#include <link_node.h>static int compare_nodes_cc(const void* first, const void* second) {  return Ngram::ngnode_compare_cc((Ngram_node**)first, (Ngram_node**)second);}// method: read_ngram_cc//// arguments://  FILE* fp_a: (input) input ngram file//  Hash_table* lexicon: (input) the lexicon hash table//// return: a logical_1 indicating status//// this method reads in the ngram from a file//logical_1 Ngram::read_ngram_cc(FILE* fp_a, Hash_table* lexicon_a) {  // define some local variables  //  Memory_manager* mmgr = Link_list::get_manager_cc();  Link_list tmp_symbol_list;  char_1* buffer = new char_1[ISIP_MAX_STRING_LENGTH];  char_1* bufpos = (char_1*)NULL;  char_1* delimiter = NGRAM_DELIMITER;    logical_1 flag = ISIP_TRUE;  int_4 index = (int_4)0;  int_4 num_grams = (int_4)0;  Word* word = (Word*)NULL;  Ngram_node** ngnode = (Ngram_node**)NULL;  Hash_cell* hcell = (Hash_cell*)NULL;  float_4 backoff = (float_4)0;  float_4 lmscore = NGRAM_NODE_DEFAULT_SCORE;  // define the ngram lists  //  if (ngram_lists_d == (Ngram_list**)NULL) {    ngram_lists_d = new Ngram_list*[ngram_order_d];  }  for (int_4 i = 0; i < ngram_order_d; i++) {    ngram_lists_d[i] = (Ngram_list*)NULL;  }    // initialize numbers  //  int_4* num_read = new int_4[ngram_order_d];  for (int_4 i = 0; i < ngram_order_d; i++) {    num_read[i] = (int_4)0;  }  // we keep track of the current ngram prefix. since the input is sorted,  // this makes it easier to know when to trigger events in the read  // process  //  Word** curr_prefix = new Word*[ngram_order_d];  memset(curr_prefix, 0, ngram_order_d * sizeof(Word*));  logical_1 prefix_changed = ISIP_FALSE;  int_4 prefix_items = 0;  Ngram_node** prefix_node = (Ngram_node**)NULL;    // read the ngram information such as the number of unigrams,  // bigrams etc  //  while (fscanf(fp_a, "%s", buffer) != EOF) {    // ignore comment lines    //    if (buffer[0] == (char_1)'#') {            // do nothing      //      fscanf(fp_a, "%[^\n]", buffer);      fscanf(fp_a, "%[\n]", buffer);    }    // read data    //    else if (buffer[0] == (char_1)'\\') {      // advance pointer      //      bufpos = buffer;      bufpos++;            // read the ngram counts      //      if (strcmp((char*)bufpos, (char*)NGRAM_DATA_TAG) == 0) {	// prepare to read the next line	//	fscanf(fp_a, "%[^\n]", buffer);	fscanf(fp_a, "%[\n]", buffer);	// read the ngram counts	//	for (int_4 i = 0; i < ngram_order_d; i++) {	  fscanf(fp_a, "%[^\n]", buffer);	  bufpos = (char_1*)strtok((char*)buffer, ISIP_STRING_SPACE);	  while (isspace((char)(*bufpos))) {	    bufpos++;	  }	  bufpos = (char_1*)strtok((char*)NULL, (char*)delimiter);	  index = (int_4)atoi((char*)bufpos) - (int_4)1;	  bufpos = (char_1*)strtok((char*)NULL, (char*)delimiter);	  num_grams = (int_4)atoi((char*)bufpos);	  fscanf(fp_a, "%[\n]", buffer);	  // set up the corresponding ngram list	  //	  ngram_lists_d[index] = new Ngram_list(num_grams);	}      } // end if data      // read lm parameters if specified      //      else if (strcmp((char*)bufpos, (char*)NGRAM_LM_TAG) == 0) {	// prepare to read the next line	//	fscanf(fp_a, "%[^\n]", buffer);	fscanf(fp_a, "%[\n]", buffer);		// read the lm scale	//	fscanf(fp_a, "%[^\n]", buffer);	bufpos = (char_1*)strtok((char*)buffer, (char*)delimiter);	bufpos = (char_1*)strtok((char*)NULL, (char*)delimiter);	lmscale_d = (float_4)atof((char*)bufpos);	fscanf(fp_a, "%[\n]", buffer);		// read the word penalty	//	fscanf(fp_a, "%[^\n]", buffer);	bufpos = (char_1*)strtok((char*)buffer, (char*)delimiter);	bufpos = (char_1*)strtok((char*)NULL, (char*)delimiter);	wdpenalty_d = (float_4)atof((char*)bufpos);	fscanf(fp_a, "%[\n]", buffer);      }            // otherwise this is an ngram list      //      else {	// we may need to clean up after an ngram read	//	// see if the prefix has changed. if so, then dump the current ngram	// data	//	// if there are items for this prefix then add them	//	if (prefix_items > 0) {	  	  // sort the items in the list	  //	  rearrange_nodes_cc(index, prefix_items, tmp_symbol_list,			     prefix_node, num_read);	}	// reset variables	//	prefix_items = 0;		// reset the prefix node	//	prefix_node = (Ngram_node**)NULL;		// clear the prefix	//	memset(curr_prefix, 0, ngram_order_d * sizeof(Ngram_node*));	prefix_changed = ISIP_TRUE;		// get the order of the current ngram list	//	bufpos = (char_1*)strtok((char*)buffer, "-");	bufpos = buffer;	bufpos++;	index = (int_4)atoi((char*)bufpos) - 1;		// prepare to read the next line	//	fscanf(fp_a, "%[^\n]", buffer);	fscanf(fp_a, "%[\n]", buffer);      }    } // end else if char is "\"        // otherwise this is a data line    //    else {      // break flag - if the particular ngram is not valid then we just skip      // that line      //      logical_1 break_flag = ISIP_FALSE;            // reset flag      //      flag = ISIP_TRUE;            // set the lm score      //      lmscore = (float_4)atof((char*)buffer);      // as the score is in log10 form, convert it to log_e      //      lmscore *= NGRAM_LOGTEN;      // reset the ngram node      //      ngnode = (Ngram_node**)NULL;            // check if there are any history words and read them      //      for (int_4 i = 0; i < index; i++) {		// read a word	//	fscanf(fp_a, "%s", buffer);		// get the corresponding word from the lexicon	//	hcell = lexicon_a->hash_lookup_cc(buffer);	if (hcell == (Hash_cell*)NULL) {	  flag = ISIP_FALSE;	  fprintf(stdout, "%s::read_ngram_cc : word %s not in lexicon\n",		  NGRAM_CLASS_NAME, buffer);	}	else {	  word = (Word*)(hcell->get_item_cc());	}	// check the prefix	//	if (word != curr_prefix[i]) {	  curr_prefix[i] = word;	  prefix_changed = ISIP_TRUE;	}		// get the ngram node corresponding to this word at this	// level	//	ngram_lists_d[i]->get_node_cc(word, ngnode);	// if the ngram lookup failed then break out and do not process this	// ngram (it is invalid)	//	if (ngnode == (Ngram_node**)NULL) {	  break_flag = ISIP_TRUE;	  break;	}	      } // end for loop      // see if the prefix has changed. if so, then dump the current ngram      // data      //      if (prefix_changed == ISIP_TRUE) {	// if there are items for this prefix then add them	//	if (prefix_items > 0) {	  // sort the items in the list	  //	  rearrange_nodes_cc(index, prefix_items, tmp_symbol_list,			     prefix_node, num_read);	}		// reset variables	//	prefix_items = 0;	prefix_changed = ISIP_FALSE;		// reset the prefix node	//	prefix_node = ngnode;      }            if (break_flag != ISIP_TRUE) {	// read the current word	//	fscanf(fp_a, "%s", buffer);      	// get the corresponding word from the lexicon	//	hcell = lexicon_a->hash_lookup_cc(buffer);	if (hcell == (Hash_cell*)NULL) {	  flag = ISIP_FALSE;	  fprintf(stdout, "%s::read_ngram_cc : word %s not in lexicon\n",		  NGRAM_CLASS_NAME, buffer);	}	else {	  word = (Word*)(hcell->get_item_cc());	}		// read the backoff score if any	//	backoff = (float_4)0;	fscanf(fp_a, "%[^\n]", buffer);	bufpos = (char_1*)strtok((char*)buffer, ISIP_STRING_SPACE);	if (bufpos != (char_1*)NULL) {	  while (isspace((char)(*bufpos))) {	    bufpos++;	  }	  backoff = (float_4)atof((char*)bufpos);	  	  // as the score is in log10 form, convert it to log_e	  //	  backoff *= NGRAM_LOGTEN;		}		// if the current word exists in the lexicon	//	if (flag == ISIP_TRUE) {	  	  // create an n-gram node	  //	  Ngram_node* new_ngram = mmgr->new_ngram_cc();	  new_ngram->set_word_cc(word);	  new_ngram->set_lmscore_cc(lmscore);	  new_ngram->set_backoff_cc(backoff);	  // put the node in the list	  //	  Link_node* new_node = mmgr->new_node_cc();	  new_node->set_item_cc(new_ngram);	  tmp_symbol_list.push_cc(new_node);	  // increment the number of items for this prefix	  //	  prefix_items++;	}		// prepare to read next line	//	fscanf(fp_a, "%[\n]", buffer);      }      else {	// read to the end of line	//	fscanf(fp_a, "%[^\n]", buffer);	fscanf(fp_a, "%[\n]", buffer);      }    } // end else this is data line  } // end while  // we may need to clean up after an ngram read  //  // see if the prefix has changed. if so, then dump the current ngram  // data  //  // if there are items for this prefix then add them  //  if (prefix_items > 0) {        // sort the items in the list    //    rearrange_nodes_cc(index, prefix_items, tmp_symbol_list, prefix_node,		       num_read);  }  // reset variables  //  prefix_items = 0;	  // reset the prefix node  //  prefix_node = (Ngram_node**)NULL;	  // clear the prefix  //  memset(curr_prefix, 0, ngram_order_d * sizeof(Ngram_node*));  prefix_changed = ISIP_TRUE;	  // confirm counts and adjust memory in ngram list if needed  //  for (int_4 i = 0; i < ngram_order_d; i++) {    // if the read count is larger flag error    if (num_read[i] > ngram_lists_d[i]->get_num_cc()) {      error_handler_cc((char_1*)"read_ngram_cc",		       (char_1*)"Mismatch in number of ngrams read.");    }    // otherwise remove space out of vocabulary words    //    else if (num_read[i] < ngram_lists_d[i]->get_num_cc()) {      ngram_lists_d[i]->adjust_num_cc(num_read[i]);    }  }  // free memory  //  delete [] num_read;  delete [] buffer;  delete [] curr_prefix;  // exit gracefully  //  return ISIP_TRUE;}// method: rearrange_nodes_cc//// arguments://  int_4 index: (input) the list to add the item to//  int_4 prefix_items: (input) number of items in the list//  Link_list& tmp_symbol_list: (input/output) nodes to add//  Ngram_node** prefix_node: (input) node to attach the list of nodes to.//  int_4* num_read_a; (input) number of items read at each ngram level//// return: an int_4 indicating the number of items transferred//// this method sorts the items in the list by Word ID and then attaches them// to the prefix node in the ngram list//int_4 Ngram::rearrange_nodes_cc(int_4 index_a,				int_4 prefix_items_a,				Link_list& tmp_symbol_list_a,				Ngram_node** prefix_node_a,				int_4* num_read_a) {  // declare local variables  //  Memory_manager* mmgr = Link_list::get_manager_cc();  Ngram_node** ngnode_array = new Ngram_node*[prefix_items_a];  int_4 loc = 0;  Link_node* tmp_node = (Link_node*)NULL;  // remove the items from the list and put them in an array so we  // can sort them  //  while ((tmp_node = tmp_symbol_list_a.pop_cc()) != (Link_node*)NULL) {	        // insert the item in the array    //    ngnode_array[loc++] = (Ngram_node*)tmp_node->get_item_cc();        // delete the node    //    mmgr->delete_cc(tmp_node);  }	    // sort the array  //  qsort(ngnode_array, prefix_items_a, sizeof(Ngram_node*),	compare_nodes_cc);	    // add the nodes to the list one by one  //  for (int_4 jj = 0; jj < prefix_items_a; jj++) {	        Ngram_node* tmp_node = ngnode_array[jj];	        // add each node to the list    //    ngram_lists_d[index_a]->add_node_cc(num_read_a[index_a], 					tmp_node->get_word_cc(),					prefix_node_a,					tmp_node->get_lmscore_cc(),					tmp_node->get_backoff_cc());	        // delete the ngram node    //    mmgr->delete_cc(tmp_node);	        // increment count    //    num_read_a[index_a]++;  }  // clean up memory  //  delete [] ngnode_array;  // set the number of next grams for the prefix node  //  if (prefix_node_a != (Ngram_node**)NULL) {    (*prefix_node_a)->set_num_next_nodes_cc(prefix_items_a);  }  // exit gracefully  //  return ISIP_TRUE;}
⌨️ 快捷键说明

复制代码 Ctrl + C
搜索代码 Ctrl + F
全屏模式 F11
切换主题 Ctrl + Shift + D
显示快捷键 ?
增大字号 Ctrl + =
减小字号 Ctrl + -