📄 hmm.cc

📁 隐马尔科夫模型对文本信息进行抽取利用C＋＋实现
💻 CC
📖 第 1 页 / 共 2 页
字号:
上一页 12
}/************************* FILE OPERATIONS ****************************/// set model parameters from a file.//void HMM::load_model(char* filename){  //read in param file  FILE *param_file;  double prob;  param_file = fopen(filename,"r");  if (!param_file) {    cerr << "ERROR: can not open " << filename << ".\n";    exit(-1);  }  if (fscanf(param_file,"states: %d\n",&max_states)!=1) {    cerr << "ERROR: Problem reading states: field of " << filename << "\n";    exit(-1);  }      if (fscanf(param_file,"symbols: %d\n",&max_symbols)!=1 ) {    cerr << "ERROR: Problem reading symbols: field of " << filename << "\n";    exit(-1);  }  // allocate space for model parameters  alloc_model_matrices();  for (int i=0; i < max_states; i++) {    fscanf(param_file,"%lf ",&prob);    States[i]->set_recur_trans(prob);    for (int j=0; j < max_symbols; j++) {      fscanf(param_file,"%lf ",&prob);      States[i]->set_recur_out(j,prob);    }    fscanf(param_file,"\n");    fscanf(param_file,"%lf ",&prob);    States[i]->set_next_trans(prob);    for (j=0; j < max_symbols; j++) {      fscanf(param_file,"%lf ",&prob);      States[i]->set_next_out(j,prob);    }    fscanf(param_file,"\n");  }  fclose(param_file);}// figure out number of symbols and number of strings in file// and load strings matrix and string_len array.//int HMM::load_string_matrix(char* filename){  int str,sym,tmp,val;  int symbol_count;  // open file  FILE *from;  char line[MAX_LINE];  from = fopen(filename,"r");  symbol_count=0;  for (num_strings=0; fscanf(from,"%[^\n]\n",line)!=EOF; num_strings++) {    for (sym=0,tmp=99; tmp>1; sym++) {      tmp=sscanf(line,"%d %[0-9 ]",&val,line);    }    if (symbol_count < sym)      symbol_count=sym;  }  cout << "\nmax sequence length = " << symbol_count << "\n";  cout << "number of strings = " << num_strings << "\n";  rewind(from);    // allocate matrix  strings = new int* [num_strings];  string_len = new int [num_strings];  for (int i=0; i<num_strings; i++) {    strings[i]=new int [symbol_count];    for (sym=0; sym<symbol_count; sym++) // initialize array to -1      strings[i][sym]=-1;  }  // load values into matrix  diff_len=FALSE;  int tmp_max_symbols=0;  for (str=0; fscanf(from,"%[^\n]\n",line)!=EOF; str++) {    for (sym=0,tmp=99; tmp>1; sym++) {      tmp=sscanf(line,"%d %[0-9 ]",&val,line);      strings[str][sym]=val;      if (tmp_max_symbols<val+1)	tmp_max_symbols=val+1;    }    string_len[str]=sym;    if (sym!=symbol_count)      diff_len=TRUE;  }  if (diff_len == TRUE)    cout << "\nStrings of different lengths\n";  // change max_symbols based on length of file  if (tmp_max_symbols > max_symbols) {    cout << "\nsetting max_symbols from file to "<<tmp_max_symbols<<"\n";    max_symbols=tmp_max_symbols;  }  fclose(from);  return symbol_count;}/************************* CALCULATIONS ****************************/// rescale alpha values (from Rabiner).// void HMM::rescale_alphas(int col){  scaling_factors[col] = 0.0;  for (int i=0; i < max_states; i++) {    scaling_factors[col] += alpha[col][i];  }  // rescale all the alpha's and smooth them  for (i=0; i <  max_states; i++) {    alpha[col][i] /= scaling_factors[col];  }}// rescale beta values after rescaling alphas.//void HMM::rescale_betas(int col){  // rescale all the beta's w alpha's factors  for (int i=0; i < max_states; i++) {    beta[col][i] /= scaling_factors[col];  }}// calculate alpha trellis and return final alpha value (recognition prob).// double HMM::alpha_F(int* symbol_array, int symbol_count){  double accum;  int i,j;  symbol_count=(symbol_count<0?trellis_width-1:symbol_count);  // clear the trellis and set the first column  for (i=0; i < trellis_width; i++)    for (j=0; j < max_states; j++) {      alpha[i][j] = 0.0;    }  alpha[0][0] = 1.0;  // first state starts with alpha = 1.0  rescale_alphas(0);  // first col rescale is div by one  // calculate the alphas for rest of cols  for (i = 0; i < symbol_count; i++) { 		    for (j=max_states-1; j > 0; j--) {      accum = alpha[i][j] * 	States[j]->set_recur_trans() *	  States[j]->set_recur_out(symbol_array[i]);      alpha[i+1][j] = alpha[i][j-1] * 	States[j-1]->set_next_trans() * 	  States[j-1]->set_next_out(symbol_array[i]) + 	    accum;    }    alpha[i+1][0] = alpha[i][0] * States[0]->set_recur_trans() *      States[0]->set_recur_out(symbol_array[i]);    rescale_alphas(i+1);  }  return (alpha[symbol_count][max_states-1]);}// calculate beta trellis and return initial beta value,// instead of going bottom to top, go top to bottom; the bottom// is the exception this time.  we look at transistions from current// state (single) to the next states (plural).//double HMM::beta_I(int* symbol_array, int symbol_count)   {     double accum;     symbol_count=(symbol_count<0?trellis_width-1:symbol_count);          // clear the trellis and set the last column     for (int i=0; i < trellis_width; i++)       for (int j=0; j < max_states; j++) {	 beta[i][j] = 0.0;       }     beta[symbol_count][max_states-1] = 1.0;  /* final beta is 1.0 */     rescale_betas(symbol_count); // this is really div by 1 so no effect     // begin setting all the cols except last one     for (int t = symbol_count-1; t >= 0; t--) {       for (int j=0; j < max_states-1; j++) {	 accum = beta[t+1][j] * 	   States[j]->set_recur_trans() *	     States[j]->set_recur_out(symbol_array[t]);	 beta[t][j] = beta[t+1][j+1] * 	   States[j]->set_next_trans() * 	     States[j]->set_next_out(symbol_array[t]) + 	       accum;       }       beta[t][max_states-1] = beta[t+1][max_states-1] *	 States[max_states-1]->set_recur_trans() *	   States[max_states-1]->set_recur_out(symbol_array[t]);       rescale_betas(t);     }     return (beta[0][0]);   }// compute gamma values and fill gamma tables for entire trellis.//void HMM::compute_gamma(int* symbol_array, int symbol_count){  symbol_count=(symbol_count<0?trellis_width-1:symbol_count);  // clear gammas  for (int i=0; i < symbol_count; i++)    for (int j=0; j < max_states; j++) {	gamma_recur[i][j] = 0.0;	gamma_next[i][j] = 0.0;      }  // calc the gamma table  for (int t = 0; t < symbol_count; t++) {    for (int i=0,j=1; i < max_states-1; i++,j++) {      gamma_next[t][i] = (alpha[t][i]*			  States[i]->set_next_trans() *			  States[i]->set_next_out(symbol_array[t]) *			  beta[t+1][j]) /			    alpha[symbol_count][max_states-1];      gamma_recur[t][i] = (alpha[t][i]* 			   States[i]->set_recur_trans() * 			   States[i]->set_recur_out(symbol_array[t]) *			   beta[t+1][i]) /			     alpha[symbol_count][max_states-1];      }    gamma_recur[t][max_states-1] = (alpha[t][max_states-1] * 				    States[max_states-1]->set_recur_trans() * 		       States[max_states-1]->set_recur_out(symbol_array[t]) *			 	    beta[t+1][max_states-1]) /				      alpha[symbol_count][max_states-1];    gamma_next[t][max_states-1] = 0.0;  }}// this is the numerator of the a_ij reestimate and denom of b_ij reestimate,// this func assumes gammas have been calc'd.//double HMM::a_numer(int i, int j, int symbol_count){  double sum = 0.0;  symbol_count=(symbol_count<0?trellis_width-1:symbol_count);  for(int t=0; t < symbol_count; t++)    {      if (i==j) {	sum += gamma_recur[t][i];      }      else if ( i < max_states-1 ) {	sum += gamma_next[t][i];      } else {	cerr << "WARNING: gamma_next[t]["<<i<<"] shouldn't be requested\n";	break;      }    }  return sum;}// this is the denominator of the a_ij reestimate.//double HMM::a_denom(int i, int j, int symbol_count){  symbol_count=(symbol_count<0?trellis_width-1:symbol_count);  // j not used in this func - pass for consistency  double sum = 0.0;  for(int t=0; t < symbol_count; t++)    if (i<max_states-1)      sum += gamma_recur[t][i] + gamma_next[t][i];    else      sum += gamma_recur[t][i];  return sum;}// this is the numerator of the b_ij reestimate.// double HMM::b_numer(int i, int j, int sym, int *symbol_array,		    int symbol_count){  symbol_count=(symbol_count<0?trellis_width-1:symbol_count);  double sum = 0.0;  for(int t=0; t < symbol_count; t++) {    if ( symbol_array[t] == sym )       {	if (i==j)	  sum += gamma_recur[t][i];	else  {	  if (i < max_states-1)	    sum += gamma_next[t][i];	  else {	    cerr << "WARNING: gamma_next[t]["<<i<<"] shouldn't be requested\n";	    break;	  }	}      }  }  return sum;}// set cumulative ab counts from an array of equal length strings.//double HMM::set_cumulative_ab_counts(){  int *symbol_array;  double alpha_tot;  // dynamic allocation of arrays (ANSI)  symbol_array = new int [trellis_width-1];  // clear cumulative sum arrays  for(int i=0; i < max_states; i++) {    a_numer_sum_recur[i] = 0.0;    a_denom_sum_recur[i] = 0.0;    a_numer_sum_next[i] = 0.0;    a_denom_sum_next[i] = 0.0;    for (int j=0; j < max_symbols; j++) {      b_numer_sum_recur[i][j] = 0.0;      b_numer_sum_next[i][j] = 0.0;    }  }  alpha_tot=0.0;  // loop all the strings calc a,b sums  for(int s=0; s < num_strings; s++)    {      // set the symbol array to string from matrix      for (int i=0; i < trellis_width-1; i++) {	symbol_array[i] = strings[s][i];      }      // fill the alpha matrix      alpha_tot += alpha_F(symbol_array);      // fill the beta matrix      beta_I(symbol_array);      // fill the gamma_next and gamma_recur matrices      compute_gamma(symbol_array);      int j;      for (i=0,j=1; j < max_states; i++,j++) {	  a_numer_sum_recur[i] += a_numer(i,i);	  a_numer_sum_next[i] += a_numer(i,j);	  a_denom_sum_recur[i] += a_denom(i,i);	  a_denom_sum_next[i] += a_denom(i,j);	  for (int k=0; k < max_symbols; k++) {	    b_numer_sum_recur[i][k] += b_numer(i,i,k,symbol_array);	    b_numer_sum_next[i][k] += b_numer(i,j,k,symbol_array);	  }	}      // compute recurrent probs for last state      i = max_states-1;      a_numer_sum_recur[i] += a_numer(i,i);      a_denom_sum_recur[i] += a_denom(i,i);      for (int k=0; k < max_symbols; k++)	b_numer_sum_recur[i][k] += b_numer(i,i,k,symbol_array);    }  delete [] symbol_array;  return(alpha_tot);}// set ab count from one string.//double HMM::set_ab_counts(int* symbol_array, int symbol_count){  double alpha_tot;  int i,j;  // fill the alpha matrix  alpha_tot = alpha_F(symbol_array,symbol_count);    // fill the beta matrix  beta_I(symbol_array,symbol_count);  // fill the gamma_next and gamma_recur matrices  compute_gamma(symbol_array,symbol_count);  for (i=0,j=1; j < max_states; i++,j++) {    a_numer_sum_recur[i] = a_numer(i,i,symbol_count);    a_numer_sum_next[i] = a_numer(i,j,symbol_count);    a_denom_sum_recur[i] = a_denom(i,i,symbol_count);    a_denom_sum_next[i] = a_denom(i,j,symbol_count);    for (int k=0; k < max_symbols; k++) {      b_numer_sum_recur[i][k] = b_numer(i,i,k,symbol_array,symbol_count);      b_numer_sum_next[i][k] = b_numer(i,j,k,symbol_array,symbol_count);    }  }  // compute recurrent probs for last state  i = max_states-1;  a_numer_sum_recur[i] = a_numer(i,i,symbol_count);  a_denom_sum_recur[i] = a_denom(i,i,symbol_count);  for (int k=0; k < max_symbols; k++)    b_numer_sum_recur[i][k] = b_numer(i,i,k,symbol_array,symbol_count);  // delete [] symbol_array;  return(alpha_tot);}// reestimate parameters after calculating ab_counts//double HMM::reestimate(){  double prob, diff_sum = 0.0;  // loop all the transition probs  for (int i=0, j=1; j < max_states; i++, j++)    {      // do all the prob transitions except last one      prob = a_numer_sum_recur[i] / a_denom_sum_recur[i];      diff_sum += fabs(prob - States[i]->set_recur_trans());      States[i]->set_recur_trans(prob);      for (int k=0; k < max_symbols; k++) {	prob = b_numer_sum_recur[i][k] / a_numer_sum_recur[i];	diff_sum += fabs(prob -	States[i]->set_recur_out(k));	States[i]->set_recur_out(k,prob);      }      // do all the next transitions      prob = a_numer_sum_next[i] / a_denom_sum_next[i];      diff_sum += fabs(prob - States[i]->set_next_trans());      States[i]->set_next_trans(prob);      for (k=0; k < max_symbols; k++) {	prob = b_numer_sum_next[i][k] / a_numer_sum_next[i];	diff_sum += fabs(prob - States[i]->set_next_out(k));	States[i]->set_next_out(k,prob);      }    }  // calc the recurrent prob for last state  i = max_states-1;  prob = a_numer_sum_recur[i] / a_denom_sum_recur[i];  diff_sum += fabs(prob - States[i]->set_recur_trans());  States[i]->set_recur_trans(prob);  for (int k=0; k < max_symbols; k++) {    prob = b_numer_sum_recur[i][k] / a_numer_sum_recur[i];    diff_sum += fabs(prob - States[i]->set_recur_out(k));    States[i]->set_recur_out(k,prob);  }  return diff_sum;}// test model for a single string.//double HMM::test(int* string, int symbol_count){  double log_alpha;  symbol_count=(symbol_count<0?trellis_width-1:symbol_count);  // fill alpha trellis and scaling coeffs  log_alpha=log(alpha_F(string,symbol_count));  // calc log probability from coeffs  if (scaling_factors[0]>0.0) { // if scaling_factors set    log_alpha=0.0;    for (int t=0; t < symbol_count+1; t++)      log_alpha += log(scaling_factors[t]);  }  return log_alpha;}/************************* STATE DECLARATIONS ****************************/// create state object.//STATE::STATE(int num_symbols){  recur_out = new double [num_symbols];  next_out = new double [num_symbols];}// set/return the probability of generating a particluar symbol // during a recurrent transition. (smooth with MIN_PROB)//double STATE::set_recur_out(int symbol, double prob){  if (prob != -1.0) {    recur_out[symbol] = prob;  }  if (recur_out[symbol] < MIN_PROB)    recur_out[symbol] = MIN_PROB;  return recur_out[symbol];}// set/return the probability of generating a particluar symbol // during a transition to the next state. (smooth with MIN_PROB)//double STATE::set_next_out(int symbol, double prob){  if (prob != -1.0) {    next_out[symbol] = prob;  }  if (next_out[symbol] < MIN_PROB)    next_out[symbol] = MIN_PROB;  return next_out[symbol];}// set/return the probability of making a recurrent transition.// (smooth with MIN_PROB)//double STATE::set_recur_trans(double prob){  if (prob != -1.0) {    recur_trans = prob;  }  if (recur_trans < MIN_PROB)    recur_trans = MIN_PROB;  return recur_trans;}// set/return the probability of making transitioning to the next state.// (smooth with MIN_PROB)//double STATE::set_next_trans(double prob){  if (prob != -1.0) {    next_trans = prob;  }  if (next_trans < MIN_PROB)    next_trans = MIN_PROB;  return next_trans;}
上一页 12
⌨️ 快捷键说明

复制代码 Ctrl + C
搜索代码 Ctrl + F
全屏模式 F11
切换主题 Ctrl + Shift + D
显示快捷键 ?
增大字号 Ctrl + =
减小字号 Ctrl + -