fngramlm.cc
                        HEX << node << DEC << " context \""
                        << (vocab.use(), context)
                        << "\" -- incrementing denominator" << endl;
                }
                goto retry;
            }
            /*
             * Undo the reversal above so the iterator can continue correctly
             */
            Vocab::reverse(context);
        }

        if (debug(DEBUG_ESTIMATE_WARNINGS)) {
            // use stupid C++ I/O to print in hex
            if (noneventContexts > 0) {
                dout() << "discarded " << noneventContexts << " "
                       << HEX << node
                       << "-gram contexts containing pseudo-events\n" << DEC;
            }
            if (noneventFNgrams > 0) {
                dout() << "discarded " << noneventFNgrams << " "
                       << HEX << node
                       << "-gram probs predicting pseudo-events\n" << DEC;
            }
            if (discountedFNgrams > 0) {
                dout() << "discarded " << discountedFNgrams << " "
                       << HEX << node
                       << "-gram probs discounted to zero\n" << DEC;
            }
        }

        // not finished with this yet (this has to do with keeping the counts
        // in the LM file)
        // storeBOcounts(specNum,node);

        /*
         * With all the probs in place, BOWs are obtained simply by the usual
         * normalization.
         * We do this right away before computing probs of higher order since
         * the estimation of higher-order N-grams can refer to lower-order
         * ones (e.g., for interpolated estimates).
         */
        // fprintf(stderr,"size now = %d\n",numFNgrams(0,0));
        computeBOWs(specNum, node);
    }
}
// fprintf(stderr,"done computing LM\n");

/*
 * Compute the numerator and denominator of a backoff weight, checking
 * for sanity.  Returns true if values make sense, and prints a
 * warning if not.  This version assumes that there is only one
 * possible BG child in the node and can therefore compute the
 * denominator as 1 - \sum_{grams with >0 counts}, which will
 * significantly speed things up in this case.
 */
Boolean
FNgram::computeBOW1child(BOnode *bo_node,              // back-off node
                         const VocabIndex *context,
                         const unsigned int nWrtwCip,
                         unsigned int specNum,
                         unsigned int node,            // back-off *graph* node
                         Prob &numerator,
                         Prob &denominator)
{
    /*
     * The BOW(c) for a context c is computed to be
     *
     *     BOW(c) = (1 - Sum p(x | c)) / (1 - Sum p_BO(x | c))
     *
     * where Sum is a summation over all words x with explicit
     * probabilities in context c, p(x|c) is that probability, and
     * p_BO(x|c) is the probability for that word according to the
     * backoff graph algorithm.  In this case, it is assumed that
     * there is only one possible BG child, which is why
     * we can do the denominator in this way.
     */
    LogP *prob;

    numerator = 1.0;
    denominator = 1.0;

    ProbsIter piter(*bo_node);
    VocabIndex word;
    while ((prob = piter.next(word))) {
        numerator -= LogPtoProb(*prob);
        if (node != 0)
            denominator -= LogPtoProb(bgChildProbBO(word, context, nWrtwCip,
                                                    specNum, node));
    }

    /*
     * Avoid some predictable anomalies due to rounding errors
     */
    if (numerator < 0.0 && numerator > -Prob_Epsilon) {
        numerator = 0.0;
    }
    if (denominator < 0.0 && denominator > -Prob_Epsilon) {
        denominator = 0.0;
    }

    if (numerator < 0.0) {
        cerr << "BOW numerator for context \"" << (vocab.use(), context)
             << "\" is " << numerator << " < 0\n";
        return false;
    } else if (denominator <= 0.0) {
        if (numerator > Prob_Epsilon) {
            cerr << "BOW denominator for context \"" << (vocab.use(), context)
                 << "\" is " << denominator << " <= 0, "
                 << "numerator is " << numerator << endl;
            return false;
        } else {
            numerator = 0.0;
            denominator = 0.0;      /* will give bow = 1 */
            return true;
        }
    } else {
        return true;
    }
}
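/*
 * Worked example of the single-child case above (illustrative numbers only,
 * not taken from any actual model).  Suppose context c has two explicitly
 * stored words, p(a|c) = 0.6 and p(b|c) = 0.2, and the single backoff-graph
 * child assigns p_BO(a|c) = 0.3 and p_BO(b|c) = 0.2.  Then
 *
 *     numerator   = 1 - (0.6 + 0.2) = 0.2
 *     denominator = 1 - (0.3 + 0.2) = 0.5
 *     BOW(c)      = 0.2 / 0.5       = 0.4
 *
 * so the child's probabilities for the *unseen* words, which sum to 0.5, are
 * scaled by 0.4 and together account for exactly the 0.2 of left-over mass.
 */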
/*
 * Compute the numerator and denominator of a backoff weight,
 * checking for sanity.  Returns true if values make sense,
 * and prints a warning if not.
 *
 * Note: This version allows for an arbitrary backoff strategy, so it
 * computes the denominator as \sum_{ngrams where counts are zero}.
 * This means the training algorithm runs much slower, but that's
 * what you pay for doing general backoff.  Note that there is
 * no penalty once the language model has been trained.
 */
Boolean
FNgram::computeBOW(BOnode *bo_node,                    // back-off node
                   const VocabIndex *context,
                   const unsigned int nWrtwCip,
                   unsigned int specNum,
                   unsigned int node,                  // back-off *graph* node
                   Prob &numerator,
                   Prob &denominator)
{
    /*
     * The BOW(c) for a context c is computed to be
     *
     *     BOW(c) = (1 - Sum p(x | c)) / Sum' p_BO(x | c)
     *
     * where Sum is a summation over all words x with explicit probabilities
     * in context c, Sum' is over all words without explicit probs in
     * context c, p(x|c) is that probability, and p_BO(x|c) is the
     * probability for that word according to the backoff algorithm.
     */
    LogP *prob;

    numerator = 1.0;
    denominator = 0.0;
    // double tmp_denominator = 1.0;

    ProbsIter piter(*bo_node);
    VocabIndex word;
    while ((prob = piter.next(word))) {
        numerator -= LogPtoProb(*prob);
        // if (node != 0)
        //     tmp_denominator -= LogPtoProb(bgChildProbBO(word,context,nWrtwCip,specNum,node));
    }

    if (node == 0) {
        // no reason to back off to nothing.
        denominator = 0;
    } else {
        denominator = 0;
        /***********************************************************************
         *
         * NOTE: This next loop is one of the reasons BG LM estimation runs
         * so slowly.
         *
         ***********************************************************************/
        FactoredVocab::TagIter titer(*((FactoredVocab*)&vocab));
        while (titer.next(word)) {
            // accumulate only when count(word,context) == 0
            // Count *cnt = fngs.fnSpecArray[specNum].parentSubsets[node].findCountR(context,word);
            // if (!cnt || *cnt == 0) {
            Boolean found;
            ProbEntry *pe = bo_node->probs.find(word, found);
            if (!found) {
                const LogP lp = bgChildProbBO(word, context, nWrtwCip, specNum, node);
                const Prob p = LogPtoProb(lp);
                denominator += p;
            }
        }
    }
    // fprintf(stderr,"tmp_denominator = %f\n",tmp_denominator);
    // denominator = tmp_denominator;

    /*
     * Avoid some predictable anomalies due to rounding errors
     */
    if (numerator < 0.0 && numerator > -Prob_Epsilon) {
        numerator = 0.0;
    }

    if (numerator < 0.0) {
        cerr << "BOW numerator for context \"" << (vocab.use(), context)
             << "\" is " << numerator << " < 0\n";
        return false;
    } else {
        return true;
    }
}
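/*
 * Minimal, self-contained sketch of the same normalization (NOT part of the
 * original file; the function and parameter names below are hypothetical).
 * It assumes the explicit probabilities and the backoff-graph child's
 * probabilities have already been gathered into parallel arrays indexed by
 * vocabulary position, with explicitProb[i] < 0.0 standing for "word i has
 * no explicit probability in this context".  The real code instead iterates
 * a hash of explicit entries plus FactoredVocab::TagIter, and additionally
 * guards against rounding anomalies and a non-positive denominator.
 */
static double
generalBOWSketch(const double *explicitProb,   // p(x | c), or < 0.0 if absent
                 const double *childProbBO,    // p_BO(x | c) from the BG child
                 unsigned vocabSize)
{
    double numerator = 1.0;     // 1 - Sum p(x | c) over explicit entries
    double denominator = 0.0;   // Sum' p_BO(x | c) over words lacking explicit entries

    for (unsigned i = 0; i < vocabSize; i++) {
        if (explicitProb[i] >= 0.0) {
            numerator -= explicitProb[i];      // mass already assigned explicitly
        } else {
            denominator += childProbBO[i];     // mass the child gives to unseen words
        }
    }
    return numerator / denominator;            // BOW(c), in the probability domain
}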
/*
 * Recompute backoff weight for all contexts of a given order
 */
Boolean
FNgram::computeBOWs(unsigned int specNum, unsigned int node)
{
    Boolean result = true;

    /*
     * Note that this will only generate backoff nodes for those
     * contexts that have words with explicit probabilities.  But
     * this is precisely as it should be.
     */
    BOnode *bo_node;
    VocabIndex context[maxNumParentsPerChild + 2];

    // fprintf(stderr,"computebow size now = %d\n",numFNgrams(0,0));
    BOsIter iter1(*this, specNum, node, context);
    unsigned totalNumContexts = 0;

    if (fngs.fnSpecArray[specNum].parentSubsets[node].requiresGenBackoff()) {
        // print out a message since this will take a while.
        fprintf(stderr,
                "Starting estimation of general graph-backoff node: LM %d Node 0x%X, children:",
                specNum, node);
        FNgramSpecsType::FNgramSpec::BGChildIterCnstr
            citer(fngs.fnSpecArray[specNum].numParents, node,
                  fngs.fnSpecArray[specNum].parentSubsets[node].backoffConstraint);
        unsigned child;
        while (citer.next(child)) {
            if (fngs.fnSpecArray[specNum].parentSubsets[child].counts != NULL)
                fprintf(stderr, " 0x%X", child);
        }
        fprintf(stderr, "\n");

        if (totalNumContexts == 0) {
            // it is actually worth it to count the number of contexts here
            // to report an informative status message below.
            while (iter1.next()) {
                totalNumContexts++;
            }
            iter1.init();
        }
    }

    unsigned iter = 0;
    while ((bo_node = iter1.next())) {
        if (debug(DEBUG_BOWS) &&
            fngs.fnSpecArray[specNum].parentSubsets[node].numBGChildren > 1)
            fprintf(stderr,
                    "in computeBOWs, bo_node = 0x%X, specNum=%d, node = 0x%X, "
                    "context = 0x%X, *context = %d, iter = %d, cword = %s, nc = %d\n",
                    bo_node, specNum, node, context, *context, iter,
                    vocab.getWord(*context),
                    fngs.fnSpecArray[specNum].parentSubsets[node].numBGChildren);

        double numerator, denominator;

        if (fngs.fnSpecArray[specNum].parentSubsets[node].requiresGenBackoff() &&
            computeBOW(bo_node, context, node, specNum, node, numerator, denominator)) {
            /*
             * If unigram probs leave a non-zero probability mass
             * then we should give that mass to the zero-order (uniform)
             * distribution for zeroton words.  However, the ARPA
             * BO format doesn't support a "null context" BOW.
             * We simulate the intended distribution by spreading the
             * left-over mass uniformly over all vocabulary items that
             * have a zero probability.
             *
             * NOTE: We used to do this only if there was prob mass left,
             * but some ngram software requires all words to appear as
             * unigrams, which we achieve by giving them zero probability.
             *
             * NOTE2: since FNgramLM.cc doesn't use ARPA format files (it
             * uses ARPA format "inspired" files), this step really isn't
             * required any longer, as we could easily define a special
             * null context.  We nevertheless keep the same convention here
             * for the unigram.
             */
            if (node == 0 /*&& numerator > 0.0*/) {
                distributeProb(specNum, node, numerator, context, node);
            } else if (numerator == 0.0) {
                bo_node->bow = LogP_One;
            } else {
                bo_node->bow = ProbToLogP(numerator) - ProbToLogP(denominator);
                // fprintf(stderr,"got bow, num = %f, den = %f, bow = %f\n",
                //         numerator,denominator,bo_node->bow);
            }

            // print out a progress message since this will be taking a long time.
            if (iter % 1000 == 0)
                fprintf(stderr,
                        "Computing BOWs for LM %d, BG node 0x%X, Context num %d/%d [%.2f%%]\n",
                        specNum, node, iter, totalNumContexts,
                        100 * (double)iter / totalNumContexts);
        } else if (!fngs.fnSpecArray[specNum].parentSubsets[node].requiresGenBackoff() &&
                   computeBOW1child(bo_node, context, node, specNum, node,
                                    numerator, denominator)) {
            if (node == 0 /*&& numerator > 0.0*/) {
                distributeProb(specNum, node, numerator, context, node);
            } else if (numerator == 0.0 && denominator == 0) {
                bo_node->bow = LogP_One;
            } else {
                bo_node->bow = ProbToLogP(numerator) - ProbToLogP(denominator);
            }
        } else {
            /*
             * Dummy value for improper models
             */
            bo_node->bow = LogP_Zero;
            result = false;
        }
        iter++;
    }

    if (fngs.fnSpecArray[specNum].parentSubsets[node].numBGChildren > 1) {
        fprintf(stderr,
                "Finished estimation of multi-child graph-backoff node: LM %d Node 0x%X\n",
                specNum, node);
    }

    return result;
}
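/*
 * Worked example of the unigram (node == 0) convention above (illustrative
 * numbers only).  If the explicit unigram probabilities sum to 0.98, the
 * left-over mass handed to distributeProb() below is 0.02; with, say, four
 * vocabulary items of zero probability, each zeroton receives
 * 0.02 / 4 = 0.005 of probability.  If no word has zero probability, the
 * mass is instead added to all unigrams, as described in distributeProb()
 * itself.
 */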
void
FNgram::storeBOcounts(unsigned int specNum, unsigned int node)
{
    if (node == 0 || FNgramSpecsType::numBitsSet(node) == 1)
        return;

    VocabIndex context[maxNumParentsPerChild + 2];
    BOsIter iter1(*this, specNum, node, context);
    BOnode *bo_node;
    while ((bo_node = iter1.next())) {
        ProbsIter piter(*bo_node);
        VocabIndex word;
        unsigned int *cnt;
        LogP *prob;
        while ((prob = piter.next(word, cnt))) {
            // get count and store it
            // *cnt = count for this node.
        }
    }
}

/*
 * Renormalize language model by recomputing backoff weights.
 */
void
FNgram::recomputeBOWs()
{
    /*
     * Here it is important that we compute the backoff weights in
     * increasing BG level order, since the higher-order ones refer to the
     * lower-order ones in the backoff algorithm.
     * Note that this will only generate backoff nodes for those
     * contexts that have words with explicit probabilities.  But
     * this is precisely as it should be.
     */
    for (unsigned specNum = 0; specNum < fNgramsSize; specNum++) {
        const unsigned numParents = fngs.fnSpecArray[specNum].numParents;
        for (int level = 0; level <= (int)numParents; level++) {
            FNgramSpecsType::FNgramSpec::LevelIter iter(numParents, level);
            unsigned int node;
            while (iter.next(node)) {
                computeBOWs(specNum, node);
            }
        }
    }
}

/*
 * Redistribute probability mass over ngrams of given context
 */
void
FNgram::distributeProb(unsigned int specNum, unsigned int node, Prob mass,
                       VocabIndex *context, const unsigned nWrtwCip)
{
    /*
     * First enumerate the vocabulary to count the number of
     * items affected
     */
    unsigned numWords = 0;
    unsigned numZeroProbs = 0;

    const unsigned packedBits = bitGather(nWrtwCip, node);

    FactoredVocab::TagIter viter(*((FactoredVocab*)&vocab));
    VocabIndex word;
    while (viter.next(word)) {
        if (!vocab.isNonEvent(word) && !vocab.isMetaTag(word)) {
            numWords++;

            LogP *prob =
                fNgrams[specNum].parentSubsets[node].findProbSubCtx(word, context,
                                                                    packedBits);
            if (!prob || *prob == LogP_Zero) {
                numZeroProbs++;
            }
            /*
             * create zero probs so we can update them below
             */
            if (!prob) {
                *(fNgrams[specNum].parentSubsets[node].insertProbSubCtx(word, context,
                                                                        packedBits))
                    = LogP_Zero;
            }
        }
    }

    /*
     * If there are no zero-probability words
     * then we add the left-over prob mass to all unigrams.
     * Otherwise do as described above.
     */
    viter.init();
    if (numZeroProbs > 0) {
        if (debug(DEBUG_ESTIMATE_WARNINGS)) {
            cerr << "warning: distributing " << mass
                 << " left-over probability mass over " << numZeroProbs
                 << " zeroton words" << endl;
        }
        Prob add = mass / numZeroProbs;

        while (viter.ne