fngramlm.cc
                        HEX << node << DEC << " context \""
                        << (vocab.use(), context)
                        << "\" -- incrementing denominator" << endl;
                }
                goto retry;
            }
            /*
             * Undo the reversal above so the iterator can continue correctly
             */
            Vocab::reverse(context);
        }

        if (debug(DEBUG_ESTIMATE_WARNINGS)) {
            // use stupid C++ I/O to print in hex
            if (noneventContexts > 0) {
                dout() << "discarded " << noneventContexts << " "
                       << HEX << node
                       << "-gram contexts containing pseudo-events\n" << DEC;
            }
            if (noneventFNgrams > 0) {
                dout() << "discarded " << noneventFNgrams << " "
                       << HEX << node
                       << "-gram probs predicting pseudo-events\n" << DEC;
            }
            if (discountedFNgrams > 0) {
                dout() << "discarded " << discountedFNgrams << " "
                       << HEX << node
                       << "-gram probs discounted to zero\n" << DEC;
            }
        }

        // not finished with this yet (this has to do with keeping the counts
        // in the LM file)
        // storeBOcounts(specNum,node);

        /*
         * With all the probs in place, BOWs are obtained simply by the usual
         * normalization.
         * We do this right away before computing probs of higher order since
         * the estimation of higher-order N-grams can refer to lower-order
         * ones (e.g., for interpolated estimates).
         */
        // fprintf(stderr,"size now = %d\n",numFNgrams(0,0));
        computeBOWs(specNum, node);
    }
}
// fprintf(stderr,"done computing LM\n");

/*
 * Compute the numerator and denominator of a backoff weight, checking
 * for sanity.  Returns true if values make sense, and prints a
 * warning if not.  This version assumes that there is only one
 * possible BG child in the node and can therefore compute the
 * denominator as 1 - \sum_{grams with >0 counts}, which will
 * significantly speed things up in this case.
 */
Boolean
FNgram::computeBOW1child(BOnode *bo_node,              // back-off node
                         const VocabIndex *context,
                         const unsigned int nWrtwCip,
                         unsigned int specNum,
                         unsigned int node,            // back-off *graph* node
                         Prob &numerator,
                         Prob &denominator)
{
    /*
     * The BOW(c) for a context c is computed to be
     *
     *     BOW(c) = (1 - Sum p(x | c)) / (1 - Sum p_BO(x | c))
     *
     * where Sum is a summation over all words x with explicit
     * probabilities in context c, p(x|c) is that probability, and
     * p_BO(x|c) is the probability for that word according to the
     * backoff graph algorithm.  In this case, it is assumed that
     * there is only one possible BG child, which is why
     * we can do the denominator in this way.
     */
    LogP *prob;

    numerator = 1.0;
    denominator = 1.0;

    ProbsIter piter(*bo_node);
    VocabIndex word;
    while ((prob = piter.next(word))) {
        numerator -= LogPtoProb(*prob);
        if (node != 0)
            denominator -= LogPtoProb(bgChildProbBO(word, context, nWrtwCip,
                                                    specNum, node));
    }

    /*
     * Avoid some predictable anomalies due to rounding errors
     */
    if (numerator < 0.0 && numerator > -Prob_Epsilon) {
        numerator = 0.0;
    }
    if (denominator < 0.0 && denominator > -Prob_Epsilon) {
        denominator = 0.0;
    }

    if (numerator < 0.0) {
        cerr << "BOW numerator for context \"" << (vocab.use(), context)
             << "\" is " << numerator << " < 0\n";
        return false;
    } else if (denominator <= 0.0) {
        if (numerator > Prob_Epsilon) {
            cerr << "BOW denominator for context \"" << (vocab.use(), context)
                 << "\" is " << denominator << " <= 0, "
                 << "numerator is " << numerator << endl;
            return false;
        } else {
            numerator = 0.0;
            denominator = 0.0;      /* will give bow = 1 */
            return true;
        }
    } else {
        return true;
    }
}
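/*
 * Worked example of the single-child case above (illustrative numbers only,
 * not taken from any actual model).  Suppose context c has two explicitly
 * stored words, p(a|c) = 0.6 and p(b|c) = 0.2, and the single backoff-graph
 * child assigns p_BO(a|c) = 0.3 and p_BO(b|c) = 0.2.  Then
 *
 *     numerator   = 1 - (0.6 + 0.2) = 0.2
 *     denominator = 1 - (0.3 + 0.2) = 0.5
 *     BOW(c)      = 0.2 / 0.5       = 0.4
 *
 * so the child's probabilities for the *unseen* words, which sum to 0.5, are
 * scaled by 0.4 and together account for exactly the 0.2 of left-over mass.
 */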
/*
 * Compute the numerator and denominator of a backoff weight,
 * checking for sanity.  Returns true if values make sense,
 * and prints a warning if not.
 *
 * Note: This version allows for an arbitrary backoff strategy, so it
 * computes the denominator as \sum_{ngrams where counts are zero}.
 * This means the training algorithm runs much slower, but that's
 * what you pay for doing general backoff.  Note that there is
 * no penalty once the language model has been trained.
 */
Boolean
FNgram::computeBOW(BOnode *bo_node,                    // back-off node
                   const VocabIndex *context,
                   const unsigned int nWrtwCip,
                   unsigned int specNum,
                   unsigned int node,                  // back-off *graph* node
                   Prob &numerator,
                   Prob &denominator)
{
    /*
     * The BOW(c) for a context c is computed to be
     *
     *     BOW(c) = (1 - Sum p(x | c)) / Sum' p_BO(x | c)
     *
     * where Sum is a summation over all words x with explicit probabilities
     * in context c, Sum' is over all words without explicit probs in
     * context c, p(x|c) is that probability, and p_BO(x|c) is the
     * probability for that word according to the backoff algorithm.
     */
    LogP *prob;

    numerator = 1.0;
    denominator = 0.0;
    // double tmp_denominator = 1.0;

    ProbsIter piter(*bo_node);
    VocabIndex word;
    while ((prob = piter.next(word))) {
        numerator -= LogPtoProb(*prob);
        // if (node != 0)
        //     tmp_denominator -= LogPtoProb(bgChildProbBO(word,context,nWrtwCip,specNum,node));
    }

    if (node == 0) {
        // no reason to back off to nothing.
        denominator = 0;
    } else {
        denominator = 0;
        /***********************************************************************
         *
         * NOTE: This next loop is one of the reasons BG LM estimation runs
         * so slowly.
         *
         ***********************************************************************/
        FactoredVocab::TagIter titer(*((FactoredVocab*)&vocab));
        while (titer.next(word)) {
            // accumulate only when count(word,context) == 0
            // Count *cnt = fngs.fnSpecArray[specNum].parentSubsets[node].findCountR(context,word);
            // if (!cnt || *cnt == 0) {
            Boolean found;
            ProbEntry *pe = bo_node->probs.find(word, found);
            if (!found) {
                const LogP lp = bgChildProbBO(word, context, nWrtwCip, specNum, node);
                const Prob p = LogPtoProb(lp);
                denominator += p;
            }
        }
    }
    // fprintf(stderr,"tmp_denominator = %f\n",tmp_denominator);
    // denominator = tmp_denominator;

    /*
     * Avoid some predictable anomalies due to rounding errors
     */
    if (numerator < 0.0 && numerator > -Prob_Epsilon) {
        numerator = 0.0;
    }

    if (numerator < 0.0) {
        cerr << "BOW numerator for context \"" << (vocab.use(), context)
             << "\" is " << numerator << " < 0\n";
        return false;
    } else {
        return true;
    }
}
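/*
 * Minimal, self-contained sketch of the same normalization (NOT part of the
 * original file; the function and parameter names below are hypothetical).
 * It assumes the explicit probabilities and the backoff-graph child's
 * probabilities have already been gathered into parallel arrays indexed by
 * vocabulary position, with explicitProb[i] < 0.0 standing for "word i has
 * no explicit probability in this context".  The real code instead iterates
 * a hash of explicit entries plus FactoredVocab::TagIter, and additionally
 * guards against rounding anomalies and a non-positive denominator.
 */
static double
generalBOWSketch(const double *explicitProb,   // p(x | c), or < 0.0 if absent
                 const double *childProbBO,    // p_BO(x | c) from the BG child
                 unsigned vocabSize)
{
    double numerator = 1.0;     // 1 - Sum p(x | c) over explicit entries
    double denominator = 0.0;   // Sum' p_BO(x | c) over words lacking explicit entries

    for (unsigned i = 0; i < vocabSize; i++) {
        if (explicitProb[i] >= 0.0) {
            numerator -= explicitProb[i];      // mass already assigned explicitly
        } else {
            denominator += childProbBO[i];     // mass the child gives to unseen words
        }
    }
    return numerator / denominator;            // BOW(c), in the probability domain
}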
/*
 * Recompute backoff weight for all contexts of a given order
 */
Boolean
FNgram::computeBOWs(unsigned int specNum, unsigned int node)
{
    Boolean result = true;

    /*
     * Note that this will only generate backoff nodes for those
     * contexts that have words with explicit probabilities.  But
     * this is precisely as it should be.
     */
    BOnode *bo_node;
    VocabIndex context[maxNumParentsPerChild + 2];

    // fprintf(stderr,"computebow size now = %d\n",numFNgrams(0,0));
    BOsIter iter1(*this, specNum, node, context);
    unsigned totalNumContexts = 0;

    if (fngs.fnSpecArray[specNum].parentSubsets[node].requiresGenBackoff()) {
        // print out a message since this will take a while.
        fprintf(stderr,
                "Starting estimation of general graph-backoff node: LM %d Node 0x%X, children:",
                specNum, node);
        FNgramSpecsType::FNgramSpec::BGChildIterCnstr
            citer(fngs.fnSpecArray[specNum].numParents, node,
                  fngs.fnSpecArray[specNum].parentSubsets[node].backoffConstraint);
        unsigned child;
        while (citer.next(child)) {
            if (fngs.fnSpecArray[specNum].parentSubsets[child].counts != NULL)
                fprintf(stderr, " 0x%X", child);
        }
        fprintf(stderr, "\n");

        if (totalNumContexts == 0) {
            // it is actually worth it to count the number of contexts here
            // to report an informative status message below.
            while (iter1.next()) {
                totalNumContexts++;
            }
            iter1.init();
        }
    }

    unsigned iter = 0;
    while ((bo_node = iter1.next())) {
        if (debug(DEBUG_BOWS) &&
            fngs.fnSpecArray[specNum].parentSubsets[node].numBGChildren > 1)
            fprintf(stderr,
                    "in computeBOWs, bo_node = 0x%X, specNum=%d, node = 0x%X, "
                    "context = 0x%X, *context = %d, iter = %d, cword = %s, nc = %d\n",
                    bo_node, specNum, node, context, *context, iter,
                    vocab.getWord(*context),
                    fngs.fnSpecArray[specNum].parentSubsets[node].numBGChildren);

        double numerator, denominator;

        if (fngs.fnSpecArray[specNum].parentSubsets[node].requiresGenBackoff() &&
            computeBOW(bo_node, context, node, specNum, node, numerator, denominator)) {
            /*
             * If unigram probs leave a non-zero probability mass
             * then we should give that mass to the zero-order (uniform)
             * distribution for zeroton words.  However, the ARPA
             * BO format doesn't support a "null context" BOW.
             * We simulate the intended distribution by spreading the
             * left-over mass uniformly over all vocabulary items that
             * have a zero probability.
             *
             * NOTE: We used to do this only if there was prob mass left,
             * but some ngram software requires all words to appear as
             * unigrams, which we achieve by giving them zero probability.
             *
             * NOTE2: since FNgramLM.cc doesn't use ARPA format files (it
             * uses ARPA format "inspired" files), this step really isn't
             * required any longer, as we could easily define a special
             * null context.  We nevertheless keep the same convention here
             * for the unigram.
             */
            if (node == 0 /*&& numerator > 0.0*/) {
                distributeProb(specNum, node, numerator, context, node);
            } else if (numerator == 0.0) {
                bo_node->bow = LogP_One;
            } else {
                bo_node->bow = ProbToLogP(numerator) - ProbToLogP(denominator);
                // fprintf(stderr,"got bow, num = %f, den = %f, bow = %f\n",
                //         numerator,denominator,bo_node->bow);
            }

            // print out a progress message since this will be taking a long time.
            if (iter % 1000 == 0)
                fprintf(stderr,
                        "Computing BOWs for LM %d, BG node 0x%X, Context num %d/%d [%.2f%%]\n",
                        specNum, node, iter, totalNumContexts,
                        100 * (double)iter / totalNumContexts);
        } else if (!fngs.fnSpecArray[specNum].parentSubsets[node].requiresGenBackoff() &&
                   computeBOW1child(bo_node, context, node, specNum, node,
                                    numerator, denominator)) {
            if (node == 0 /*&& numerator > 0.0*/) {
                distributeProb(specNum, node, numerator, context, node);
            } else if (numerator == 0.0 && denominator == 0) {
                bo_node->bow = LogP_One;
            } else {
                bo_node->bow = ProbToLogP(numerator) - ProbToLogP(denominator);
            }
        } else {
            /*
             * Dummy value for improper models
             */
            bo_node->bow = LogP_Zero;
            result = false;
        }
        iter++;
    }

    if (fngs.fnSpecArray[specNum].parentSubsets[node].numBGChildren > 1) {
        fprintf(stderr,
                "Finished estimation of multi-child graph-backoff node: LM %d Node 0x%X\n",
                specNum, node);
    }

    return result;
}
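/*
 * Worked example of the unigram (node == 0) convention above (illustrative
 * numbers only).  If the explicit unigram probabilities sum to 0.98, the
 * left-over mass handed to distributeProb() below is 0.02; with, say, four
 * vocabulary items of zero probability, each zeroton receives
 * 0.02 / 4 = 0.005 of probability.  If no word has zero probability, the
 * mass is instead added to all unigrams, as described in distributeProb()
 * itself.
 */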
void
FNgram::storeBOcounts(unsigned int specNum, unsigned int node)
{
    if (node == 0 || FNgramSpecsType::numBitsSet(node) == 1)
        return;

    VocabIndex context[maxNumParentsPerChild + 2];
    BOsIter iter1(*this, specNum, node, context);
    BOnode *bo_node;
    while ((bo_node = iter1.next())) {
        ProbsIter piter(*bo_node);
        VocabIndex word;
        unsigned int *cnt;
        LogP *prob;
        while ((prob = piter.next(word, cnt))) {
            // get count and store it
            // *cnt = count for this node.
        }
    }
}

/*
 * Renormalize language model by recomputing backoff weights.
 */
void
FNgram::recomputeBOWs()
{
    /*
     * Here it is important that we compute the backoff weights in
     * increasing BG level order, since the higher-order ones refer to the
     * lower-order ones in the backoff algorithm.
     * Note that this will only generate backoff nodes for those
     * contexts that have words with explicit probabilities.  But
     * this is precisely as it should be.
     */
    for (unsigned specNum = 0; specNum < fNgramsSize; specNum++) {
        const unsigned numParents = fngs.fnSpecArray[specNum].numParents;
        for (int level = 0; level <= (int)numParents; level++) {
            FNgramSpecsType::FNgramSpec::LevelIter iter(numParents, level);
            unsigned int node;
            while (iter.next(node)) {
                computeBOWs(specNum, node);
            }
        }
    }
}

/*
 * Redistribute probability mass over ngrams of given context
 */
void
FNgram::distributeProb(unsigned int specNum, unsigned int node, Prob mass,
                       VocabIndex *context, const unsigned nWrtwCip)
{
    /*
     * First enumerate the vocabulary to count the number of
     * items affected
     */
    unsigned numWords = 0;
    unsigned numZeroProbs = 0;

    const unsigned packedBits = bitGather(nWrtwCip, node);

    FactoredVocab::TagIter viter(*((FactoredVocab*)&vocab));
    VocabIndex word;
    while (viter.next(word)) {
        if (!vocab.isNonEvent(word) && !vocab.isMetaTag(word)) {
            numWords++;

            LogP *prob =
                fNgrams[specNum].parentSubsets[node].findProbSubCtx(word, context,
                                                                    packedBits);
            if (!prob || *prob == LogP_Zero) {
                numZeroProbs++;
            }
            /*
             * create zero probs so we can update them below
             */
            if (!prob) {
                *(fNgrams[specNum].parentSubsets[node].insertProbSubCtx(word, context,
                                                                        packedBits))
                    = LogP_Zero;
            }
        }
    }

    /*
     * If there are no zero-probability words
     * then we add the left-over prob mass to all unigrams.
     * Otherwise do as described above.
     */
    viter.init();
    if (numZeroProbs > 0) {
        if (debug(DEBUG_ESTIMATE_WARNINGS)) {
            cerr << "warning: distributing " << mass
                 << " left-over probability mass over " << numZeroProbs
                 << " zeroton words" << endl;
        }
        Prob add = mass / numZeroProbs;

        while (viter.ne