📄 fngramlm.cc
  unsigned int howmany = 0;
  BOsIter iter(*this,specNum,node,context);
  BOnode *bo_node;
  while (bo_node = iter.next()) {
    howmany += bo_node->probs.numEntries();
  }
  return howmany;
}

/*
 * Estimation
 */

/*
 * Count number of vocabulary items that get probability mass
 * for the current tag set.
 */
unsigned
FNgram::vocabSize()
{
  unsigned numWords = 0;
  FactoredVocab::TagIter viter(*((FactoredVocab*)&vocab));
  VocabIndex word;
  while (viter.next(word)) {
    if (!vocab.isNonEvent(word) && !vocab.isMetaTag(word)) {
      numWords ++;
    }
  }
  return numWords;
}

void
FNgram::estimate()
{
  for (unsigned specNum=0;specNum<fNgramsSize;specNum++) {
    estimate(specNum);
  }
  if (debug(DEBUG_ESTIMATE_LM))
    wordProbSum();
}

void
FNgram::estimate(const unsigned int specNum)
{
  assert ( specNum < fNgramsSize );

  // this is necessary so that estimate() doesn't smooth
  // over the wrong random variable value set.
  ((FactoredVocab*)&vocab)->setCurrentTagVocab(fngs.fnSpecArray[specNum].child);

  /*
   * For all ngrams, compute probabilities and apply the discount
   * coefficients.
   */
  VocabIndex context[maxNumParentsPerChild+2];
  unsigned vocabSize = FNgram::vocabSize();

  /*
   * Remove all old contexts
   */
  clear(specNum);

  /*
   * Ensure <s> unigram exists (being a non-event, it is not inserted
   * in distributeProb(), yet is assumed by much other software).
   */
  if (vocab.ssIndex() != Vocab_None) {
    context[0] = Vocab_None;
    *(fNgrams[specNum].parentSubsets[0].insertProb(vocab.ssIndex(), context)) = LogP_Zero;
  }

  // By convention, the lowest numeric level in the backoff graph (BG)
  // (level zero) has one node corresponding to all LM parents, so this
  // node has value (2^(numParents)-1) (all bits on). The highest numeric level
  // (level numParents) has one node with no LM parents, so this node
  // has value 0 (all bits off). All other BG nodes have some intermediary
  // number of bits, and the bit pattern determines the number of LM parents
  // that are currently active. See the ASCII pictures in FNgramSpecs.h.
  // We learn the LM by doing a reverse BG level level-iterator,
  // starting with the unigram, and moving up to the all LM-parents case.
  const unsigned numParents = fngs.fnSpecArray[specNum].numParents;
  for (int level=0;level<=(int)numParents;level++) {
    FNgramSpecsType::FNgramSpec::LevelIter iter(numParents,level);
    unsigned int node;
    while (iter.next(node)) {
      // check if backoff constraints turned off this node, if
      // so we don't learn it.
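      //
      // Illustration (added; the 3-parent case is hypothetical): for a model
      // C | P1 P2 P3 with P1 = bit0, P2 = bit1, P3 = bit2, the BG node
      // indices per level are
      //    level 0:  0b111 = 7              (all three parents active)
      //    level 1:  0b110, 0b101, 0b011    (two parents active)
      //    level 2:  0b100, 0b010, 0b001    (one parent active)
      //    level 3:  0b000 = 0              (no parents: the unigram node)
      // so numBitsSet(node) below is simply the number of active parents.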
      if (fngs.fnSpecArray[specNum].parentSubsets[node].counts == NULL)
        continue;

      const unsigned numBitsSetInNode = FNgramSpecsType::numBitsSet(node);

      unsigned noneventContexts = 0;
      unsigned noneventFNgrams = 0;
      unsigned discountedFNgrams = 0;

      /*
       * check if discounting is disabled for this round
       */
      Boolean noDiscount =
        (fngs.fnSpecArray[specNum].parentSubsets[node].discount == 0) ||
        fngs.fnSpecArray[specNum].parentSubsets[node].discount->nodiscount();
      // fprintf(stderr,"noDiscount = %d\n",noDiscount);

      Boolean interpolate =
        (fngs.fnSpecArray[specNum].parentSubsets[node].discount != 0) &&
        fngs.fnSpecArray[specNum].parentSubsets[node].discount->interpolate;

      /*
       * assume counts are already "prepared" (which is always true for factored counts)
       */

      /*
       * This enumerates all parent set contexts, i.e., i-1 "grams"
       */
      FNgramCount *contextCount;
      assert ( fngs.fnSpecArray[specNum].parentSubsets[node].order > 0 );
      FNgramSpecsType::FNgramSpec::PSIter
        contextIter(fngs.fnSpecArray[specNum].parentSubsets[node],
                    context,
                    fngs.fnSpecArray[specNum].parentSubsets[node].order-1);
      while (contextCount = contextIter.next()) {
        /*
         * If <unk> is not a real word, skip contexts that contain
         * it. Note that the original code checked for </s> here in
         * the context. We do not do this here because:
         *  1) count tries here are for conditional distributions where
         *     the leaves of the trie are the child, and the non-leaves
         *     are a set of parent variables.
         *  2) a context might contain a </s>, say in P(M_t|R_t,R_t-1)
         *     where the context comes from the same time point.
         *  3) It is not clear why you would need to do doubling here
         *     since short word strings are effectively elongated
         *     in FNgramCounts<CountT>::countSentence() by
         *     having any context that goes earlier than the beginning
         *     of the sentence be replaced with <s> (so we can deal
         *     with sentences shorter than the gram temporal width,
         *     but this is only the case if the -no-virtual-begin-sentence
         *     option is not set).
         * Note also that this means that this code will have
         * fewer pseudo-events when comparing with ngram-count.cc.
         */
        if (vocab.isNonEvent(vocab.unkIndex()) &&
            vocab.contains(context, vocab.unkIndex()))
        {
          noneventContexts ++;
          continue;
        }

        VocabIndex word[2];	/* the follow word */
        FNgramSpecsType::FNgramSpec::PSIter
          followIter(fngs.fnSpecArray[specNum].parentSubsets[node],
                     context,word,1);
        FNgramCount *ngramCount;

        /*
         * Total up the counts for the denominator
         * (the lower-order counts are not consistent with
         * the higher-order ones, so we can't just use *contextCount).
         * I.e., we assume here that the trustTotal flag is always false.
         */
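        /*
         * (Added illustration; counts are hypothetical.)  If the follow
         * words under this context have counts {3, 1, 1}, the loop below
         * yields
         *    totalCount    = 5   (denominator of the ML estimate)
         *    observedVocab = 3   (distinct follow words seen)
         *    min2Vocab     = 1   (follow words seen at least twice)
         *    min3Vocab     = 1   (follow words seen at least three times)
         * which are exactly the statistics handed to the Discount method
         * (discount() and lowerOrderWeight()) further below.
         */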
        FNgramCount totalCount = 0;
        Count observedVocab = 0, min2Vocab = 0, min3Vocab = 0;

        while (ngramCount = followIter.next()) {
          if (vocab.isNonEvent(word[0]) ||
              ngramCount == 0 ||
              node == 0 && vocab.isMetaTag(word[0]))
          {
            continue;
          }

          if (!vocab.isMetaTag(word[0])) {
            totalCount += *ngramCount;
            observedVocab ++;
            if (*ngramCount >= 2) {
              min2Vocab ++;
            }
            if (*ngramCount >= 3) {
              min3Vocab ++;
            }
          } else {
            /*
             * Process meta-counts
             */
            unsigned type = vocab.typeOfMetaTag(word[0]);
            if (type == 0) {
              /*
               * a count total: just add to the totalCount;
               * the corresponding type count can't be known,
               * but it has to be at least 1
               */
              totalCount += *ngramCount;
              observedVocab ++;
            } else {
              /*
               * a count-of-count: increment the word type counts,
               * and infer the totalCount
               */
              totalCount += type * *ngramCount;
              observedVocab += (Count)*ngramCount;
              if (type >= 2) {
                min2Vocab += (Count)*ngramCount;
              }
              if (type >= 3) {
                min3Vocab += (Count)*ngramCount;
              }
            }
          }
        }

        if (totalCount == 0) {
          continue;
        }

        /*
         * Reverse the context ngram since that's how
         * the BO nodes are indexed. More specifically,
         * normal count tries are organized as follows:
         *    x1 y1 z1 c
         *    x1 y1 z2 c
         *    ...
         *    x1 y1 zn c
         *    x1 y2 z1 c
         *    x1 y2 z2 c
         *    ...
         *    x1 y2 zn c
         * and so on, where c is the count for the context,
         * and where z follows y which follows x in the input
         * files, so we might want to compute p(z|x,y). I.e.,
         * the x's are at the root, and the z's are at the leaves
         * of the count trie.
         *
         * When we get one of these 'context' wid strings, we are
         * iterating over the x_i and y_i's that exist in the
         * corresponding trie object (the z's are the follow iter above).
         *
         * When LM tries are created, however, we insert them using
         * a reversed context, so what gets inserted (and used to index
         * the trie) is "y x", and then z is in the hash table at that node.
         *
         * In the flm file, we've got
         *
         *    C | P1 P2 P3
         *
         * here, P1 = bit0, P2 = bit1, P3 = bit2,
         * where a bit vector is organized in an integer as:
         *
         *    [... bit2 bit1 bit0]
         *
         * In the count file, contexts are given as
         *
         *    word[0] = P3, word[1] = P2, word[2] = P1  [ word[3] = C ]
         *            = bit2         bit1          bit0
         *
         * when we reverse the context, we've got
         *
         *    word[0] = P1, word[1] = P2, word[2] = P3
         *            = bit0         bit1          bit2
         *
         * or if we include the child,
         *
         *    word[0] = C, word[1] = P1, word[2] = P2, word[3] = P3
         *                         = bit0         bit1          bit2
         *
         * In a normal LM (as is done in NgramLM), the reason for
         * reversing things in the LM but not in the counts (I
         * believe) is that it is faster to backoff to a parent
         * node in the trie rather than having to re-index down
         * along a completely different path to another node in
         * the trie.
         *
         * For a general graph backoff, however, this reversal
         * isn't really necessary since we need to index up into
         * an entirely different LM trie when we backoff, so the
         * speed advantage is lost. We keep doing the reversal
         * here, however, just to stay consistent with the
         * code in NgramLM.cc. This means, however, that
         * we need to reverse the wid count lookup routines (as
         * defined in NgramSpecs.h).
         */
        Vocab::reverse(context);

        /*
         * Compute the discounted probabilities
         * from the counts and store them in the backoff model.
         */
      retry:
        followIter.init();
        Prob totalProb = 0.0;

        while (ngramCount = followIter.next()) {
          LogP lprob;
          double discountCoeff;

          /*
           * Zero counts:
           * officially, we shouldn't see this for an FLM count trie,
           * except possibly at node 0.
           */
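          /*
           * (Added overview; the numbers are hypothetical.)  For a real
           * follow word, the remainder of this loop body computes
           *    prob = discountCoeff * (*ngramCount) / totalCount
           *           [+ lowerOrderWeight * LogPtoProb(lowerOrderProb),
           *            when the Discount method interpolates]
           * e.g. *ngramCount = 3, totalCount = 5, discountCoeff = 0.8,
           * lowerOrderWeight = 0.2 and a lower-order prob of 0.1 give
           * 0.8 * 3/5 + 0.2 * 0.1 = 0.5, converted to a log probability
           * via ProbToLogP().
           */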
          if (node != 0 && *ngramCount == 0) {
            fprintf(stderr,"WARNING: FNgramLM: node = 0x%X, *ngramCount = %d\n",
                    node, (unsigned)*ngramCount);
            cerr << "context: " << (vocab.use(), context) << endl;
            cerr << "word: " << word << endl;
            // assert (0);
            // NOTE: if this happens, check the count
            // modification code in discount.cc. This might
            // also be a result of running fngram-count with
            // the "-no-virtual-begin-sentence" option.
          }

          if (vocab.isNonEvent(word[0]) || vocab.isMetaTag(word[0])) {
            /*
             * Discard all pseudo-word probabilities,
             * except for unigrams. For unigrams, assign
             * probability zero. This will leave them with
             * prob zero in all cases, due to the backoff
             * algorithm.
             * Also discard the <unk> token entirely in closed
             * vocab models; its presence would prevent OOV
             * detection when the model is read back in.
             */
            if (node != 0 ||
                word[0] == vocab.unkIndex() ||
                vocab.isMetaTag(word[0]))
            {
              noneventFNgrams ++;
              continue;
            }

            lprob = LogP_Zero;
            discountCoeff = 1.0;
          } else {
            /*
             * The discount array passed may contain 0 elements
             * to indicate no discounting at this order.
             */
            if (noDiscount) {
              discountCoeff = 1.0;
              // fprintf(stderr,"setting discount to 1.0\n");
            } else {
              discountCoeff =
                fngs.fnSpecArray[specNum].parentSubsets[node].discount->
                  discount(*ngramCount, totalCount, observedVocab);
              // fprintf(stderr,"*ngramCount = %d, setting discount to %f\n",
              //         *ngramCount,discountCoeff);
            }
            Prob prob = (discountCoeff * *ngramCount) / totalCount;

            /*
             * For interpolated estimates we compute the weighted
             * linear combination of the high-order estimate
             * (computed above) and the lower-order estimate.
             * The high-order weight is given by the discount factor,
             * the lower-order weight is obtained from the Discount
             * method (it may be 0 if the method doesn't support
             * interpolation).
             */
            double lowerOrderWeight;
            LogP lowerOrderProb;
            if (interpolate) {
              lowerOrderWeight =
                fngs.fnSpecArray[specNum].parentSubsets[node].discount->
                  lowerOrderWeight(totalCount, observedVocab, min2Vocab, min3Vocab);
              if (node > 0) {
                lowerOrderProb = bgChildProbBO(word[0],context,node,specNum,node);
              } else {
                lowerOrderProb = (LogP)(- log10((double)vocabSize));
              }

              prob += lowerOrderWeight * LogPtoProb(lowerOrderProb);
            }
            if (discountCoeff != 0.0) {
              totalProb += prob;
            }

            lprob = ProbToLogP(prob);
            if (discountCoeff != 0.0 && debug(DEBUG_ESTIMATES)) {
              dout() << "CONTEXT " << (vocab.use(), context)
                     << " WORD " << vocab.getWord(word[0])
                     << " NUMER " << *ngramCount
                     << " DENOM " << totalCount
                     << " DISCOUNT " << discountCoeff;

              if (interpolate) {
                dout() << " LOW " << lowerOrderWeight
                       << " LOLPROB " << lowerOrderProb;
              }
              dout() << " LPROB " << lprob << endl;
            }
          }

          /*
           * A discount coefficient of zero indicates this ngram
           * should be omitted entirely (to save space and to
           * ensure that the BOW code below works).
           */
          if (discountCoeff == 0.0) {
            discountedFNgrams ++;
            fNgrams[specNum].parentSubsets[node].removeProb(word[0],context);
          } else {
            // fprintf(stderr,"inserting word %d\n",word[0]);
            *(fNgrams[specNum].parentSubsets[node].insertProb(word[0],context)) = lprob;
            // fprintf(stderr,"size now = %d\n",numFNgrams(specNum,node));
          }
        }

        /*
         * This is a hack credited to Doug Paul (by Roni Rosenfeld in
         * his CMU tools). It may happen that no probability mass
         * is left after totalling all the explicit probs, typically
         * because the discount coefficients were out of range and
         * forced to 1.0. To arrive at some non-zero backoff mass
         * we try incrementing the denominator in the estimator by 1.
         */
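        /*
         * (Added illustration; the numbers are hypothetical.)  Suppose
         * totalCount = 4 and the discounted probs above sum to
         * totalProb = 1.02, i.e. more than 1 - Prob_Epsilon.  The check
         * below then bumps the denominator to 5; each recomputed estimate
         * shrinks (by exactly 4/5 when its discount factor is unchanged),
         * and in the non-interpolated case about 1 - 1.02*4/5 = 0.184 of
         * probability mass becomes available for backoff.  The 'retry:'
         * label above exists so the estimation loop can be re-run with
         * the larger denominator.
         */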
        if (!noDiscount && totalCount > 0 &&
            totalProb > 1.0 - Prob_Epsilon)
        {
          totalCount += 1;

          if (debug(DEBUG_ESTIMATE_WARNINGS)) {
            cerr << "warning: " << (1.0 - totalProb)
                 << " backoff probability mass left for " <<
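/*
 * (Added, for illustration only -- not part of FNgramLM.cc.)  A minimal
 * standalone sketch of the per-context computation performed by the loops
 * above: turn raw follow-word counts into discounted conditional
 * probabilities plus the leftover mass that backoff receives.  It assumes
 * a single fixed absolute discount D in place of SRILM's Discount objects,
 * and a std::map in place of the count tries; every name below is
 * hypothetical.
 */
#include <cstdio>
#include <map>
#include <string>

int main()
{
    // hypothetical follow-word counts observed for one fixed context
    std::map<std::string, unsigned> counts;
    counts["a"] = 3;
    counts["b"] = 1;
    counts["c"] = 1;

    const double D = 0.5;   // absolute discount taken from each observed type

    // denominator and type statistics, as in the totalCount/observedVocab loop
    unsigned totalCount = 0, observedVocab = 0;
    for (const auto &wc : counts) {
        totalCount += wc.second;
        observedVocab++;
    }

    // discounted estimates: p(w|ctx) = (c(w) - D) / totalCount
    double totalProb = 0.0;
    for (const auto &wc : counts) {
        double prob = (wc.second - D) / totalCount;
        totalProb += prob;
        std::printf("p(%s|ctx) = %.3f\n", wc.first.c_str(), prob);
    }

    // probability mass not assigned to observed words; in the real model it
    // is redistributed over the lower-order (backed-off) distribution
    double backoffMass = 1.0 - totalProb;   // equals D * observedVocab / totalCount
    std::printf("backoff mass = %.3f\n", backoffMass);

    return 0;
}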