/* ngramlm.cc */
double numerator, denominator;
if (computeBOW(node, context, order, numerator, denominator)) {
/*
* If unigram probs leave a non-zero probability mass
* then we should give that mass to the zero-order (uniform)
* distribution for zeroton words. However, the ARPA
* BO format doesn't support a "null context" BOW.
* We simulate the intended distribution by spreading the
* left-over mass uniformly over all vocabulary items that
* have a zero probability.
* NOTE: We used to do this only if there was prob mass left,
* but some ngram software requires all words to appear as
* unigrams, which we achieve by giving them zero probability.
*/
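/*
 * The last branch below applies the standard backoff weight formula,
 * where h is the current context and h' is h shortened by one word:
 *
 *	BOW(h) = (1 - sum_w p(w|h)) / (1 - sum_w p(w|h'))
 *
 * with both sums running over the words w that have explicit probabilities
 * in context h, i.e., log BOW(h) = log(numerator) - log(denominator).
 */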
if (order == 0 /*&& numerator > 0.0*/) {
distributeProb(numerator, context);
} else if (numerator == 0.0 && denominator == 0) {
node->bow = LogP_One;
} else {
node->bow = ProbToLogP(numerator) - ProbToLogP(denominator);
}
} else {
/*
* Dummy value for improper models
*/
node->bow = LogP_Zero;
result = false;
}
}
return result;
}
/*
* Renormalize language model by recomputing backoff weights.
*/
void
Ngram::recomputeBOWs()
{
/*
* Here it is important that we compute the backoff weights in
* increasing order, since the higher-order ones refer to the
* lower-order ones in the backoff algorithm.
* Note that this will only generate backoff nodes for those
* contexts that have words with explicit probabilities. But
* this is precisely as it should be.
*/
for (unsigned i = 0; i < order; i++) {
computeBOWs(i);
}
}
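/*
 * A minimal usage sketch of the renormalization entry point, assuming the
 * standard SRILM Vocab/File interfaces; model order and file names are
 * illustrative only:
 *
 *	Vocab vocab;
 *	Ngram lm(vocab, 3);
 *
 *	File in("input.lm", "r");
 *	lm.read(in);
 *
 *	// ... modify explicit ngram probabilities in place ...
 *
 *	lm.recomputeBOWs();		// refresh all backoff weights
 *
 *	File out("renormalized.lm", "w");
 *	lm.write(out);
 */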
/*
* Prune probabilities from model so that the change in training set perplexity
* is below a threshold.
* The minorder parameter limits pruning to ngrams of that length and above.
*/
void
Ngram::pruneProbs(double threshold, unsigned minorder)
{
/*
* Hack alert: allocate the context buffer for NgramBOsIter, but leave
* room for a word id to be prepended.
*/
makeArray(VocabIndex, wordPlusContext, order + 2);
VocabIndex *context = &wordPlusContext[1];
for (unsigned i = order - 1; i > 0 && i >= minorder - 1; i--) {
unsigned prunedNgrams = 0;
BOnode *node;
NgramBOsIter iter1(*this, context, i);
while (node = iter1.next()) {
LogP bow = node->bow; /* old backoff weight, BOW(h) */
double numerator, denominator;
/*
* Compute numerator and denominator of the backoff weight,
* so that we can quickly compute the BOW adjustment due to
* leaving out one prob.
*/
if (!computeBOW(node, context, i, numerator, denominator)) {
continue;
}
/*
* Compute the marginal probability of the context, P(h)
*/
LogP cProb = contextProb(context, i);
NgramProbsIter piter(*node);
VocabIndex word;
LogP *ngramProb;
Boolean allPruned = true;
while (ngramProb = piter.next(word)) {
/*
* lower-order estimate for ngramProb, P(w|h')
*/
LogP backoffProb = wordProbBO(word, context, i - 1);
/*
* Compute BOW after removing ngram, BOW'(h)
*/
LogP newBOW =
ProbToLogP(numerator + LogPtoProb(*ngramProb)) -
ProbToLogP(denominator + LogPtoProb(backoffProb));
/*
* Compute change in entropy due to removal of ngram
* deltaH = - P(H) x
* {P(W | H) [log P(w|h') + log BOW'(h) - log P(w|h)] +
* (1 - \sum_{v,h ngrams} P(v|h)) [log BOW'(h) - log BOW(h)]}
*
* (1-\sum_{v,h ngrams}) is the probability mass left over from
* ngrams of the current order, and is the same as the
* numerator in BOW(h).
*/
LogP deltaProb = backoffProb + newBOW - *ngramProb;
Prob deltaEntropy = - LogPtoProb(cProb) *
(LogPtoProb(*ngramProb) * deltaProb +
numerator * (newBOW - bow));
/*
* compute relative change in model (training set) perplexity
* (PPL' - PPL)/PPL = PPL'/PPL - 1
* = exp(H')/exp(H) - 1
* = exp(H' - H) - 1
*/
double perpChange = LogPtoProb(deltaEntropy) - 1.0;
Boolean pruned = threshold > 0 && perpChange < threshold;
/*
* Make sure we don't prune ngrams whose backoff nodes are
* needed ...
*/
if (pruned) {
/*
* wordPlusContext[1 ... i] was already filled in by
* NgramBOsIter. Just add the word to complete the ngram.
*/
wordPlusContext[0] = word;
if (findBOW(wordPlusContext)) {
pruned = false;
}
}
if (debug(DEBUG_ESTIMATES)) {
dout() << "CONTEXT " << (vocab.use(), context)
<< " WORD " << vocab.getWord(word)
<< " CONTEXTPROB " << cProb
<< " OLDPROB " << *ngramProb
<< " NEWPROB " << (backoffProb + newBOW)
<< " DELTA-H " << deltaEntropy
<< " DELTA-LOGP " << deltaProb
<< " PPL-CHANGE " << perpChange
<< " PRUNED " << pruned
<< endl;
}
if (pruned) {
removeProb(word, context);
prunedNgrams ++;
} else {
allPruned = false;
}
}
/*
* If we removed all ngrams for this context we can
* remove the context itself.
* Note: Due to the check above this also means there
* are no contexts that extend the current one, so
* removing this node won't leave any children orphaned.
*/
if (allPruned) {
removeBOW(context);
}
}
if (debug(DEBUG_ESTIMATE_WARNINGS)) {
if (prunedNgrams > 0) {
dout() << "pruned " << prunedNgrams << " "
<< (i + 1) << "-grams\n";
}
}
}
recomputeBOWs();
}
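/*
 * Usage sketch for the entropy-based pruning above, continuing the client
 * code sketched after recomputeBOWs(); the threshold and minimum order are
 * illustrative.  An ngram is removed when dropping it changes training-set
 * perplexity by less than the given relative threshold.
 *
 *	lm.pruneProbs(1e-8, 2);		// prune bigrams and longer ngrams
 *
 *	File pruned("pruned.lm", "w");
 *	lm.write(pruned);
 */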
/*
* Prune low probability N-grams
* Removes all N-gram probabilities that are lower than the
* corresponding backed-off estimates. This is a crucial preprocessing
* step for N-gram models before conversion to finite-state networks.
*/
void
Ngram::pruneLowProbs(unsigned minorder)
{
makeArray(VocabIndex, context, order);
Boolean havePruned;
/*
* Since pruning changes backoff weights we have to keep pruning
* iteratively as long as N-grams keep being removed.
*/
do {
havePruned = false;
for (unsigned i = minorder - 1; i < order; i++) {
unsigned prunedNgrams = 0;
BOnode *node;
NgramBOsIter iter1(*this, context, i);
while (node = iter1.next()) {
LogP bow = node->bow; /* old backoff weight, BOW(h) */
NgramProbsIter piter(*node);
VocabIndex word;
LogP *ngramProb;
while (ngramProb = piter.next(word)) {
/*
* lower-order estimate for ngramProb, P(w|h')
*/
LogP backoffProb = wordProbBO(word, context, i - 1);
if (backoffProb + bow > *ngramProb) {
if (debug(DEBUG_ESTIMATES)) {
dout() << "CONTEXT " << (vocab.use(), context)
<< " WORD " << vocab.getWord(word)
<< " LPROB " << *ngramProb
<< " BACKOFF-LPROB " << (backoffProb + bow)
<< " PRUNED\n";
}
removeProb(word, context);
prunedNgrams ++;
}
}
}
if (prunedNgrams > 0) {
havePruned = true;
if (debug(DEBUG_ESTIMATE_WARNINGS)) {
dout() << "pruned " << prunedNgrams << " "
<< (i + 1) << "-grams\n";
}
}
}
recomputeBOWs();
} while (havePruned);
fixupProbs();
}
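/*
 * Usage sketch with an illustrative minimum order: drop all bigram and
 * higher-order probabilities that fall below their backed-off estimates,
 * e.g. before converting the model to a finite-state network.
 *
 *	lm.pruneLowProbs(2);
 */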
/*
* Reassign ngram probabilities using a different LM
*/
void
Ngram::rescoreProbs(LM &lm)
{
makeArray(VocabIndex, context, order + 1);
for (int i = order - 1; i >= 0 ; i--) {
BOnode *node;
NgramBOsIter iter(*this, context, i);
/*
* Enumerate all explicit ngram probs in *this, and recompute their
* probs using the supplied LM.
*/
while (node = iter.next()) {
NgramProbsIter piter(*node);
VocabIndex word;
LogP *prob;
while (prob = piter.next(word)) {
*prob = lm.wordProb(word, context);
}
}
}
recomputeBOWs();
}
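/*
 * Usage sketch: reassign every explicit probability from another model
 * (any LM subclass; "otherLM" is an illustrative name), then renormalize
 * the backoff weights.
 *
 *	lm.rescoreProbs(otherLM);
 */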
/*
* Insert probabilities for all context ngrams
*
* The ARPA format forces us to have a probability
* for every context ngram. So we create one if necessary and
* set the probability to the same as what the backoff algorithm
* would compute (so as not to change the distribution).
*/
void
Ngram::fixupProbs()
{
makeArray(VocabIndex, context, order + 1);
/*
* we cannot insert entries into the context trie while we're
* iterating over it, so we need another trie to collect
* the affected contexts first. yuck.
* Note: We use the same trie type as in the Ngram model itself,
* so we don't need to worry about another template instantiation.
*/
Trie<VocabIndex,NgramCount> contextsToAdd;
/*
* Find the contexts xyz that don't have a probability p(z|xy) in the
* model already.
*/
unsigned i;
for (i = 1; i < order; i++) {
NgramBOsIter iter(*this, context, i);
while (iter.next()) {
/*
* For a context abcd we need to create probability entries
* p(d|abc), p(c|ab), p(b|a), p(a) if necessary.
* If any of these is found in that order we can stop since a
* previous pass will already have created the remaining ones.
*/
for (unsigned j = 0; j < i; j++) {
LogP *prob = findProb(context[j], &context[j+1]);
if (prob) {
break;
} else {
/*
* Just store something non-zero to distinguish from
* internal nodes that are added implicitly.
*/
*contextsToAdd.insert(&context[j]) = 1;
}
}
}
}
/*
* Store the missing probs
*/
for (i = 1; i < order; i++) {
unsigned numFakes = 0;
Trie<VocabIndex,NgramCount> *node;
TrieIter2<VocabIndex,NgramCount> iter(contextsToAdd, context, i);
while (node = iter.next()) {
if (node->value()) {
numFakes ++;
/*
* Note: we cannot combine the two statements below
* since insertProb() creates a zero prob entry, which would
* prevent wordProbBO() from computing the backed-off
* estimate!
*/
LogP backoffProb = wordProbBO(context[0], &context[1], i - 1);
*insertProb(context[0], &(context[1])) = backoffProb;
if (debug(DEBUG_FIXUP_WARNINGS)) {
dout() << "faking probability for context "
<< (vocab.use(), context) << endl;
}
}
}
if (debug(DEBUG_ESTIMATE_WARNINGS)) {
if (numFakes > 0) {
dout() << "inserted " << numFakes << " redundant "
<< i << "-gram probs\n";
}
}
}
}
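/*
 * Illustration of the fixup above: in a trigram model that stores some
 * p(w | a b), the bigram "a b" acts as a backoff context, so the ARPA file
 * must also contain an explicit bigram entry p(b | a).  If the estimator
 * never created one, fixupProbs() inserts p(b | a) = p_BO(b | a), the
 * backed-off value, which leaves the modeled distribution unchanged.
 */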
/*
* Redistribute probability mass over ngrams of given context
*/
void
Ngram::distributeProb(Prob mass, VocabIndex *context)
{
/*
* First enumerate the vocabulary to count the number of
* items affected
*/
unsigned numWords = 0;
unsigned numZeroProbs = 0;
VocabIter viter(vocab);
VocabIndex word;
while (viter.next(word)) {
if (!vocab.isNonEvent(word) && !vocab.isMetaTag(word)) {
numWords ++;
LogP *prob = findProb(word, context);
if (!prob || *prob == LogP_Zero) {
numZeroProbs ++;
}
/*
* create zero probs so we can update them below
*/
if (!prob) {
*insertProb(word, context) = LogP_Zero;
}
}
}
/*
* If there are no zero-probability words
* then we add the left-over prob mass to all unigrams.
* Otherwise spread it uniformly over the zero-probability (zeroton) words only.
*/
viter.init();
if (numZeroProbs > 0) {
if (debug(DEBUG_ESTIMATE_WARNINGS)) {
cerr << "warning: distributing " << mass
<< " left-over probability mass over "
<< numZeroProbs << " zeroton words" << endl;
}
Prob add = mass / numZeroProbs;
while (viter.next(word)) {
if (!vocab.isNonEvent(word) && !vocab.isMetaTag(word)) {
LogP *prob = insertProb(word, context);
if (*prob == LogP_Zero) {
*prob = ProbToLogP(add);
}
}
}
} else {
if (mass > 0.0 && debug(DEBUG_ESTIMATE_WARNINGS)) {
cerr << "warning: distributing " << mass
<< " left-over probability mass over all "
<< numWords << " words" << endl;
}
Prob add = mass / numWords;
while (viter.next(word)) {
if (!vocab.isNonEvent(word) && !vocab.isMetaTag(word)) {
LogP *prob = insertProb(word, context);
*prob = ProbToLogP(LogPtoProb(*prob) + add);
}
}
}
}
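/*
 * Worked example with illustrative numbers: if the explicit unigrams sum to
 * 0.98, the left-over mass is 0.02.  With 4 zeroton words in the vocabulary,
 * each zeroton receives p = 0.02 / 4 = 0.005; if there are no zerotons at
 * all, 0.02 / numWords is instead added to every word's existing probability.
 */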