/* ngramlm.cc */
CountType totalCount = 0;
Count observedVocab = 0, min2Vocab = 0, min3Vocab = 0;
while (ngramCount = followIter.next()) {
if (vocab.isNonEvent(word[0]) ||
ngramCount == 0 ||
(i == 1 && vocab.isMetaTag(word[0])))
{
continue;
}
if (!vocab.isMetaTag(word[0])) {
totalCount += *ngramCount;
observedVocab ++;
if (*ngramCount >= 2) {
min2Vocab ++;
}
if (*ngramCount >= 3) {
min3Vocab ++;
}
} else {
/*
* Process meta-counts
*/
unsigned type = vocab.typeOfMetaTag(word[0]);
if (type == 0) {
/*
* a count total: just add to the totalCount
* the corresponding type count can't be known,
* but it has to be at least 1
*/
totalCount += *ngramCount;
observedVocab ++;
} else {
/*
* a count-of-count: increment the word type counts,
* and infer the totalCount
*/
totalCount += type * *ngramCount;
observedVocab += (Count)*ngramCount;
if (type >= 2) {
min2Vocab += (Count)*ngramCount;
}
if (type >= 3) {
min3Vocab += (Count)*ngramCount;
}
}
}
}
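/*
 * For illustration (numbers made up): a count-of-count meta-tag of
 * type 2 with a stored value of 5 means "5 distinct word types were
 * each seen about twice", so the loop above adds 2*5 = 10 to
 * totalCount, 5 to observedVocab, and 5 to min2Vocab.
 */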
if (i > 1 && trustTotals()) {
totalCount = *contextCount;
}
if (totalCount == 0) {
continue;
}
/*
* reverse the context ngram since that's how
* the BO nodes are indexed.
*/
Vocab::reverse(context);
/*
* Compute the discounted probabilities
* from the counts and store them in the backoff model.
*/
retry:
followIter.init();
Prob totalProb = 0.0;
while (ngramCount = followIter.next()) {
LogP lprob;
double discount;
/*
* Ignore zero counts.
* They are there just as an artifact of the count trie
* if a higher order ngram has a non-zero count.
*/
if (i > 1 && *ngramCount == 0) {
continue;
}
if (vocab.isNonEvent(word[0]) || vocab.isMetaTag(word[0])) {
/*
* Discard all pseudo-word probabilities,
* except for unigrams. For unigrams, assign
* probability zero. This will leave them with
* prob zero in all cases, due to the backoff
* algorithm.
* Also discard the <unk> token entirely in closed
* vocab models; its presence would prevent OOV
* detection when the model is read back in.
*/
if (i > 1 ||
word[0] == vocab.unkIndex() ||
vocab.isMetaTag(word[0]))
{
noneventNgrams ++;
continue;
}
lprob = LogP_Zero;
discount = 1.0;
} else {
/*
* The discount array passed may contain 0 elements
* to indicate no discounting at this order.
*/
if (noDiscount) {
discount = 1.0;
} else {
discount =
discounts[i-1]->discount(*ngramCount, totalCount,
observedVocab);
}
Prob prob = (discount * *ngramCount) / totalCount;
/*
* For interpolated estimates we compute the weighted
* linear combination of the high-order estimate
* (computed above) and the lower-order estimate.
* The high-order weight is given by the discount factor,
* the lower-order weight is obtained from the Discount
* method (it may be 0 if the method doesn't support
* interpolation).
*/
double lowerOrderWeight;
LogP lowerOrderProb;
if (interpolate) {
lowerOrderWeight =
discounts[i-1]->lowerOrderWeight(totalCount,
observedVocab,
min2Vocab,
min3Vocab);
if (i > 1) {
lowerOrderProb = wordProbBO(word[0], context, i-2);
} else {
lowerOrderProb = (LogP)(- log10((double)vocabSize));
}
prob += (Prob)(lowerOrderWeight * LogPtoProb(lowerOrderProb));
}
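/*
 * At this point prob holds
 *
 *	discount * c(w, context) / totalCount
 *	    + lowerOrderWeight * p_lower(w | shortened context)
 *
 * where the second term is present only when interpolating.
 */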
lprob = ProbToLogP(prob);
if (discount != 0.0) {
totalProb += prob;
}
if (discount != 0.0 && debug(DEBUG_ESTIMATES)) {
dout() << "CONTEXT " << (vocab.use(), context)
<< " WORD " << vocab.getWord(word[0])
<< " NUMER " << *ngramCount
<< " DENOM " << totalCount
<< " DISCOUNT " << discount;
if (interpolate) {
dout() << " LOW " << lowerOrderWeight
<< " LOLPROB " << lowerOrderProb;
}
dout() << " LPROB " << lprob << endl;
}
}
/*
* A discount coefficient of zero indicates this ngram
* should be omitted entirely (presumably to save space).
*/
if (discount == 0.0) {
discountedNgrams ++;
removeProb(word[0], context);
} else {
*insertProb(word[0], context) = lprob;
}
}
/*
* This is a hack credited to Doug Paul (by Roni Rosenfeld in
* his CMU tools). It may happen that no probability mass
* is left after totalling all the explicit probs, typically
* because the discount coefficients were out of range and
* forced to 1.0. To arrive at some non-zero backoff mass
* we try incrementing the denominator in the estimator by 1.
*/
if (!noDiscount && totalCount > 0 &&
totalProb > 1.0 - Prob_Epsilon)
{
totalCount += 1;
if (debug(DEBUG_ESTIMATE_WARNINGS)) {
cerr << "warning: " << (1.0 - totalProb)
<< " backoff probability mass left for \""
<< (vocab.use(), context)
<< "\" -- incrementing denominator"
<< endl;
}
goto retry;
}
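/*
 * Example: if the discounted probs summed to exactly 1.0 with
 * denominator N, re-estimating with denominator N+1 shrinks each
 * prob by a factor of N/(N+1), leaving roughly 1/(N+1) of probability
 * mass for the backoff distribution.
 */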
/*
* Undo the reversal above so the iterator can continue correctly
*/
Vocab::reverse(context);
}
if (debug(DEBUG_ESTIMATE_WARNINGS)) {
if (noneventContexts > 0) {
dout() << "discarded " << noneventContexts << " "
<< i << "-gram contexts containing pseudo-events\n";
}
if (noneventNgrams > 0) {
dout() << "discarded " << noneventNgrams << " "
<< i << "-gram probs predicting pseudo-events\n";
}
if (discountedNgrams > 0) {
dout() << "discarded " << discountedNgrams << " "
<< i << "-gram probs discounted to zero\n";
}
}
/*
* With all the probs in place, BOWs are obtained simply by the usual
* normalization.
* We do this right away before computing probs of higher order since
* the estimation of higher-order N-grams can refer to lower-order
* ones (e.g., for interpolated estimates).
*/
computeBOWs(i-1);
}
fixupProbs();
return true;
}
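/*
 * A minimal sketch (not part of SRILM) of the per-ngram estimate computed
 * in estimate2() above, reduced to plain doubles.  The function name and
 * parameter names are hypothetical; they only illustrate the arithmetic.
 */
static inline double
estimateOneProbSketch(double ngramCount, double totalCount,
                      double discount,
                      double lowerOrderWeight, double lowerOrderProb)
{
    /* high-order relative frequency, scaled by the discount factor */
    double prob = (discount * ngramCount) / totalCount;
    /* optional interpolation with the linear-domain lower-order estimate */
    prob += lowerOrderWeight * lowerOrderProb;
    return prob;
}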
Boolean
Ngram::estimate(NgramStats &stats, Discount **discounts)
{
return estimate2(stats, discounts);
}
Boolean
Ngram::estimate(NgramCounts<FloatCount> &stats, Discount **discounts)
{
return estimate2(stats, discounts);
}
/*
* Mixing backoff models
* the first version mixes an existing model destructively with
* a second one
* the second version mixes two existing models non-destructively leaving
* the result in *this.
* lambda is the weight of the first input model.
*/
void
Ngram::mixProbs(Ngram &lm2, double lambda)
{
makeArray(VocabIndex, context, order + 1);
/*
* In destructive merging we need to process the longer ngrams first
* so that we can still use the model being modified to compute its own
* original probability estimates.
*/
for (int i = order - 1; i >= 0 ; i--) {
BOnode *node;
NgramBOsIter iter1(*this, context, i);
/*
* First, find all explicit ngram probs in *this, and mix them
* with the corresponding probs of lm2 (explicit or backed-off).
*/
while (node = iter1.next()) {
NgramProbsIter piter(*node);
VocabIndex word;
LogP *prob1;
while (prob1 = piter.next(word)) {
*prob1 =
MixLogP(*prob1, lm2.wordProbBO(word, context, i), lambda);
}
}
/*
* Do the same for lm2, except we don't need to recompute
* those cases that were already handled above (explicit probs
* in both *this and lm2).
*/
NgramBOsIter iter2(lm2, context, i);
while (node = iter2.next()) {
NgramProbsIter piter(*node);
VocabIndex word;
LogP *prob2;
while (prob2 = piter.next(word)) {
if (!findProb(word, context)) {
LogP mixProb =
MixLogP(wordProbBO(word, context, i), *prob2, lambda);
*insertProb(word, context) = mixProb;
}
}
}
}
recomputeBOWs();
}
void
Ngram::mixProbs(Ngram &lm1, Ngram &lm2, double lambda)
{
makeArray(VocabIndex, context, order + 1);
for (unsigned i = 0; i < order; i++) {
BOnode *node;
NgramBOsIter iter1(lm1, context, i);
/*
* First, find all explicit ngram probs in lm1, and mix them
* with the corresponding probs of lm2 (explicit or backed-off).
*/
while (node = iter1.next()) {
NgramProbsIter piter(*node);
VocabIndex word;
LogP *prob1;
while (prob1 = piter.next(word)) {
*insertProb(word, context) =
MixLogP(*prob1, lm2.wordProbBO(word, context, i), lambda);
}
}
/*
* Do the same for lm2, except we don't need to recompute
* those cases that were already handled above (explicit probs
* in both lm1 and lm2).
*/
NgramBOsIter iter2(lm2, context, i);
while (node = iter2.next()) {
NgramProbsIter piter(*node);
VocabIndex word;
LogP *prob2;
while (prob2 = piter.next(word)) {
if (!lm1.findProb(word, context)) {
*insertProb(word, context) =
MixLogP(lm1.wordProbBO(word, context, i), *prob2, lambda);
}
}
}
}
recomputeBOWs();
}
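/*
 * A minimal sketch (hypothetical helper, not the SRILM MixLogP) of the
 * mixing step used in both mixProbs() versions above: the component
 * probabilities are combined linearly, with lambda weighting the first
 * model, and the result converted back to a base-10 log prob.
 * Assumes <math.h> is available.
 */
static inline double
mixLogProbSketch(double logp1, double logp2, double lambda)
{
    double p1 = pow(10.0, logp1);	/* LogPtoProb */
    double p2 = pow(10.0, logp2);
    return log10(lambda * p1 + (1.0 - lambda) * p2);	/* ProbToLogP */
}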
/*
* Compute the numerator and denominator of a backoff weight,
* checking for sanity. Returns true if values make sense,
* and prints a warning if not.
*/
Boolean
Ngram::computeBOW(BOnode *node, const VocabIndex *context, unsigned clen,
Prob &numerator, Prob &denominator)
{
NgramProbsIter piter(*node);
VocabIndex word;
LogP *prob;
/*
* The BOW(c) for a context c is computed to be
*
* BOW(c) = (1 - Sum p(x | c)) / (1 - Sum p_BO(x | c))
*
* where Sum is a summation over all words x with explicit probabilities
* in context c, p(x|c) is that probability, and p_BO(x|c) is the
* probability for that word according to the backoff algorithm.
*/
numerator = 1.0;
denominator = 1.0;
while (prob = piter.next(word)) {
numerator -= LogPtoProb(*prob);
if (clen > 0) {
denominator -=
LogPtoProb(wordProbBO(word, context, clen - 1));
}
}
/*
* Avoid some predictable anomalies due to rounding errors
*/
if (numerator < 0.0 && numerator > -Prob_Epsilon) {
numerator = 0.0;
}
if (denominator < 0.0 && denominator > -Prob_Epsilon) {
denominator = 0.0;
}
if (denominator == 0.0 && numerator > Prob_Epsilon) {
/*
* Backoff distribution has no probability left. To avoid wasting
* probability mass, scale the N-gram probabilities to sum to 1.
*/
cerr << "BOW denominator for context \""
<< (vocab.use(), context)
<< "\" is zero; scaling probabilities to sum to 1\n";
LogP scale = - ProbToLogP(1 - numerator);
piter.init();
while (prob = piter.next(word)) {
*prob += scale;
}
numerator = 0.0;
return true;
} else if (numerator < 0.0) {
cerr << "BOW numerator for context \""
<< (vocab.use(), context)
<< "\" is " << numerator << " < 0\n";
return false;
} else if (denominator <= 0.0) {
if (numerator > Prob_Epsilon) {
cerr << "BOW denominator for context \""
<< (vocab.use(), context)
<< "\" is " << denominator << " <= 0, "
<< "numerator is " << numerator
<< endl;
return false;
} else {
numerator = 0.0;
denominator = 0.0; /* will give bow = 0 */
return true;
}
} else {
return true;
}
}
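/*
 * Worked example of the formula above (numbers made up): if the explicit
 * probabilities in context c sum to 0.9 and the corresponding backed-off
 * probabilities sum to 0.6, then
 *
 *	BOW(c) = (1 - 0.9) / (1 - 0.6) = 0.25
 *
 * i.e., the leftover 0.1 of probability mass is spread over the 0.4 of
 * lower-order mass not already claimed by explicit ngrams.
 */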
/*
* Recompute backoff weight for all contexts of a given order
*/
Boolean
Ngram::computeBOWs(unsigned order)
{
Boolean result = true;
/*
* Note that this will only generate backoff nodes for those
* contexts that have words with explicit probabilities. But
* this is precisely as it should be.
*/
BOnode *node;
makeArray(VocabIndex, context, order + 1);
NgramBOsIter iter1(*this, context, order);
while (node = iter1.next()) {
NgramProbsIter piter(*node);