/* ngramlm.cc */
CountType totalCount = 0;
Count observedVocab = 0, min2Vocab = 0, min3Vocab = 0;
while (ngramCount = followIter.next()) {
if (vocab.isNonEvent(word[0]) ||
ngramCount == 0 ||
(i == 1 && vocab.isMetaTag(word[0])))
{
continue;
}
if (!vocab.isMetaTag(word[0])) {
totalCount += *ngramCount;
observedVocab ++;
if (*ngramCount >= 2) {
min2Vocab ++;
}
if (*ngramCount >= 3) {
min3Vocab ++;
}
} else {
/*
* Process meta-counts
*/
unsigned type = vocab.typeOfMetaTag(word[0]);
if (type == 0) {
/*
* a count total: just add to the totalCount
* the corresponding type count can't be known,
* but it has to be at least 1
*/
totalCount += *ngramCount;
observedVocab ++;
} else {
/*
* a count-of-count: increment the word type counts,
* and infer the totalCount
*/
totalCount += type * *ngramCount;
observedVocab += (Count)*ngramCount;
if (type >= 2) {
min2Vocab += (Count)*ngramCount;
}
if (type >= 3) {
min3Vocab += (Count)*ngramCount;
}
}
}
}
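/*
 * For illustration (numbers made up): a count-of-count meta-tag of
 * type 2 with a stored value of 5 means "5 distinct word types were
 * each seen about twice", so the loop above adds 2*5 = 10 to
 * totalCount, 5 to observedVocab, and 5 to min2Vocab.
 */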
if (i > 1 && trustTotals()) {
totalCount = *contextCount;
}
if (totalCount == 0) {
continue;
}
/*
* reverse the context ngram since that's how
* the BO nodes are indexed.
*/
Vocab::reverse(context);
/*
* Compute the discounted probabilities
* from the counts and store them in the backoff model.
*/
retry:
followIter.init();
Prob totalProb = 0.0;
while (ngramCount = followIter.next()) {
LogP lprob;
double discount;
/*
* Ignore zero counts.
* They are there just as an artifact of the count trie
* if a higher order ngram has a non-zero count.
*/
if (i > 1 && *ngramCount == 0) {
continue;
}
if (vocab.isNonEvent(word[0]) || vocab.isMetaTag(word[0])) {
/*
* Discard all pseudo-word probabilities,
* except for unigrams. For unigrams, assign
* probability zero. This will leave them with
* prob zero in all cases, due to the backoff
* algorithm.
* Also discard the <unk> token entirely in closed
* vocab models; its presence would prevent OOV
* detection when the model is read back in.
*/
if (i > 1 ||
word[0] == vocab.unkIndex() ||
vocab.isMetaTag(word[0]))
{
noneventNgrams ++;
continue;
}
lprob = LogP_Zero;
discount = 1.0;
} else {
/*
* The discount array passed may contain 0 elements
* to indicate no discounting at this order.
*/
if (noDiscount) {
discount = 1.0;
} else {
discount =
discounts[i-1]->discount(*ngramCount, totalCount,
observedVocab);
}
Prob prob = (discount * *ngramCount) / totalCount;
/*
* For interpolated estimates we compute the weighted
* linear combination of the high-order estimate
* (computed above) and the lower-order estimate.
* The high-order weight is given by the discount factor,
* the lower-order weight is obtained from the Discount
* method (it may be 0 if the method doesn't support
* interpolation).
*/
double lowerOrderWeight;
LogP lowerOrderProb;
if (interpolate) {
lowerOrderWeight =
discounts[i-1]->lowerOrderWeight(totalCount,
observedVocab,
min2Vocab,
min3Vocab);
if (i > 1) {
lowerOrderProb = wordProbBO(word[0], context, i-2);
} else {
lowerOrderProb = (LogP)(- log10((double)vocabSize));
}
prob += (Prob)(lowerOrderWeight * LogPtoProb(lowerOrderProb));
}
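/*
 * At this point prob holds
 *
 *	discount * c(w, context) / totalCount
 *	    + lowerOrderWeight * p_lower(w | shortened context)
 *
 * where the second term is present only when interpolating.
 */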
lprob = ProbToLogP(prob);
if (discount != 0.0) {
totalProb += prob;
}
if (discount != 0.0 && debug(DEBUG_ESTIMATES)) {
dout() << "CONTEXT " << (vocab.use(), context)
<< " WORD " << vocab.getWord(word[0])
<< " NUMER " << *ngramCount
<< " DENOM " << totalCount
<< " DISCOUNT " << discount;
if (interpolate) {
dout() << " LOW " << lowerOrderWeight
<< " LOLPROB " << lowerOrderProb;
}
dout() << " LPROB " << lprob << endl;
}
}
/*
* A discount coefficient of zero indicates this ngram
* should be omitted entirely (presumably to save space).
*/
if (discount == 0.0) {
discountedNgrams ++;
removeProb(word[0], context);
} else {
*insertProb(word[0], context) = lprob;
}
}
/*
* This is a hack credited to Doug Paul (by Roni Rosenfeld in
* his CMU tools). It may happen that no probability mass
* is left after totalling all the explicit probs, typically
* because the discount coefficients were out of range and
* forced to 1.0. To arrive at some non-zero backoff mass
* we try incrementing the denominator in the estimator by 1.
*/
if (!noDiscount && totalCount > 0 &&
totalProb > 1.0 - Prob_Epsilon)
{
totalCount += 1;
if (debug(DEBUG_ESTIMATE_WARNINGS)) {
cerr << "warning: " << (1.0 - totalProb)
<< " backoff probability mass left for \""
<< (vocab.use(), context)
<< "\" -- incrementing denominator"
<< endl;
}
goto retry;
}
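/*
 * Example: if the discounted probs summed to exactly 1.0 with
 * denominator N, re-estimating with denominator N+1 shrinks each
 * prob by a factor of N/(N+1), leaving roughly 1/(N+1) of probability
 * mass for the backoff distribution.
 */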
/*
* Undo the reversal above so the iterator can continue correctly
*/
Vocab::reverse(context);
}
if (debug(DEBUG_ESTIMATE_WARNINGS)) {
if (noneventContexts > 0) {
dout() << "discarded " << noneventContexts << " "
<< i << "-gram contexts containing pseudo-events\n";
}
if (noneventNgrams > 0) {
dout() << "discarded " << noneventNgrams << " "
<< i << "-gram probs predicting pseudo-events\n";
}
if (discountedNgrams > 0) {
dout() << "discarded " << discountedNgrams << " "
<< i << "-gram probs discounted to zero\n";
}
}
/*
* With all the probs in place, BOWs are obtained simply by the usual
* normalization.
* We do this right away before computing probs of higher order since
* the estimation of higher-order N-grams can refer to lower-order
* ones (e.g., for interpolated estimates).
*/
computeBOWs(i-1);
}
fixupProbs();
return true;
}
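/*
 * A minimal sketch (not part of SRILM) of the per-ngram estimate computed
 * in estimate2() above, reduced to plain doubles.  The function name and
 * parameter names are hypothetical; they only illustrate the arithmetic.
 */
static inline double
estimateOneProbSketch(double ngramCount, double totalCount,
                      double discount,
                      double lowerOrderWeight, double lowerOrderProb)
{
    /* high-order relative frequency, scaled by the discount factor */
    double prob = (discount * ngramCount) / totalCount;
    /* optional interpolation with the linear-domain lower-order estimate */
    prob += lowerOrderWeight * lowerOrderProb;
    return prob;
}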
Boolean
Ngram::estimate(NgramStats &stats, Discount **discounts)
{
return estimate2(stats, discounts);
}
Boolean
Ngram::estimate(NgramCounts<FloatCount> &stats, Discount **discounts)
{
return estimate2(stats, discounts);
}
/*
* Mixing backoff models
* the first version mixes an existing model destructively with
* a second one
* the second version mixes two existing models non-destructively leaving
* the result in *this.
* lambda is the weight of the first input model.
*/
void
Ngram::mixProbs(Ngram &lm2, double lambda)
{
makeArray(VocabIndex, context, order + 1);
/*
* In destructive merging we need to process the longer ngrams first
* so that we can still use the model being modified to compute its own
* original probability estimates.
*/
for (int i = order - 1; i >= 0 ; i--) {
BOnode *node;
NgramBOsIter iter1(*this, context, i);
/*
* First, find all explicit ngram probs in *this, and mix them
* with the corresponding probs of lm2 (explicit or backed-off).
*/
while (node = iter1.next()) {
NgramProbsIter piter(*node);
VocabIndex word;
LogP *prob1;
while (prob1 = piter.next(word)) {
*prob1 =
MixLogP(*prob1, lm2.wordProbBO(word, context, i), lambda);
}
}
/*
* Do the same for lm2, except we don't need to recompute
* those cases that were already handled above (explicit probs
* in both *this and lm2).
*/
NgramBOsIter iter2(lm2, context, i);
while (node = iter2.next()) {
NgramProbsIter piter(*node);
VocabIndex word;
LogP *prob2;
while (prob2 = piter.next(word)) {
if (!findProb(word, context)) {
LogP mixProb =
MixLogP(wordProbBO(word, context, i), *prob2, lambda);
*insertProb(word, context) = mixProb;
}
}
}
}
recomputeBOWs();
}
void
Ngram::mixProbs(Ngram &lm1, Ngram &lm2, double lambda)
{
makeArray(VocabIndex, context, order + 1);
for (unsigned i = 0; i < order; i++) {
BOnode *node;
NgramBOsIter iter1(lm1, context, i);
/*
* First, find all explicit ngram probs in lm1, and mix them
* with the corresponding probs of lm2 (explicit or backed-off).
*/
while (node = iter1.next()) {
NgramProbsIter piter(*node);
VocabIndex word;
LogP *prob1;
while (prob1 = piter.next(word)) {
*insertProb(word, context) =
MixLogP(*prob1, lm2.wordProbBO(word, context, i), lambda);
}
}
/*
* Do the same for lm2, except we don't need to recompute
* those cases that were already handled above (explicit probs
* in both lm1 and lm2).
*/
NgramBOsIter iter2(lm2, context, i);
while (node = iter2.next()) {
NgramProbsIter piter(*node);
VocabIndex word;
LogP *prob2;
while (prob2 = piter.next(word)) {
if (!lm1.findProb(word, context)) {
*insertProb(word, context) =
MixLogP(lm1.wordProbBO(word, context, i), *prob2, lambda);
}
}
}
}
recomputeBOWs();
}
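/*
 * A minimal sketch (hypothetical helper, not the SRILM MixLogP) of the
 * mixing step used in both mixProbs() versions above: the component
 * probabilities are combined linearly, with lambda weighting the first
 * model, and the result converted back to a base-10 log prob.
 * Assumes <math.h> is available.
 */
static inline double
mixLogProbSketch(double logp1, double logp2, double lambda)
{
    double p1 = pow(10.0, logp1);	/* LogPtoProb */
    double p2 = pow(10.0, logp2);
    return log10(lambda * p1 + (1.0 - lambda) * p2);	/* ProbToLogP */
}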
/*
* Compute the numerator and denominator of a backoff weight,
* checking for sanity. Returns true if values make sense,
* and prints a warning if not.
*/
Boolean
Ngram::computeBOW(BOnode *node, const VocabIndex *context, unsigned clen,
Prob &numerator, Prob &denominator)
{
NgramProbsIter piter(*node);
VocabIndex word;
LogP *prob;
/*
* The BOW(c) for a context c is computed to be
*
* BOW(c) = (1 - Sum p(x | c)) / (1 - Sum p_BO(x | c))
*
* where Sum is a summation over all words x with explicit probabilities
* in context c, p(x|c) is that probability, and p_BO(x|c) is the
* probability for that word according to the backoff algorithm.
*/
numerator = 1.0;
denominator = 1.0;
while (prob = piter.next(word)) {
numerator -= LogPtoProb(*prob);
if (clen > 0) {
denominator -=
LogPtoProb(wordProbBO(word, context, clen - 1));
}
}
/*
* Avoid some predictable anomalies due to rounding errors
*/
if (numerator < 0.0 && numerator > -Prob_Epsilon) {
numerator = 0.0;
}
if (denominator < 0.0 && denominator > -Prob_Epsilon) {
denominator = 0.0;
}
if (denominator == 0.0 && numerator > Prob_Epsilon) {
/*
* Backoff distribution has no probability left. To avoid wasting
* probability mass, scale the N-gram probabilities to sum to 1.
*/
cerr << "BOW denominator for context \""
<< (vocab.use(), context)
<< "\" is zero; scaling probabilities to sum to 1\n";
LogP scale = - ProbToLogP(1 - numerator);
piter.init();
while (prob = piter.next(word)) {
*prob += scale;
}
numerator = 0.0;
return true;
} else if (numerator < 0.0) {
cerr << "BOW numerator for context \""
<< (vocab.use(), context)
<< "\" is " << numerator << " < 0\n";
return false;
} else if (denominator <= 0.0) {
if (numerator > Prob_Epsilon) {
cerr << "BOW denominator for context \""
<< (vocab.use(), context)
<< "\" is " << denominator << " <= 0, "
<< "numerator is " << numerator
<< endl;
return false;
} else {
numerator = 0.0;
denominator = 0.0; /* will give bow = 0 */
return true;
}
} else {
return true;
}
}
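/*
 * Worked example of the formula above (numbers made up): if the explicit
 * probabilities in context c sum to 0.9 and the corresponding backed-off
 * probabilities sum to 0.6, then
 *
 *	BOW(c) = (1 - 0.9) / (1 - 0.6) = 0.25
 *
 * i.e., the leftover 0.1 of probability mass is spread over the 0.4 of
 * lower-order mass not already claimed by explicit ngrams.
 */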
/*
* Recompute backoff weight for all contexts of a given order
*/
Boolean
Ngram::computeBOWs(unsigned order)
{
Boolean result = true;
/*
* Note that this will only generate backoff nodes for those
* contexts that have words with explicit probabilities. But
* this is precisely as it should be.
*/
BOnode *node;
makeArray(VocabIndex, context, order + 1);
NgramBOsIter iter1(*this, context, order);
while (node = iter1.next()) {
NgramProbsIter piter(*node);