/* ngramlm.cc */
double numerator, denominator;
if (computeBOW(node, context, order, numerator, denominator)) {
/*
* If unigram probs leave a non-zero probability mass
* then we should give that mass to the zero-order (uniform)
* distribution for zeroton words. However, the ARPA
* BO format doesn't support a "null context" BOW.
* We simulate the intended distribution by spreading the
* left-over mass uniformly over all vocabulary items that
* have a zero probability.
* NOTE: We used to do this only if there was prob mass left,
* but some ngram software requires all words to appear as
* unigrams, which we achieve by giving them zero probability.
*/
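/*
 * The last branch below applies the standard backoff weight formula,
 * where h is the current context and h' is h shortened by one word:
 *
 *	BOW(h) = (1 - sum_w p(w|h)) / (1 - sum_w p(w|h'))
 *
 * with both sums running over the words w that have explicit probabilities
 * in context h, i.e., log BOW(h) = log(numerator) - log(denominator).
 */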
if (order == 0 /*&& numerator > 0.0*/) {
distributeProb(numerator, context);
} else if (numerator == 0.0 && denominator == 0) {
node->bow = LogP_One;
} else {
node->bow = ProbToLogP(numerator) - ProbToLogP(denominator);
}
} else {
/*
* Dummy value for improper models
*/
node->bow = LogP_Zero;
result = false;
}
}
return result;
}
/*
* Renormalize language model by recomputing backoff weights.
*/
void
Ngram::recomputeBOWs()
{
/*
* Here it is important that we compute the backoff weights in
* increasing order, since the higher-order ones refer to the
* lower-order ones in the backoff algorithm.
* Note that this will only generate backoff nodes for those
* contexts that have words with explicit probabilities. But
* this is precisely as it should be.
*/
for (unsigned i = 0; i < order; i++) {
computeBOWs(i);
}
}
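/*
 * A minimal usage sketch of the renormalization entry point, assuming the
 * standard SRILM Vocab/File interfaces; model order and file names are
 * illustrative only:
 *
 *	Vocab vocab;
 *	Ngram lm(vocab, 3);
 *
 *	File in("input.lm", "r");
 *	lm.read(in);
 *
 *	// ... modify explicit ngram probabilities in place ...
 *
 *	lm.recomputeBOWs();		// refresh all backoff weights
 *
 *	File out("renormalized.lm", "w");
 *	lm.write(out);
 */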
/*
* Prune probabilities from model so that the change in training set perplexity
* is below a threshold.
* The minorder parameter limits pruning to ngrams of that length and above.
*/
void
Ngram::pruneProbs(double threshold, unsigned minorder)
{
/*
* Hack alert: allocate the context buffer for NgramBOsIter, but leave
* room for a word id to be prepended.
*/
makeArray(VocabIndex, wordPlusContext, order + 2);
VocabIndex *context = &wordPlusContext[1];
for (unsigned i = order - 1; i > 0 && i >= minorder - 1; i--) {
unsigned prunedNgrams = 0;
BOnode *node;
NgramBOsIter iter1(*this, context, i);
while (node = iter1.next()) {
LogP bow = node->bow; /* old backoff weight, BOW(h) */
double numerator, denominator;
/*
* Compute numerator and denominator of the backoff weight,
* so that we can quickly compute the BOW adjustment due to
* leaving out one prob.
*/
if (!computeBOW(node, context, i, numerator, denominator)) {
continue;
}
/*
* Compute the marginal probability of the context, P(h)
*/
LogP cProb = contextProb(context, i);
NgramProbsIter piter(*node);
VocabIndex word;
LogP *ngramProb;
Boolean allPruned = true;
while (ngramProb = piter.next(word)) {
/*
* lower-order estimate for ngramProb, P(w|h')
*/
LogP backoffProb = wordProbBO(word, context, i - 1);
/*
* Compute BOW after removing ngram, BOW'(h)
*/
LogP newBOW =
ProbToLogP(numerator + LogPtoProb(*ngramProb)) -
ProbToLogP(denominator + LogPtoProb(backoffProb));
/*
* Compute change in entropy due to removal of ngram
* deltaH = - P(H) x
* {P(W | H) [log P(w|h') + log BOW'(h) - log P(w|h)] +
* (1 - \sum_{v,h ngrams} P(v|h)) [log BOW'(h) - log BOW(h)]}
*
* (1-\sum_{v,h ngrams}) is the probability mass left over from
* ngrams of the current order, and is the same as the
* numerator in BOW(h).
*/
LogP deltaProb = backoffProb + newBOW - *ngramProb;
Prob deltaEntropy = - LogPtoProb(cProb) *
(LogPtoProb(*ngramProb) * deltaProb +
numerator * (newBOW - bow));
/*
* compute relative change in model (training set) perplexity
* (PPL' - PPL)/PPL = PPL'/PPL - 1
* = exp(H')/exp(H) - 1
* = exp(H' - H) - 1
*/
double perpChange = LogPtoProb(deltaEntropy) - 1.0;
Boolean pruned = threshold > 0 && perpChange < threshold;
/*
* Make sure we don't prune ngrams whose backoff nodes are
* needed ...
*/
if (pruned) {
/*
* wordPlusContext[1 ... i] was already filled in by
* NgramBOsIter. Just add the word to complete the ngram.
*/
wordPlusContext[0] = word;
if (findBOW(wordPlusContext)) {
pruned = false;
}
}
if (debug(DEBUG_ESTIMATES)) {
dout() << "CONTEXT " << (vocab.use(), context)
<< " WORD " << vocab.getWord(word)
<< " CONTEXTPROB " << cProb
<< " OLDPROB " << *ngramProb
<< " NEWPROB " << (backoffProb + newBOW)
<< " DELTA-H " << deltaEntropy
<< " DELTA-LOGP " << deltaProb
<< " PPL-CHANGE " << perpChange
<< " PRUNED " << pruned
<< endl;
}
if (pruned) {
removeProb(word, context);
prunedNgrams ++;
} else {
allPruned = false;
}
}
/*
* If we removed all ngrams for this context we can
* remove the context itself.
* Note: Due to the check above this also means there
* are no contexts that extend the current one, so
* removing this node won't leave any children orphaned.
*/
if (allPruned) {
removeBOW(context);
}
}
if (debug(DEBUG_ESTIMATE_WARNINGS)) {
if (prunedNgrams > 0) {
dout() << "pruned " << prunedNgrams << " "
<< (i + 1) << "-grams\n";
}
}
}
recomputeBOWs();
}
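/*
 * Usage sketch for the entropy-based pruning above, continuing the client
 * code sketched after recomputeBOWs(); the threshold and minimum order are
 * illustrative.  An ngram is removed when dropping it changes training-set
 * perplexity by less than the given relative threshold.
 *
 *	lm.pruneProbs(1e-8, 2);		// prune bigrams and longer ngrams
 *
 *	File pruned("pruned.lm", "w");
 *	lm.write(pruned);
 */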
/*
* Prune low probability N-grams
* Removes all N-gram probabilities that are lower than the
* corresponding backed-off estimates. This is a crucial preprocessing
* step for N-gram models before conversion to finite-state networks.
*/
void
Ngram::pruneLowProbs(unsigned minorder)
{
makeArray(VocabIndex, context, order);
Boolean havePruned;
/*
* Since pruning changes backoff weights we have to keep pruning
* iteratively as long as N-grams keep being removed.
*/
do {
havePruned = false;
for (unsigned i = minorder - 1; i < order; i++) {
unsigned prunedNgrams = 0;
BOnode *node;
NgramBOsIter iter1(*this, context, i);
while (node = iter1.next()) {
LogP bow = node->bow; /* old backoff weight, BOW(h) */
NgramProbsIter piter(*node);
VocabIndex word;
LogP *ngramProb;
while (ngramProb = piter.next(word)) {
/*
* lower-order estimate for ngramProb, P(w|h')
*/
LogP backoffProb = wordProbBO(word, context, i - 1);
if (backoffProb + bow > *ngramProb) {
if (debug(DEBUG_ESTIMATES)) {
dout() << "CONTEXT " << (vocab.use(), context)
<< " WORD " << vocab.getWord(word)
<< " LPROB " << *ngramProb
<< " BACKOFF-LPROB " << (backoffProb + bow)
<< " PRUNED\n";
}
removeProb(word, context);
prunedNgrams ++;
}
}
}
if (prunedNgrams > 0) {
havePruned = true;
if (debug(DEBUG_ESTIMATE_WARNINGS)) {
dout() << "pruned " << prunedNgrams << " "
<< (i + 1) << "-grams\n";
}
}
}
recomputeBOWs();
} while (havePruned);
fixupProbs();
}
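/*
 * Usage sketch with an illustrative minimum order: drop all bigram and
 * higher-order probabilities that fall below their backed-off estimates,
 * e.g. before converting the model to a finite-state network.
 *
 *	lm.pruneLowProbs(2);
 */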
/*
* Reassign ngram probabilities using a different LM
*/
void
Ngram::rescoreProbs(LM &lm)
{
makeArray(VocabIndex, context, order + 1);
for (int i = order - 1; i >= 0 ; i--) {
BOnode *node;
NgramBOsIter iter(*this, context, i);
/*
* Enumerate all explicit ngram probs in *this, and recompute their
* probs using the supplied LM.
*/
while (node = iter.next()) {
NgramProbsIter piter(*node);
VocabIndex word;
LogP *prob;
while (prob = piter.next(word)) {
*prob = lm.wordProb(word, context);
}
}
}
recomputeBOWs();
}
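/*
 * Usage sketch: reassign every explicit probability from another model
 * (any LM subclass; "otherLM" is an illustrative name), then renormalize
 * the backoff weights.
 *
 *	lm.rescoreProbs(otherLM);
 */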
/*
* Insert probabilities for all context ngrams
*
* The ARPA format forces us to have a probability
* for every context ngram. So we create one if necessary and
* set the probability to the same as what the backoff algorithm
* would compute (so as not to change the distribution).
*/
void
Ngram::fixupProbs()
{
makeArray(VocabIndex, context, order + 1);
/*
* we cannot insert entries into the context trie while we're
* iterating over it, so we need another trie to collect
* the affected contexts first. yuck.
* Note: We use the same trie type as in the Ngram model itself,
* so we don't need to worry about another template instantiation.
*/
Trie<VocabIndex,NgramCount> contextsToAdd;
/*
* Find the contexts xyz that don't have a probability p(z|xy) in the
* model already.
*/
unsigned i;
for (i = 1; i < order; i++) {
NgramBOsIter iter(*this, context, i);
while (iter.next()) {
/*
* For a context abcd we need to create probability entries
* p(d|abc), p(c|ab), p(b|a), p(a) if necessary.
* If any of these is found in that order we can stop since a
* previous pass will already have created the remaining ones.
*/
for (unsigned j = 0; j < i; j++) {
LogP *prob = findProb(context[j], &context[j+1]);
if (prob) {
break;
} else {
/*
* Just store something non-zero to distinguish from
* internal nodes that are added implicitly.
*/
*contextsToAdd.insert(&context[j]) = 1;
}
}
}
}
/*
* Store the missing probs
*/
for (i = 1; i < order; i++) {
unsigned numFakes = 0;
Trie<VocabIndex,NgramCount> *node;
TrieIter2<VocabIndex,NgramCount> iter(contextsToAdd, context, i);
while (node = iter.next()) {
if (node->value()) {
numFakes ++;
/*
* Note: we cannot combine the two statements below
* since insertProb() creates a zero prob entry, which would
* prevent wordProbBO() from computing the backed-off
* estimate!
*/
LogP backoffProb = wordProbBO(context[0], &context[1], i - 1);
*insertProb(context[0], &(context[1])) = backoffProb;
if (debug(DEBUG_FIXUP_WARNINGS)) {
dout() << "faking probability for context "
<< (vocab.use(), context) << endl;
}
}
}
if (debug(DEBUG_ESTIMATE_WARNINGS)) {
if (numFakes > 0) {
dout() << "inserted " << numFakes << " redundant "
<< i << "-gram probs\n";
}
}
}
}
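/*
 * Illustration of the fixup above: in a trigram model that stores some
 * p(w | a b), the bigram "a b" acts as a backoff context, so the ARPA file
 * must also contain an explicit bigram entry p(b | a).  If the estimator
 * never created one, fixupProbs() inserts p(b | a) = p_BO(b | a), the
 * backed-off value, which leaves the modeled distribution unchanged.
 */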
/*
* Redistribute probability mass over ngrams of given context
*/
void
Ngram::distributeProb(Prob mass, VocabIndex *context)
{
/*
* First enumerate the vocabulary to count the number of
* items affected
*/
unsigned numWords = 0;
unsigned numZeroProbs = 0;
VocabIter viter(vocab);
VocabIndex word;
while (viter.next(word)) {
if (!vocab.isNonEvent(word) && !vocab.isMetaTag(word)) {
numWords ++;
LogP *prob = findProb(word, context);
if (!prob || *prob == LogP_Zero) {
numZeroProbs ++;
}
/*
* create zero probs so we can update them below
*/
if (!prob) {
*insertProb(word, context) = LogP_Zero;
}
}
}
/*
* If there are no zero-probability words
* then we add the left-over prob mass to all unigrams.
* Otherwise spread it uniformly over the zero-probability (zeroton) words only.
*/
viter.init();
if (numZeroProbs > 0) {
if (debug(DEBUG_ESTIMATE_WARNINGS)) {
cerr << "warning: distributing " << mass
<< " left-over probability mass over "
<< numZeroProbs << " zeroton words" << endl;
}
Prob add = mass / numZeroProbs;
while (viter.next(word)) {
if (!vocab.isNonEvent(word) && !vocab.isMetaTag(word)) {
LogP *prob = insertProb(word, context);
if (*prob == LogP_Zero) {
*prob = ProbToLogP(add);
}
}
}
} else {
if (mass > 0.0 && debug(DEBUG_ESTIMATE_WARNINGS)) {
cerr << "warning: distributing " << mass
<< " left-over probability mass over all "
<< numWords << " words" << endl;
}
Prob add = mass / numWords;
while (viter.next(word)) {
if (!vocab.isNonEvent(word) && !vocab.isMetaTag(word)) {
LogP *prob = insertProb(word, context);
*prob = ProbToLogP(LogPtoProb(*prob) + add);
}
}
}
}
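/*
 * Worked example with illustrative numbers: if the explicit unigrams sum to
 * 0.98, the left-over mass is 0.02.  With 4 zeroton words in the vocabulary,
 * each zeroton receives p = 0.02 / 4 = 0.005; if there are no zerotons at
 * all, 0.02 / numWords is instead added to every word's existing probability.
 */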