/*
* LatticeNgrams.cc --
* Expected N-gram count computation on lattices
*
*/
#ifndef lint
static char Copyright[] = "Copyright (c) 2005-2006 SRI International. All Rights Reserved.";
static char RcsId[] = "@(#)$Header: /home/srilm/devel/lattice/src/RCS/LatticeNgrams.cc,v 1.7 2006/01/06 05:35:13 stolcke Exp $";
#endif
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <assert.h>
#include "Lattice.h"
#include "LHash.cc"
#include "Map2.cc"
#include "Array.cc"
#ifdef INSTANTIATE_TEMPLATES
// if USE_SHORT_VOCAB is not defined, the instantiation in VocabMultiMap.cc is reused
#ifdef USE_SHORT_VOCAB
INSTANTIATE_MAP2(NodeIndex, VocabContext, LogP2);
#endif
#endif
#define DebugPrintFatalMessages 0
#define DebugPrintFunctionality 1
// for the major functionality selected by program options
#define DebugPrintOutLoop 2
// for the outer loops of large functions, or for small functions
#define DebugPrintInnerLoop 3
// for the inner loops of large functions, or the outer loops of small functions
/*
* Compute N-gram count expectations over a lattice.
* Returns expected unigram count sum.
*
* Algorithm:
* 1. compute backward probabilities
* 2. compute forward probabilities while dynamically expanding
* lattice to reflect (N-1)-gram context nodes
* (the forward probabilities for each expanded node are stored,
* but not the expanded lattice itself).
*
* The main data structure is a Map2 of the form
* (original node, Ngram-context) -> forward probability
* The forward probs are combined with transition and backward probs to
* compute expected counts, which are accumulated in a global N-gram trie.
*
* Nodes with NULL or pause are handled by ignoring them when in the final
* position of the N-gram, and skipping them when computing N-gram contexts.
*/
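/*
 * In equation form (notation added here for clarity: alpha = forward log
 * probability, beta = backward log probability, w(u->v) = transition weight,
 * S = posteriorScale), the code below effectively computes
 *
 * alpha(v, c') = logsum over (u, c) of [ alpha(u, c) + w(u->v)/S ]
 *
 * where c' is c extended by word(v) and truncated to the last (order-1)
 * words, and, for the extended n-gram and each of its suffixes ending in
 * word(v), accumulates
 *
 * count += exp( alpha(u, c) + w(u->v)/S + beta(v) - beta(initial) )
 *
 * i.e., the posterior probability of traversing the transition u -> v
 * with left context c.
 */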
FloatCount
Lattice::countNgramsAtNode(VocabIndex oldIndex, unsigned order,
NgramCounts<FloatCount> &counts,
LogP2 backwardProbs[], double posteriorScale,
Map2<NodeIndex, VocabContext, LogP2> &forwardProbMap)
{
LatticeNode *oldNode = findNode(oldIndex);
assert(oldNode != 0);
Map2Iter2<NodeIndex, VocabContext, LogP2>
expandIter(forwardProbMap, oldIndex);
Prob countSum = 0.0;
LogP2 *forwardProb;
VocabContext context;
while ((forwardProb = expandIter.next(context))) {
unsigned contextLength = Vocab::length(context);
makeArray(VocabIndex, extendedContext, contextLength + 2);
Vocab::copy(extendedContext, context);
TRANSITER_T<NodeIndex,LatticeTransition>
transIter(oldNode->outTransitions);
NodeIndex nextIndex;
while (LatticeTransition *oldTrans = transIter.next(nextIndex)) {
LatticeNode *nextNode = findNode(nextIndex);
assert(nextNode != 0);
VocabIndex word = nextNode->word;
LogP transProb = oldTrans->weight/posteriorScale;
LogP2 backwardProb = backwardProbs[nextIndex];
LogP posterior = *forwardProb + transProb + backwardProb;
// normalize by posterior of whole lattice
posterior -= backwardProbs[initial];
// fix some obvious numerical problems
if (posterior > LogP_One) posterior = LogP_One;
// determine ngram context for successor nodes
const VocabIndex *newContext;
if (ignoreWord(word)) {
// context is unchanged
newContext = context;
} else {
// construct context extended by one word
extendedContext[contextLength] = word;
extendedContext[contextLength+1] = Vocab_None;
// decide if extended context needs to be shortened
if (contextLength < order - 1) {
newContext = extendedContext;
} else {
newContext = extendedContext + 1;
}
}
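// (illustration: with order = 3 and context "<s> a", a successor
// word "b" gives extended context "<s> a b", which is shortened
// to "a b" before being used as the successor's ngram context)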
// accumulate forward probability for successor context
Boolean found;
LogP2 *nextProb =
forwardProbMap.insert(nextIndex, newContext, found);
if (!found) {
*nextProb = *forwardProb + transProb;
} else {
*nextProb = AddLogP(*nextProb, *forwardProb + transProb);
}
// accumulate ngram counts, but only if the successor word
// is not one to be ignored.
// (note we can reuse the extended context buffer from above)
if (!ignoreWord(word)) {
FloatCount count = LogPtoProb(posterior);
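// the loop below adds the same posterior mass to the full n-gram
// and to every shorter suffix ending in the new word, down to the
// unigram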
for (VocabIndex *ngram = extendedContext;
ngram[0] != Vocab_None;
ngram ++)
{
*counts.insertCount(ngram) += count;
}
countSum += count;
}
}
}
// save space by deleting entries that are no longer used
forwardProbMap.remove(oldIndex);
return countSum;
}
FloatCount
Lattice::countNgrams(unsigned order, NgramCounts<FloatCount> &counts,
double posteriorScale)
{
LogP2 *forwardProbs = new LogP2[maxIndex];
LogP2 *backwardProbs = new LogP2[maxIndex];
assert(forwardProbs != 0 && backwardProbs != 0);
computeForwardBackward(forwardProbs, backwardProbs, posteriorScale);
delete [] forwardProbs; // not needed: only the backward probabilities are used below
if (debug(DebugPrintFunctionality)) {
dout() << "Lattice::countNgrams: "
<< "computing ngram counts\n";
}
unsigned numNodes = getNumNodes();
NodeIndex *sortedNodes = new NodeIndex[numNodes];
assert(sortedNodes != 0);
unsigned numReachable = sortNodes(sortedNodes);
if (numReachable != numNodes) {
if (debug(DebugPrintOutLoop)) {
dout() << "Lattice::countNgrams: warning: called with unreachable nodes\n";
}
}
Map2<NodeIndex, VocabContext, LogP2> forwardProbMap;
FloatCount countSum = 0.0;
// prime forward probability map with initial node
VocabIndex context[2];
context[0] = vocab.ssIndex();
context[1] = Vocab_None;
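// the initial <s> token is traversed by every path, so its unigram
// count is exactly 1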
*counts.insertCount(context) += 1.0;
if (order > 1) {
*forwardProbMap.insert(initial, context) = LogP_One;
} else {
// for unigram counts, all nodes are mapped to the empty context
*forwardProbMap.insert(initial, &context[1]) = LogP_One;
}
for (unsigned i = 0; i < numReachable; i++) {
countSum +=
countNgramsAtNode(sortedNodes[i], order, counts,
backwardProbs, posteriorScale, forwardProbMap);
}
delete [] backwardProbs;
return countSum;
}
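/*
 * A minimal usage sketch (hypothetical driver code, not part of this file;
 * the Vocab/Lattice/File constructor and reader calls shown are assumptions
 * about the surrounding SRILM headers):
 *
 *	Vocab vocab;
 *	Lattice lat(vocab);
 *	File latFile("example.lat", "r");
 *	lat.readPFSG(latFile);				// read a PFSG-format lattice
 *
 *	NgramCounts<FloatCount> counts(vocab, 3);	// trigram count trie
 *	FloatCount total = lat.countNgrams(3, counts, 8.0);
 *							// posteriorScale = 8.0
 *	File out(stdout);
 *	counts.write(out, 0);				// dump all expected counts
 */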