/*
 * hidden-ngram --
 *      Recover hidden word-level events using a hidden n-gram model
 *      (derived from disambig.cc)
 *
 */
#ifndef lint
static char Copyright[] = "Copyright (c) 1995-2006 SRI International. All Rights Reserved.";
static char RcsId[] = "@(#)$Id: hidden-ngram.cc,v 1.45 2006/01/05 20:21:27 stolcke Exp $";
#endif
#include <iostream>
using namespace std;
#include <stdio.h>
#include <stdlib.h>
#include <locale.h>
#include <assert.h>
#include "option.h"
#include "version.h"
#include "File.h"
#include "Vocab.h"
#include "VocabMap.h"
#include "SubVocab.h"
#include "NullLM.h"
#include "Ngram.h"
#include "ClassNgram.h"
#include "SimpleClassNgram.h"
#include "ProductNgram.h"
#include "BayesMix.h"
#include "NgramStats.cc"
#include "Trellis.cc"
#include "LHash.cc"
#include "Array.cc"
typedef const VocabIndex *VocabContext;
#define DEBUG_ZEROPROBS 1
#define DEBUG_TRANSITIONS 2
#define DEBUG_TRELLIS 3
static int version = 0;
static unsigned numNbest = 1;
static unsigned order = 3;
static unsigned debug = 0;
static char *lmFile = 0;
static char *classesFile = 0;
static int simpleClasses = 0;
static int factored = 0;
static char *mixFile = 0;
static char *mixFile2 = 0;
static char *mixFile3 = 0;
static char *mixFile4 = 0;
static char *mixFile5 = 0;
static char *mixFile6 = 0;
static char *mixFile7 = 0;
static char *mixFile8 = 0;
static char *mixFile9 = 0;
static double mixLambda = 0.5;
static double mixLambda2 = 0.0;
static double mixLambda3 = 0.0;
static double mixLambda4 = 0.0;
static double mixLambda5 = 0.0;
static double mixLambda6 = 0.0;
static double mixLambda7 = 0.0;
static double mixLambda8 = 0.0;
static double mixLambda9 = 0.0;
static char *hiddenVocabFile = 0;
static char *textFile = 0;
static char *textMapFile = 0;
static char *escape = 0;
static int keepUnk = 0;
static int keepnull = 1;
static int toLower = 0;
static double lmw = 1.0;
static double mapw = 1.0;
static int logMap = 0;
static int fb = 0;
static int fwOnly = 0;
static int posteriors = 0;
static int totals = 0;
static int continuous = 0;
static int forceEvent = 0;
static char *countsFile = 0;
typedef double NgramFractCount; /* type for posterior counts */
const LogP LogP_PseudoZero = -100;
static double unkProb = LogP_PseudoZero;
const VocabString noHiddenEvent = "*noevent*";
VocabIndex noEventIndex;
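/*
 * "*noevent*" is the default hidden event, meaning that no event occurs at a
 * given word position.  It is excluded from LM lookups and posterior counts
 * below, and can be disallowed altogether with -force-event.
 */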
static Option options[] = {
{ OPT_TRUE, "version", &version, "print version information" },
{ OPT_STRING, "lm", &lmFile, "hidden token sequence model" },
{ OPT_STRING, "classes", &classesFile, "class definitions" },
{ OPT_TRUE, "simple-classes", &simpleClasses, "use unique class model" },
{ OPT_TRUE, "factored", &factored, "use a factored LM" },
{ OPT_STRING, "mix-lm", &mixFile, "LM to mix in" },
{ OPT_FLOAT, "lambda", &mixLambda, "mixture weight for -lm" },
{ OPT_STRING, "mix-lm2", &mixFile2, "second LM to mix in" },
{ OPT_FLOAT, "mix-lambda2", &mixLambda2, "mixture weight for -mix-lm2" },
{ OPT_STRING, "mix-lm3", &mixFile3, "third LM to mix in" },
{ OPT_FLOAT, "mix-lambda3", &mixLambda3, "mixture weight for -mix-lm3" },
{ OPT_STRING, "mix-lm4", &mixFile4, "fourth LM to mix in" },
{ OPT_FLOAT, "mix-lambda4", &mixLambda4, "mixture weight for -mix-lm4" },
{ OPT_STRING, "mix-lm5", &mixFile5, "fifth LM to mix in" },
{ OPT_FLOAT, "mix-lambda5", &mixLambda5, "mixture weight for -mix-lm5" },
{ OPT_STRING, "mix-lm6", &mixFile6, "sixth LM to mix in" },
{ OPT_FLOAT, "mix-lambda6", &mixLambda6, "mixture weight for -mix-lm6" },
{ OPT_STRING, "mix-lm7", &mixFile7, "seventh LM to mix in" },
{ OPT_FLOAT, "mix-lambda7", &mixLambda7, "mixture weight for -mix-lm7" },
{ OPT_STRING, "mix-lm8", &mixFile8, "eighth LM to mix in" },
{ OPT_FLOAT, "mix-lambda8", &mixLambda8, "mixture weight for -mix-lm8" },
{ OPT_STRING, "mix-lm9", &mixFile9, "ninth LM to mix in" },
{ OPT_FLOAT, "mix-lambda9", &mixLambda9, "mixture weight for -mix-lm9" },
{ OPT_UINT, "nbest", &numNbest, "number of nbest hypotheses to generate in Viterbi search" },
{ OPT_UINT, "order", &order, "ngram order to use for lm" },
{ OPT_STRING, "hidden-vocab", &hiddenVocabFile, "hidden vocabulary" },
{ OPT_TRUE, "keep-unk", &keepUnk, "preserve unknown words" },
{ OPT_FALSE, "nonull", &keepnull, "remove <NULL> in factored LM" },
{ OPT_TRUE, "tolower", &toLower, "map vocabulary to lowercase" },
{ OPT_STRING, "text", &textFile, "text file to disambiguate" },
{ OPT_STRING, "text-map", &textMapFile, "text+map file to disambiguate" },
{ OPT_FLOAT, "lmw", &lmw, "language model weight" },
{ OPT_FLOAT, "mapw", &mapw, "map log likelihood weight" },
{ OPT_TRUE, "logmap", &logMap, "map file contains log probabilities" },
{ OPT_STRING, "escape", &escape, "escape prefix to pass data through -text" },
{ OPT_TRUE, "fb", &fb, "use forward-backward algorithm" },
{ OPT_TRUE, "fw-only", &fwOnly, "use only forward probabilities" },
{ OPT_TRUE, "posteriors", &posteriors, "output posterior event probabilities" },
{ OPT_TRUE, "totals", &totals, "output total string probabilities" },
{ OPT_TRUE, "continuous", &continuous, "read input without line breaks" },
{ OPT_FLOAT, "unk-prob", &unkProb, "log probability assigned to <unk>" },
{ OPT_TRUE, "force-event", &forceEvent, "disallow default event" },
{ OPT_STRING, "write-counts", &countsFile, "write posterior counts to file" },
{ OPT_UINT, "debug", &debug, "debugging level for lm" },
};
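/*
 * A typical invocation might look as follows (a sketch only; the file names
 * are placeholders, the option names are the ones defined above):
 *
 *     hidden-ngram -order 3 -lm hidden.lm -hidden-vocab events.vocab \
 *         -text sentences.txt -posteriors
 *
 * This tags each word position in sentences.txt with posterior probabilities
 * for the events listed in events.vocab, under the hidden-event n-gram model
 * in hidden.lm.
 */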
/*
 * Increment posterior counts for all prefixes of a context
 * Parameters:
 *      wid      current word
 *      context  preceding N-gram
 *      prevWid  word preceding context (Vocab_None if not available)
 */
void
incrementCounts(NgramCounts<NgramFractCount> &counts, VocabIndex wid,
                VocabIndex prevWid,
                const VocabIndex *context, NgramFractCount factor)
{
    unsigned clen = Vocab::length(context);
    VocabIndex reversed[maxNgramOrder + 2];
    assert(clen < maxNgramOrder);
    unsigned i;
    reversed[0] = prevWid;
    for (i = 0; i < clen; i ++) {
        reversed[i + 1] = context[clen - 1 - i];
        /*
         * Don't count contexts ending in *noevent*
         */
        if (reversed[i + 1] == noEventIndex) {
            break;
        }
    }
    /*
     * Increment counts of context and its suffixes.
     * Note: this is skipped if last element of context is *noevent*.
     */
    if (i == clen) {
        reversed[i + 1] = Vocab_None;
        for (unsigned j = (prevWid == Vocab_None ? 1 : 0); j <= clen; j ++) {
            *counts.insertCount(&reversed[j]) += factor;
        }
    }
    if (wid != Vocab_None) {
        reversed[i + 1] = wid;
        reversed[i + 2] = Vocab_None;
        for (unsigned j = 0; j <= clen; j ++) {
            *counts.insertCount(&reversed[j + 1]) += factor;
        }
    }
}
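/*
 * Worked example (illustrative, not part of the original source): with
 * wid = c, context = (b, a) in the usual reversed, most-recent-first order,
 * prevWid = p, and no *noevent* tokens, the reversed[] buffer becomes
 * (p, a, b) and the loops above add `factor` to the counts of
 *
 *     p a b    a b    b      (the context extended back by prevWid, and its suffixes)
 *     a b c    b c    c      (the corresponding n-grams ending in the current word)
 */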
/*
 * Find the word preceding a context
 */
VocabIndex
findPrevWord(unsigned pos, const VocabIndex *wids, const VocabIndex *context)
{
    unsigned i;
    unsigned j;
    /*
     * Match words in context against wids starting at current position
     */
    for (i = pos, j = 0; context[j] != Vocab_None; j ++) {
        if (i > 0 && wids[i - 1] == context[j]) {
            i --;
        }
    }
    if (i > 0 && j > 0 && context[j - 1] != wids[i]) {
        return wids[i - 1];
    } else {
        return Vocab_None;
    }
}
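/*
 * Roughly, the loop above walks the context from newest to oldest token,
 * consuming one sentence word (wids[i - 1]) for every context token that
 * matches it; hidden-event tokens match no sentence word and are skipped.
 * Afterwards wids[i - 1] is the word just before the stretch of the sentence
 * covered by the context; Vocab_None is returned when the context reaches the
 * sentence start, is empty, or ends (on its old side) in a plain sentence
 * word rather than an event.
 */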
/*
 * Disambiguate a sentence by finding the hidden tag sequence compatible with
 * it that has the highest probability
*/
unsigned
disambiguateSentence(VocabIndex *wids, VocabIndex *hiddenWids[],
                     LogP totalProb[], PosVocabMap &map, LM &lm,
                     NgramCounts<NgramFractCount> *hiddenCounts,
                     unsigned numNbest, Boolean positionMapped = false)
{
    unsigned len = Vocab::length(wids);
    Trellis<VocabContext> trellis(len + 1, numNbest);
    /*
     * Prime the trellis with empty context as the start state
     */
    static VocabIndex emptyContext[] = { Vocab_None };
    trellis.setProb(emptyContext, LogP_One);
    unsigned pos = 0;
    while (wids[pos] != Vocab_None) {
        trellis.step();
        /*
         * Set up new context to transition to
         * (allow enough room for one hidden event per word)
         */
        VocabIndex newContext[2 * maxWordsPerLine + 3];
        /*
         * Iterate over all contexts for the previous position in trellis
         */
        TrellisIter<VocabContext> prevIter(trellis, pos);
        VocabContext prevContext;
        LogP prevProb;
        while (prevIter.next(prevContext, prevProb)) {
            /*
             * A non-event token in the previous context is skipped for
             * purposes of LM lookup
             */
            VocabContext usedPrevContext =
                (prevContext[0] == noEventIndex) ?
                    &prevContext[1] : prevContext;
            /*
             * transition prob out of previous context to current word;
             * skip probability of first word, since it's a constant offset
             * for all states and usually infinity if the first word is <s>
             */
            LogP wordProb = (pos == 0) ?
                LogP_One : lm.wordProb(wids[pos], usedPrevContext);
            if (wordProb == LogP_Zero) {
                /*
                 * Zero probability due to an OOV
                 * would equalize the probabilities of all paths
                 * and make the Viterbi backtrace useless.
                 */
                wordProb = unkProb;
            }
            /*
             * Set up the extended context. Allow room for adding
             * the current word and a new hidden event.
             */
            unsigned i;
            for (i = 0; i < 2 * maxWordsPerLine &&
                        usedPrevContext[i] != Vocab_None; i ++)
            {
                newContext[i + 2] = usedPrevContext[i];
            }
            newContext[i + 2] = Vocab_None;
            newContext[1] = wids[pos]; /* prepend current word */
            /*
             * Iterate over all hidden events
             */
            VocabMapIter currIter(map, positionMapped ? pos : 0);
            VocabIndex currEvent;
            Prob currProb;
            while (currIter.next(currEvent, currProb)) {
                LogP localProb = logMap ? currProb : ProbToLogP(currProb);
                /*
                 * Prepend current event to context.
                 * Note: noEvent is represented here (but no earlier in
                 * the context).
                 */
                newContext[0] = currEvent;
                /*
                 * Event probability
                 */
                LogP eventProb = (currEvent == noEventIndex) ? LogP_One
                    : lm.wordProb(currEvent, &newContext[1]);
                /*
                 * Truncate context to what is actually used by LM,
                 * but keep at least one word so we can recover words later.
                 */
                VocabIndex *usedNewContext =
                    (currEvent == noEventIndex) ? &newContext[1]
                        : newContext;
                unsigned usedLength;
                lm.contextID(usedNewContext, usedLength);
                if (usedLength == 0) {
                    usedLength = 1;
                }
                VocabIndex truncatedContextWord = usedNewContext[usedLength];
                usedNewContext[usedLength] = Vocab_None;
                if (debug >= DEBUG_TRANSITIONS) {
                    cerr << "POSITION = " << pos
                         << " FROM: " << (lm.vocab.use(), prevContext)
                         << " TO: " << (lm.vocab.use(), newContext)
                         << " WORD = " << lm.vocab.getWord(wids[pos])
                         << " WORDPROB = " << wordProb
                         << " EVENTPROB = " << eventProb
                         << " LOCALPROB = " << localProb
                         << endl;
                }
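                /*
                 * Combined transition score: the LM weight scales the log
                 * probabilities of the word and the hidden event, and the map
                 * weight scales the event's (log) map likelihood.
                 */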
                trellis.update(prevContext, newContext,
                        weightLogP(lmw, wordProb + eventProb) +
                        weightLogP(mapw, localProb));
                /*
                 * Restore newContext
                 */
                usedNewContext[usedLength] = truncatedContextWord;
            }
        }
        pos ++;
    }
    if (debug >= DEBUG_TRELLIS) {
        cerr << "Trellis:\n" << trellis << endl;
    }
    /*
     * Check whether viterbi-nbest (default) is asked for. If so,
     * do it and return. Otherwise do forward-backward.
     */
    if (!fb && !fwOnly && !posteriors && !hiddenCounts) {
        //VocabContext hiddenContexts[numNbest][len + 2];
        makeArray(VocabContext *, hiddenContexts, numNbest);
        unsigned n;
        for (n = 0; n < numNbest; n ++) {
            hiddenContexts[n] = new VocabContext[len + 2];
            // One extra state for initial.
            assert(hiddenContexts[n]);
            if (trellis.nbest_viterbi(hiddenContexts[n], len + 1,
                                      n, totalProb[n]) != len + 1)
            {
                return n;
            }
            /*
             * Transfer the first word of each state (word context) into the
             * word index string
             */
            for (unsigned i = 0; i < len; i ++) {
                hiddenWids[n][i] = hiddenContexts[n][i + 1][0];
            }
            hiddenWids[n][len] = Vocab_None;
        }
        for (n = 0; n < numNbest; n ++) {
            delete [] hiddenContexts[n];
        }
        return numNbest;
    } else {
        /*
         * Viterbi-nbest wasn't asked for. We run the Forward-backward
         * algorithm: compute backward scores, but only get the 1st best.
         */
        trellis.initBack(pos);
        /*
         * Initialize backward probabilities
         */
        {