/*
* disambig --
* Disambiguate text tokens using a hidden ngram model
*
*/
#ifndef lint
static char Copyright[] = "Copyright (c) 1995-2006 SRI International. All Rights Reserved.";
static char RcsId[] = "@(#)$Id: disambig.cc,v 1.40 2006/01/05 20:21:27 stolcke Exp $";
#endif
#include <iostream>
using namespace std;
#include <stdio.h>
#include <stdlib.h>
#include <locale.h>
#include "option.h"
#include "version.h"
#include "File.h"
#include "Vocab.h"
#include "VocabMap.h"
#include "NullLM.h"
#include "Ngram.h"
#include "Trellis.cc"
#include "LHash.cc"
#include "Array.cc"
typedef const VocabIndex *VocabContext;
#define DEBUG_ZEROPROBS 1
#define DEBUG_TRANSITIONS 2
#define DEBUG_TRELLIS 3
static int version = 0;
static unsigned numNbest = 1;
static unsigned order = 2;
static unsigned debug = 0;
static int scale = 0;
static char *lmFile = 0;
static char *vocab1File = 0;
static char *vocab2File = 0;
static char *mapFile = 0;
static char *classesFile = 0;
static char *mapWriteFile = 0;
static char *textFile = 0;
static char *textMapFile = 0;
static char *countsFile = 0;
static int keepUnk = 0;
static int tolower1 = 0;
static int tolower2 = 0;
static double lmw = 1.0;
static double mapw = 1.0;
static int fb = 0;
static int fwOnly = 0;
static int posteriors = 0;
static int totals = 0;
static int logMap = 0;
static int noEOS = 0;
static int continuous = 0;
/*
 * Stand-in for log(0): used below so that zero-probability (OOV)
 * transitions are heavily penalized without wiping out entire paths.
 */
const LogP LogP_PseudoZero = -100;
static Option options[] = {
{ OPT_TRUE, "version", &version, "print version information" },
{ OPT_STRING, "lm", &lmFile, "hidden token sequence model" },
{ OPT_UINT, "nbest", &numNbest, "number of nbest hypotheses to generate in Viterbi search" },
{ OPT_UINT, "order", &order, "ngram order to use for lm" },
{ OPT_STRING, "write-vocab1", &vocab1File, "output observable vocabulary" },
{ OPT_STRING, "write-vocab2", &vocab2File, "output hidden vocabulary" },
{ OPT_STRING, "map", &mapFile, "mapping from observable to hidden tokens" },
{ OPT_STRING, "classes", &classesFile, "mapping in class expansion format" },
{ OPT_TRUE, "logmap", &logMap, "map file contains log probabilities" },
{ OPT_STRING, "write-map", &mapWriteFile, "output map file (for validation)" },
{ OPT_STRING, "write-counts", &countsFile, "output substitution counts" },
{ OPT_TRUE, "scale", &scale, "scale map probabilities by unigram probs" },
{ OPT_TRUE, "keep-unk", &keepUnk, "preserve unknown words" },
{ OPT_TRUE, "tolower1", &tolower1, "map observable vocabulary to lowercase" },
{ OPT_TRUE, "tolower2", &tolower2, "map hidden vocabulary to lowercase" },
{ OPT_STRING, "text", &textFile, "text file to disambiguate" },
{ OPT_STRING, "text-map", &textMapFile, "text+map file to disambiguate" },
{ OPT_FLOAT, "lmw", &lmw, "language model weight" },
{ OPT_FLOAT, "mapw", &mapw, "map log likelihood weight" },
{ OPT_TRUE, "fb", &fb, "use forward-backward algorithm" },
{ OPT_TRUE, "fwOnly", &fwOnly, "use forward probabilities only" },
{ OPT_TRUE, "posteriors", &posteriors, "output posterior probabilities" },
{ OPT_TRUE, "totals", &totals, "output total string probabilities" },
{ OPT_TRUE, "no-eos", &noEOS, "don't assume end-of-sentence token" },
{ OPT_TRUE, "continuous", &continuous, "read input without line breaks" },
{ OPT_UINT, "debug", &debug, "debugging level for lm" },
};
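/*
 * A typical invocation (file names here are only placeholders) might be
 *
 *	disambig -lm tagger.ngram -order 2 -map word-to-tag.map -text input.txt
 *
 * which prints, for each sentence in input.txt, the most likely hidden
 * token sequence under the ngram model and the word-to-tag map.
 */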
/*
* Disambiguate a sentence by finding the hidden tag sequence compatible with
* it that has the highest probability
*/
unsigned
disambiguateSentence(Vocab &vocab, VocabIndex *wids, VocabIndex *hiddenWids[],
LogP totalProb[], VocabMap &map, LM &lm, VocabMap *counts,
unsigned numNbest, Boolean positionMapped = false)
{
static VocabIndex emptyContext[] = { Vocab_None };
unsigned len = Vocab::length(wids);
assert(len > 0);
Trellis<VocabContext> trellis(len, numNbest);
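    /*
     * One trellis column per input position; each state within a column is
     * an LM context over hidden tokens.  The second constructor argument
     * sizes the trellis for the N-best Viterbi extraction below.
     */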
/*
* Prime the trellis with the tag likelihoods of the first position
* (No transition probs for this one.)
*/
{
VocabMapIter iter(map, positionMapped ? (VocabIndex)0 : wids[0]);
VocabIndex context[2];
Prob prob;
context[1] = Vocab_None;
while (iter.next(context[0], prob)) {
LogP localProb = logMap ? prob : ProbToLogP(prob);
trellis.setProb(context, localProb);
}
}
unsigned pos = 1;
while (wids[pos] != Vocab_None) {
trellis.step();
/*
* Iterate over all combinations of hidden tags for the current
* position and contexts for the previous position.
*/
VocabMapIter currIter(map,
positionMapped ? (VocabIndex)pos : wids[pos]);
VocabIndex currWid;
Prob currProb;
while (currIter.next(currWid, currProb)) {
LogP localProb = logMap ? currProb : ProbToLogP(currProb);
LogP unigramProb = lm.wordProb(currWid, emptyContext);
if (debug >= DEBUG_ZEROPROBS &&
unigramProb == LogP_Zero &&
currWid != map.vocab2.unkIndex())
{
cerr << "warning: map token " << map.vocab2.getWord(currWid)
<< " has prob zero\n";
}
if (scale && unigramProb != LogP_Zero) {
/*
* Scale the map probability p(tag|word)
* by the tag prior probability p(tag), so the
* local distance is a scaled version of the
* likelihood p(word|tag)
*/
localProb -= unigramProb;
}
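	    /*
	     * By Bayes' rule, log p(tag|word) - log p(tag)
	     *   = log p(word|tag) - log p(word),
	     * and log p(word) is constant over tags at this position, so
	     * the scaled score ranks tags by the likelihood p(word|tag).
	     */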
/*
* Set up new context to transition to
*/
VocabIndex newContext[maxWordsPerLine + 2];
newContext[0] = currWid;
TrellisIter<VocabContext> prevIter(trellis, pos - 1);
VocabContext prevContext;
LogP prevProb;
while (prevIter.next(prevContext, prevProb)) {
LogP transProb = lm.wordProb(currWid, prevContext);
if (transProb == LogP_Zero && unigramProb == LogP_Zero) {
/*
* Zero probability due to an OOV
* would equalize the probabilities of all paths
* and make the Viterbi backtrace useless.
*/
transProb = LogP_PseudoZero;
}
		/*
		 * Determine the new (extended) context
		 */
unsigned i;
for (i = 0; i < maxWordsPerLine &&
prevContext[i] != Vocab_None; i ++)
{
newContext[i + 1] = prevContext[i];
}
newContext[i + 1] = Vocab_None;
/*
* Truncate context to what is actually used by LM,
* but keep at least one word so we can recover words later.
*/
unsigned usedLength;
lm.contextID(newContext, usedLength);
newContext[usedLength > 0 ? usedLength : 1] = Vocab_None;
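		/*
		 * contextID() reports how many context words the LM actually
		 * uses (at most order-1 for an ngram), so longer histories
		 * that the LM cannot distinguish collapse into a single
		 * trellis state.
		 */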
if (debug >= DEBUG_TRANSITIONS) {
cerr << "position = " << pos
<< " from: " << (map.vocab2.use(), prevContext)
<< " to: " << (map.vocab2.use(), newContext)
<< " trans = " << transProb
<< " local = " << localProb
<< endl;
}
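		/*
		 * Transition score: weighted log-linear combination of the
		 * LM transition probability (-lmw) and the local map
		 * probability (-mapw).
		 */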
trellis.update(prevContext, newContext,
weightLogP(lmw, transProb) +
weightLogP(mapw, localProb));
}
}
pos ++;
}
if (debug >= DEBUG_TRELLIS) {
cerr << "Trellis:\n" << trellis << endl;
}
    /*
     * Check whether Viterbi N-best decoding (the default) was requested.
     * If so, do it and return; otherwise run forward-backward.
     */
if (!fb && !fwOnly && !posteriors) {
//VocabContext hiddenContexts[numNbest][len + 1];
makeArray(VocabContext *, hiddenContexts, numNbest);
for (unsigned n = 0; n < numNbest; n++) {
hiddenContexts[n] = new VocabContext[len + 1];
assert(hiddenContexts[n] != 0);
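	    /*
	     * nbest_viterbi() recovers the n-th best state sequence; if it
	     * cannot return a full-length path there are only n complete
	     * hypotheses, so stop and return that count.
	     */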
if (trellis.nbest_viterbi(hiddenContexts[n], len,
n, totalProb[n]) != len)
{
return n;
}
/*
* Transfer the first word of each state (word context) into the
* word index string
*/
for (unsigned i = 0; i < len; i ++) {
hiddenWids[n][i] = hiddenContexts[n][i][0];
}
hiddenWids[n][len] = Vocab_None;
}
/*
* update v1-v2 counts if requested
*/
if (counts) {
for (unsigned i = 0; i < len; i++) {
counts->put(wids[i], hiddenWids[0][i],
counts->get(wids[i], hiddenWids[0][i]) + 1);
}
}
for (unsigned i = 0; i < numNbest; i++) {
delete [] hiddenContexts[i];
}
return numNbest;
} else {
	/*
	 * Viterbi N-best was not requested.  Run the forward-backward
	 * algorithm: compute backward scores, but extract only the single
	 * best hypothesis.
	 */
pos --;
trellis.initBack(pos);
	/*
	 * Initialize backward probabilities: the backward recursion starts
	 * at the final position with probability one in every state.
	 */
{
TrellisIter<VocabContext> iter(trellis, pos);
VocabContext context;
LogP prob;
while (iter.next(context, prob)) {
trellis.setBackProb(context, LogP_One);
}
}
while (pos > 0) {
trellis.stepBack();
/*
* Iterate over all combinations of hidden tags for the current
* position and contexts for the previous position.
*/
VocabMapIter currIter(map,
positionMapped ? (VocabIndex)pos : wids[pos]);
VocabIndex currWid;
Prob currProb;
while (currIter.next(currWid, currProb)) {
LogP localProb = logMap ? currProb : ProbToLogP(currProb);
LogP unigramProb = lm.wordProb(currWid, emptyContext);
if (debug >= DEBUG_ZEROPROBS &&
unigramProb == LogP_Zero &&
currWid != map.vocab2.unkIndex())
{
cerr << "warning: map token " << map.vocab2.getWord(currWid)
<< " has prob zero\n";
}
if (scale && unigramProb != LogP_Zero) {
/*
* Scale the map probability p(tag|word)
* by the tag prior probability p(tag), so the
* local distance is a scaled version of the
* likelihood p(word|tag)
*/
localProb -= unigramProb;
}
/*
* Set up new context to transition to
*/
VocabIndex newContext[maxWordsPerLine + 2];
newContext[0] = currWid;
TrellisIter<VocabContext> prevIter(trellis, pos - 1);
VocabContext prevContext;
LogP prevProb;
while (prevIter.next(prevContext, prevProb)) {
LogP transProb = lm.wordProb(currWid, prevContext);
if (transProb == LogP_Zero && unigramProb == LogP_Zero) {
/*
* Zero probability due to an OOV
* would equalize the probabilities of all paths
* and make the Viterbi backtrace useless.
*/
transProb = LogP_PseudoZero;
}
		/*
		 * Determine the new (extended) context
		 */
unsigned i;
for (i = 0; i < maxWordsPerLine &&
prevContext[i] != Vocab_None; i ++)
{
newContext[i + 1] = prevContext[i];
}
newContext[i + 1] = Vocab_None;
/*
* Truncate context to what is actually used by LM,
* but keep at least one word so we can recover words later.
*/
unsigned usedLength;
lm.contextID(newContext, usedLength);
newContext[usedLength > 0 ? usedLength : 1] = Vocab_None;
trellis.updateBack(prevContext, newContext,
weightLogP(lmw, transProb) +
weightLogP(mapw, localProb));
if (debug >= DEBUG_TRANSITIONS) {
cerr << "backward position = " << pos
<< " from: " << (map.vocab2.use(), prevContext)
<< " to: " << (map.vocab2.use(), newContext)
<< " trans = " << transProb
<< " local = " << localProb
<< endl;
}
}
}
pos --;
}
/*
* Compute posterior symbol probabilities and extract most
* probable symbol for each position
*/
for (pos = 0; pos < len; pos ++) {
/*
* Compute symbol probabilities by summing posterior probs
* of all states corresponding to the same symbol
*/
LHash<VocabIndex,LogP2> symbolProbs;
TrellisIter<VocabContext> iter(trellis, pos);
VocabContext context;
LogP forwProb;
while (iter.next(context, forwProb)) {
LogP2 posterior;
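		/*
		 * Three scoring modes: -fwOnly keeps the forward score only;
		 * -fb combines forward and backward scores (the usual HMM
		 * posterior); otherwise (-posteriors alone) the maximal
		 * forward and backward path scores through this state are
		 * combined, a Viterbi-style approximation.
		 */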
if (fwOnly) {
posterior = forwProb;
} else if (fb) {
posterior = forwProb + trellis.getBackLogP(context, pos);
} else {
LogP backmax;
posterior = trellis.getMax(context, pos, backmax);
posterior += backmax;
}
Boolean foundP;
LogP2 *symbolProb = symbolProbs.insert(context[0], foundP);
if (!foundP) {
*symbolProb = posterior;
} else {
*symbolProb = AddLogP(*symbolProb, posterior);
}
}
/*
* Find symbol with highest posterior
*/
LogP2 totalPosterior = LogP_Zero;
LogP2 maxPosterior = LogP_Zero;
VocabIndex bestSymbol = Vocab_None;
LHashIter<VocabIndex,LogP2> symbolIter(symbolProbs);
LogP2 *symbolProb;
VocabIndex symbol;
	    while ((symbolProb = symbolIter.next(symbol))) {
		totalPosterior = AddLogP(totalPosterior, *symbolProb);

		if (bestSymbol == Vocab_None || *symbolProb > maxPosterior) {
		    bestSymbol = symbol;
		    maxPosterior = *symbolProb;
		}
	    }