/*
 * hidden-ngram --
 *      Recover hidden word-level events using a hidden n-gram model
 *      (derived from disambig.cc)
 *
 */
#ifndef lint
static char Copyright[] = "Copyright (c) 1995-2006 SRI International. All Rights Reserved.";
static char RcsId[] = "@(#)$Id: hidden-ngram.cc,v 1.45 2006/01/05 20:21:27 stolcke Exp $";
#endif
#include <iostream>
using namespace std;
#include <stdio.h>
#include <stdlib.h>
#include <locale.h>
#include <assert.h>
#include "option.h"
#include "version.h"
#include "File.h"
#include "Vocab.h"
#include "VocabMap.h"
#include "SubVocab.h"
#include "NullLM.h"
#include "Ngram.h"
#include "ClassNgram.h"
#include "SimpleClassNgram.h"
#include "ProductNgram.h"
#include "BayesMix.h"
#include "NgramStats.cc"
#include "Trellis.cc"
#include "LHash.cc"
#include "Array.cc"
typedef const VocabIndex *VocabContext;
#define DEBUG_ZEROPROBS 1
#define DEBUG_TRANSITIONS 2
#define DEBUG_TRELLIS 3
static int version = 0;
static unsigned numNbest = 1;
static unsigned order = 3;
static unsigned debug = 0;
static char *lmFile = 0;
static char *classesFile = 0;
static int simpleClasses = 0;
static int factored = 0;
static char *mixFile = 0;
static char *mixFile2 = 0;
static char *mixFile3 = 0;
static char *mixFile4 = 0;
static char *mixFile5 = 0;
static char *mixFile6 = 0;
static char *mixFile7 = 0;
static char *mixFile8 = 0;
static char *mixFile9 = 0;
static double mixLambda = 0.5;
static double mixLambda2 = 0.0;
static double mixLambda3 = 0.0;
static double mixLambda4 = 0.0;
static double mixLambda5 = 0.0;
static double mixLambda6 = 0.0;
static double mixLambda7 = 0.0;
static double mixLambda8 = 0.0;
static double mixLambda9 = 0.0;
static char *hiddenVocabFile = 0;
static char *textFile = 0;
static char *textMapFile = 0;
static char *escape = 0;
static int keepUnk = 0;
static int keepnull = 1;
static int toLower = 0;
static double lmw = 1.0;
static double mapw = 1.0;
static int logMap = 0;
static int fb = 0;
static int fwOnly = 0;
static int posteriors = 0;
static int totals = 0;
static int continuous = 0;
static int forceEvent = 0;
static char *countsFile = 0;
typedef double NgramFractCount; /* type for posterior counts */
const LogP LogP_PseudoZero = -100;
static double unkProb = LogP_PseudoZero;
const VocabString noHiddenEvent = "*noevent*";
VocabIndex noEventIndex;
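/*
 * "*noevent*" is the default hidden event, meaning that no event occurs at a
 * given word position.  It is excluded from LM lookups and posterior counts
 * below, and can be disallowed altogether with -force-event.
 */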
static Option options[] = {
{ OPT_TRUE, "version", &version, "print version information" },
{ OPT_STRING, "lm", &lmFile, "hidden token sequence model" },
{ OPT_STRING, "classes", &classesFile, "class definitions" },
{ OPT_TRUE, "simple-classes", &simpleClasses, "use unique class model" },
{ OPT_TRUE, "factored", &factored, "use a factored LM" },
{ OPT_STRING, "mix-lm", &mixFile, "LM to mix in" },
{ OPT_FLOAT, "lambda", &mixLambda, "mixture weight for -lm" },
{ OPT_STRING, "mix-lm2", &mixFile2, "second LM to mix in" },
{ OPT_FLOAT, "mix-lambda2", &mixLambda2, "mixture weight for -mix-lm2" },
{ OPT_STRING, "mix-lm3", &mixFile3, "third LM to mix in" },
{ OPT_FLOAT, "mix-lambda3", &mixLambda3, "mixture weight for -mix-lm3" },
{ OPT_STRING, "mix-lm4", &mixFile4, "fourth LM to mix in" },
{ OPT_FLOAT, "mix-lambda4", &mixLambda4, "mixture weight for -mix-lm4" },
{ OPT_STRING, "mix-lm5", &mixFile5, "fifth LM to mix in" },
{ OPT_FLOAT, "mix-lambda5", &mixLambda5, "mixture weight for -mix-lm5" },
{ OPT_STRING, "mix-lm6", &mixFile6, "sixth LM to mix in" },
{ OPT_FLOAT, "mix-lambda6", &mixLambda6, "mixture weight for -mix-lm6" },
{ OPT_STRING, "mix-lm7", &mixFile7, "seventh LM to mix in" },
{ OPT_FLOAT, "mix-lambda7", &mixLambda7, "mixture weight for -mix-lm7" },
{ OPT_STRING, "mix-lm8", &mixFile8, "eighth LM to mix in" },
{ OPT_FLOAT, "mix-lambda8", &mixLambda8, "mixture weight for -mix-lm8" },
{ OPT_STRING, "mix-lm9", &mixFile9, "ninth LM to mix in" },
{ OPT_FLOAT, "mix-lambda9", &mixLambda9, "mixture weight for -mix-lm9" },
{ OPT_UINT, "nbest", &numNbest, "number of nbest hypotheses to generate in Viterbi search" },
{ OPT_UINT, "order", &order, "ngram order to use for lm" },
{ OPT_STRING, "hidden-vocab", &hiddenVocabFile, "hidden vocabulary" },
{ OPT_TRUE, "keep-unk", &keepUnk, "preserve unknown words" },
{ OPT_FALSE, "nonull", &keepnull, "remove <NULL> in factored LM" },
{ OPT_TRUE, "tolower", &toLower, "map vocabulary to lowercase" },
{ OPT_STRING, "text", &textFile, "text file to disambiguate" },
{ OPT_STRING, "text-map", &textMapFile, "text+map file to disambiguate" },
{ OPT_FLOAT, "lmw", &lmw, "language model weight" },
{ OPT_FLOAT, "mapw", &mapw, "map log likelihood weight" },
{ OPT_TRUE, "logmap", &logMap, "map file contains log probabilities" },
{ OPT_STRING, "escape", &escape, "escape prefix to pass data through -text" },
{ OPT_TRUE, "fb", &fb, "use forward-backward algorithm" },
{ OPT_TRUE, "fw-only", &fwOnly, "use only forward probabilities" },
{ OPT_TRUE, "posteriors", &posteriors, "output posterior event probabilities" },
{ OPT_TRUE, "totals", &totals, "output total string probabilities" },
{ OPT_TRUE, "continuous", &continuous, "read input without line breaks" },
{ OPT_FLOAT, "unk-prob", &unkProb, "log probability assigned to <unk>" },
{ OPT_TRUE, "force-event", &forceEvent, "disallow default event" },
{ OPT_STRING, "write-counts", &countsFile, "write posterior counts to file" },
{ OPT_UINT, "debug", &debug, "debugging level for lm" },
};
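/*
 * A typical invocation might look as follows (a sketch only; the file names
 * are placeholders, the option names are the ones defined above):
 *
 *     hidden-ngram -order 3 -lm hidden.lm -hidden-vocab events.vocab \
 *         -text sentences.txt -posteriors
 *
 * This tags each word position in sentences.txt with posterior probabilities
 * for the events listed in events.vocab, under the hidden-event n-gram model
 * in hidden.lm.
 */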
/*
 * Increment posterior counts for all prefixes of a context
 * Parameters:
 *      wid      current word
 *      context  preceding N-gram
 *      prevWid  word preceding context (Vocab_None if not available)
 */
void
incrementCounts(NgramCounts<NgramFractCount> &counts, VocabIndex wid,
                VocabIndex prevWid,
                const VocabIndex *context, NgramFractCount factor)
{
    unsigned clen = Vocab::length(context);
    VocabIndex reversed[maxNgramOrder + 2];
    assert(clen < maxNgramOrder);
    unsigned i;
    reversed[0] = prevWid;
    for (i = 0; i < clen; i ++) {
        reversed[i + 1] = context[clen - 1 - i];
        /*
         * Don't count contexts ending in *noevent*
         */
        if (reversed[i + 1] == noEventIndex) {
            break;
        }
    }
    /*
     * Increment counts of context and its suffixes.
     * Note: this is skipped if last element of context is *noevent*.
     */
    if (i == clen) {
        reversed[i + 1] = Vocab_None;
        for (unsigned j = (prevWid == Vocab_None ? 1 : 0); j <= clen; j ++) {
            *counts.insertCount(&reversed[j]) += factor;
        }
    }
    if (wid != Vocab_None) {
        reversed[i + 1] = wid;
        reversed[i + 2] = Vocab_None;
        for (unsigned j = 0; j <= clen; j ++) {
            *counts.insertCount(&reversed[j + 1]) += factor;
        }
    }
}
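/*
 * Worked example (illustrative, not part of the original source): with
 * wid = c, context = (b, a) in the usual reversed, most-recent-first order,
 * prevWid = p, and no *noevent* tokens, the reversed[] buffer becomes
 * (p, a, b) and the loops above add `factor` to the counts of
 *
 *     p a b    a b    b      (the context extended back by prevWid, and its suffixes)
 *     a b c    b c    c      (the corresponding n-grams ending in the current word)
 */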
/*
 * Find the word preceding a context
 */
VocabIndex
findPrevWord(unsigned pos, const VocabIndex *wids, const VocabIndex *context)
{
    unsigned i;
    unsigned j;
    /*
     * Match words in context against wids starting at current position
     */
    for (i = pos, j = 0; context[j] != Vocab_None; j ++) {
        if (i > 0 && wids[i - 1] == context[j]) {
            i --;
        }
    }
    if (i > 0 && j > 0 && context[j - 1] != wids[i]) {
        return wids[i - 1];
    } else {
        return Vocab_None;
    }
}
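/*
 * Roughly, the loop above walks the context from newest to oldest token,
 * consuming one sentence word (wids[i - 1]) for every context token that
 * matches it; hidden-event tokens match no sentence word and are skipped.
 * Afterwards wids[i - 1] is the word just before the stretch of the sentence
 * covered by the context; Vocab_None is returned when the context reaches the
 * sentence start, is empty, or ends (on its old side) in a plain sentence
 * word rather than an event.
 */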
/*
 * Disambiguate a sentence by finding the hidden tag sequence compatible with
 * it that has the highest probability
*/
unsigned
disambiguateSentence(VocabIndex *wids, VocabIndex *hiddenWids[],
                     LogP totalProb[], PosVocabMap &map, LM &lm,
                     NgramCounts<NgramFractCount> *hiddenCounts,
                     unsigned numNbest, Boolean positionMapped = false)
{
    unsigned len = Vocab::length(wids);
    Trellis<VocabContext> trellis(len + 1, numNbest);
    /*
     * Prime the trellis with empty context as the start state
     */
    static VocabIndex emptyContext[] = { Vocab_None };
    trellis.setProb(emptyContext, LogP_One);
    unsigned pos = 0;
    while (wids[pos] != Vocab_None) {
        trellis.step();
        /*
         * Set up new context to transition to
         * (allow enough room for one hidden event per word)
         */
        VocabIndex newContext[2 * maxWordsPerLine + 3];
        /*
         * Iterate over all contexts for the previous position in trellis
         */
        TrellisIter<VocabContext> prevIter(trellis, pos);
        VocabContext prevContext;
        LogP prevProb;
        while (prevIter.next(prevContext, prevProb)) {
            /*
             * A non-event token in the previous context is skipped for
             * purposes of LM lookup
             */
            VocabContext usedPrevContext =
                (prevContext[0] == noEventIndex) ?
                    &prevContext[1] : prevContext;
            /*
             * transition prob out of previous context to current word;
             * skip probability of first word, since it's a constant offset
             * for all states and usually infinity if the first word is <s>
             */
            LogP wordProb = (pos == 0) ?
                LogP_One : lm.wordProb(wids[pos], usedPrevContext);
            if (wordProb == LogP_Zero) {
                /*
                 * Zero probability due to an OOV
                 * would equalize the probabilities of all paths
                 * and make the Viterbi backtrace useless.
                 */
                wordProb = unkProb;
            }
            /*
             * Set up the extended context. Allow room for adding
             * the current word and a new hidden event.
             */
            unsigned i;
            for (i = 0; i < 2 * maxWordsPerLine &&
                        usedPrevContext[i] != Vocab_None; i ++)
            {
                newContext[i + 2] = usedPrevContext[i];
            }
            newContext[i + 2] = Vocab_None;
            newContext[1] = wids[pos]; /* prepend current word */
            /*
             * Iterate over all hidden events
             */
            VocabMapIter currIter(map, positionMapped ? pos : 0);
            VocabIndex currEvent;
            Prob currProb;
            while (currIter.next(currEvent, currProb)) {
                LogP localProb = logMap ? currProb : ProbToLogP(currProb);
                /*
                 * Prepend current event to context.
                 * Note: noEvent is represented here (but no earlier in
                 * the context).
                 */
                newContext[0] = currEvent;
                /*
                 * Event probability
                 */
                LogP eventProb = (currEvent == noEventIndex) ? LogP_One
                    : lm.wordProb(currEvent, &newContext[1]);
                /*
                 * Truncate context to what is actually used by LM,
                 * but keep at least one word so we can recover words later.
                 */
                VocabIndex *usedNewContext =
                    (currEvent == noEventIndex) ? &newContext[1]
                        : newContext;
                unsigned usedLength;
                lm.contextID(usedNewContext, usedLength);
                if (usedLength == 0) {
                    usedLength = 1;
                }
                VocabIndex truncatedContextWord = usedNewContext[usedLength];
                usedNewContext[usedLength] = Vocab_None;
                if (debug >= DEBUG_TRANSITIONS) {
                    cerr << "POSITION = " << pos
                         << " FROM: " << (lm.vocab.use(), prevContext)
                         << " TO: " << (lm.vocab.use(), newContext)
                         << " WORD = " << lm.vocab.getWord(wids[pos])
                         << " WORDPROB = " << wordProb
                         << " EVENTPROB = " << eventProb
                         << " LOCALPROB = " << localProb
                         << endl;
                }
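                /*
                 * Combined transition score: the LM weight scales the log
                 * probabilities of the word and the hidden event, and the map
                 * weight scales the event's (log) map likelihood.
                 */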
                trellis.update(prevContext, newContext,
                        weightLogP(lmw, wordProb + eventProb) +
                        weightLogP(mapw, localProb));
                /*
                 * Restore newContext
                 */
                usedNewContext[usedLength] = truncatedContextWord;
            }
        }
        pos ++;
    }
    if (debug >= DEBUG_TRELLIS) {
        cerr << "Trellis:\n" << trellis << endl;
    }
    /*
     * Check whether viterbi-nbest (default) is asked for. If so,
     * do it and return. Otherwise do forward-backward.
     */
    if (!fb && !fwOnly && !posteriors && !hiddenCounts) {
        //VocabContext hiddenContexts[numNbest][len + 2];
        makeArray(VocabContext *, hiddenContexts, numNbest);
        unsigned n;
        for (n = 0; n < numNbest; n ++) {
            hiddenContexts[n] = new VocabContext[len + 2];
            // One extra state for initial.
            assert(hiddenContexts[n]);
            if (trellis.nbest_viterbi(hiddenContexts[n], len + 1,
                                      n, totalProb[n]) != len + 1)
            {
                return n;
            }
            /*
             * Transfer the first word of each state (word context) into the
             * word index string
             */
            for (unsigned i = 0; i < len; i ++) {
                hiddenWids[n][i] = hiddenContexts[n][i + 1][0];
            }
            hiddenWids[n][len] = Vocab_None;
        }
        for (n = 0; n < numNbest; n ++) {
            delete [] hiddenContexts[n];
        }
        return numNbest;
    } else {
        /*
         * Viterbi-nbest wasn't asked for. We run the Forward-backward
         * algorithm: compute backward scores, but only get the 1st best.
         */
        trellis.initBack(pos);
        /*
         * Initialize backward probabilities
         */
        {