/*
* disambig --
* Disambiguate text tokens using a hidden ngram model
*
*/
#ifndef lint
static char Copyright[] = "Copyright (c) 1995-2006 SRI International. All Rights Reserved.";
static char RcsId[] = "@(#)$Id: disambig.cc,v 1.40 2006/01/05 20:21:27 stolcke Exp $";
#endif
#include <iostream>
using namespace std;
#include <stdio.h>
#include <stdlib.h>
#include <locale.h>
#include "option.h"
#include "version.h"
#include "File.h"
#include "Vocab.h"
#include "VocabMap.h"
#include "NullLM.h"
#include "Ngram.h"
#include "Trellis.cc"
#include "LHash.cc"
#include "Array.cc"
typedef const VocabIndex *VocabContext;
#define DEBUG_ZEROPROBS 1
#define DEBUG_TRANSITIONS 2
#define DEBUG_TRELLIS 3
static int version = 0;
static unsigned numNbest = 1;
static unsigned order = 2;
static unsigned debug = 0;
static int scale = 0;
static char *lmFile = 0;
static char *vocab1File = 0;
static char *vocab2File = 0;
static char *mapFile = 0;
static char *classesFile = 0;
static char *mapWriteFile = 0;
static char *textFile = 0;
static char *textMapFile = 0;
static char *countsFile = 0;
static int keepUnk = 0;
static int tolower1 = 0;
static int tolower2 = 0;
static double lmw = 1.0;
static double mapw = 1.0;
static int fb = 0;
static int fwOnly = 0;
static int posteriors = 0;
static int totals = 0;
static int logMap = 0;
static int noEOS = 0;
static int continuous = 0;
/*
 * Stand-in for log(0): used below so that zero-probability (OOV)
 * transitions are heavily penalized without wiping out entire paths.
 */
const LogP LogP_PseudoZero = -100;
static Option options[] = {
{ OPT_TRUE, "version", &version, "print version information" },
{ OPT_STRING, "lm", &lmFile, "hidden token sequence model" },
{ OPT_UINT, "nbest", &numNbest, "number of nbest hypotheses to generate in Viterbi search" },
{ OPT_UINT, "order", &order, "ngram order to use for lm" },
{ OPT_STRING, "write-vocab1", &vocab1File, "output observable vocabulary" },
{ OPT_STRING, "write-vocab2", &vocab2File, "output hidden vocabulary" },
{ OPT_STRING, "map", &mapFile, "mapping from observable to hidden tokens" },
{ OPT_STRING, "classes", &classesFile, "mapping in class expansion format" },
{ OPT_TRUE, "logmap", &logMap, "map file contains log probabilities" },
{ OPT_STRING, "write-map", &mapWriteFile, "output map file (for validation)" },
{ OPT_STRING, "write-counts", &countsFile, "output substitution counts" },
{ OPT_TRUE, "scale", &scale, "scale map probabilities by unigram probs" },
{ OPT_TRUE, "keep-unk", &keepUnk, "preserve unknown words" },
{ OPT_TRUE, "tolower1", &tolower1, "map observable vocabulary to lowercase" },
{ OPT_TRUE, "tolower2", &tolower2, "map hidden vocabulary to lowercase" },
{ OPT_STRING, "text", &textFile, "text file to disambiguate" },
{ OPT_STRING, "text-map", &textMapFile, "text+map file to disambiguate" },
{ OPT_FLOAT, "lmw", &lmw, "language model weight" },
{ OPT_FLOAT, "mapw", &mapw, "map log likelihood weight" },
{ OPT_TRUE, "fb", &fb, "use forward-backward algorithm" },
{ OPT_TRUE, "fwOnly", &fwOnly, "use forward probabilities only" },
{ OPT_TRUE, "posteriors", &posteriors, "output posterior probabilities" },
{ OPT_TRUE, "totals", &totals, "output total string probabilities" },
{ OPT_TRUE, "no-eos", &noEOS, "don't assume end-of-sentence token" },
{ OPT_TRUE, "continuous", &continuous, "read input without line breaks" },
{ OPT_UINT, "debug", &debug, "debugging level for lm" },
};
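/*
 * A typical invocation (file names here are only placeholders) might be
 *
 *	disambig -lm tagger.ngram -order 2 -map word-to-tag.map -text input.txt
 *
 * which prints, for each sentence in input.txt, the most likely hidden
 * token sequence under the ngram model and the word-to-tag map.
 */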
/*
* Disambiguate a sentence by finding the hidden tag sequence compatible with
* it that has the highest probability
*/
unsigned
disambiguateSentence(Vocab &vocab, VocabIndex *wids, VocabIndex *hiddenWids[],
LogP totalProb[], VocabMap &map, LM &lm, VocabMap *counts,
unsigned numNbest, Boolean positionMapped = false)
{
static VocabIndex emptyContext[] = { Vocab_None };
unsigned len = Vocab::length(wids);
assert(len > 0);
Trellis<VocabContext> trellis(len, numNbest);
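    /*
     * One trellis column per input position; each state within a column is
     * an LM context over hidden tokens.  The second constructor argument
     * sizes the trellis for the N-best Viterbi extraction below.
     */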
/*
* Prime the trellis with the tag likelihoods of the first position
* (No transition probs for this one.)
*/
{
VocabMapIter iter(map, positionMapped ? (VocabIndex)0 : wids[0]);
VocabIndex context[2];
Prob prob;
context[1] = Vocab_None;
while (iter.next(context[0], prob)) {
LogP localProb = logMap ? prob : ProbToLogP(prob);
trellis.setProb(context, localProb);
}
}
unsigned pos = 1;
while (wids[pos] != Vocab_None) {
trellis.step();
/*
* Iterate over all combinations of hidden tags for the current
* position and contexts for the previous position.
*/
VocabMapIter currIter(map,
positionMapped ? (VocabIndex)pos : wids[pos]);
VocabIndex currWid;
Prob currProb;
while (currIter.next(currWid, currProb)) {
LogP localProb = logMap ? currProb : ProbToLogP(currProb);
LogP unigramProb = lm.wordProb(currWid, emptyContext);
if (debug >= DEBUG_ZEROPROBS &&
unigramProb == LogP_Zero &&
currWid != map.vocab2.unkIndex())
{
cerr << "warning: map token " << map.vocab2.getWord(currWid)
<< " has prob zero\n";
}
if (scale && unigramProb != LogP_Zero) {
/*
* Scale the map probability p(tag|word)
* by the tag prior probability p(tag), so the
* local distance is a scaled version of the
* likelihood p(word|tag)
*/
localProb -= unigramProb;
}
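	    /*
	     * By Bayes' rule, log p(tag|word) - log p(tag)
	     *   = log p(word|tag) - log p(word),
	     * and log p(word) is constant over tags at this position, so
	     * the scaled score ranks tags by the likelihood p(word|tag).
	     */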
/*
* Set up new context to transition to
*/
VocabIndex newContext[maxWordsPerLine + 2];
newContext[0] = currWid;
TrellisIter<VocabContext> prevIter(trellis, pos - 1);
VocabContext prevContext;
LogP prevProb;
while (prevIter.next(prevContext, prevProb)) {
LogP transProb = lm.wordProb(currWid, prevContext);
if (transProb == LogP_Zero && unigramProb == LogP_Zero) {
/*
* Zero probability due to an OOV
* would equalize the probabilities of all paths
* and make the Viterbi backtrace useless.
*/
transProb = LogP_PseudoZero;
}
		/*
		 * Determine the new (extended) context
		 */
unsigned i;
for (i = 0; i < maxWordsPerLine &&
prevContext[i] != Vocab_None; i ++)
{
newContext[i + 1] = prevContext[i];
}
newContext[i + 1] = Vocab_None;
/*
* Truncate context to what is actually used by LM,
* but keep at least one word so we can recover words later.
*/
unsigned usedLength;
lm.contextID(newContext, usedLength);
newContext[usedLength > 0 ? usedLength : 1] = Vocab_None;
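		/*
		 * contextID() reports how many context words the LM actually
		 * uses (at most order-1 for an ngram), so longer histories
		 * that the LM cannot distinguish collapse into a single
		 * trellis state.
		 */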
if (debug >= DEBUG_TRANSITIONS) {
cerr << "position = " << pos
<< " from: " << (map.vocab2.use(), prevContext)
<< " to: " << (map.vocab2.use(), newContext)
<< " trans = " << transProb
<< " local = " << localProb
<< endl;
}
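		/*
		 * Transition score: weighted log-linear combination of the
		 * LM transition probability (-lmw) and the local map
		 * probability (-mapw).
		 */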
trellis.update(prevContext, newContext,
weightLogP(lmw, transProb) +
weightLogP(mapw, localProb));
}
}
pos ++;
}
if (debug >= DEBUG_TRELLIS) {
cerr << "Trellis:\n" << trellis << endl;
}
    /*
     * Check whether Viterbi N-best decoding (the default) was requested.
     * If so, do it and return; otherwise run forward-backward.
     */
if (!fb && !fwOnly && !posteriors) {
//VocabContext hiddenContexts[numNbest][len + 1];
makeArray(VocabContext *, hiddenContexts, numNbest);
for (unsigned n = 0; n < numNbest; n++) {
hiddenContexts[n] = new VocabContext[len + 1];
assert(hiddenContexts[n] != 0);
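	    /*
	     * nbest_viterbi() recovers the n-th best state sequence; if it
	     * cannot return a full-length path there are only n complete
	     * hypotheses, so stop and return that count.
	     */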
if (trellis.nbest_viterbi(hiddenContexts[n], len,
n, totalProb[n]) != len)
{
return n;
}
/*
* Transfer the first word of each state (word context) into the
* word index string
*/
for (unsigned i = 0; i < len; i ++) {
hiddenWids[n][i] = hiddenContexts[n][i][0];
}
hiddenWids[n][len] = Vocab_None;
}
/*
* update v1-v2 counts if requested
*/
if (counts) {
for (unsigned i = 0; i < len; i++) {
counts->put(wids[i], hiddenWids[0][i],
counts->get(wids[i], hiddenWids[0][i]) + 1);
}
}
for (unsigned i = 0; i < numNbest; i++) {
delete [] hiddenContexts[i];
}
return numNbest;
} else {
	/*
	 * Viterbi N-best was not requested.  Run the forward-backward
	 * algorithm: compute backward scores, but extract only the single
	 * best hypothesis.
	 */
pos --;
trellis.initBack(pos);
	/*
	 * Initialize backward probabilities: the backward recursion starts
	 * at the final position with probability one in every state.
	 */
{
TrellisIter<VocabContext> iter(trellis, pos);
VocabContext context;
LogP prob;
while (iter.next(context, prob)) {
trellis.setBackProb(context, LogP_One);
}
}
while (pos > 0) {
trellis.stepBack();
/*
* Iterate over all combinations of hidden tags for the current
* position and contexts for the previous position.
*/
VocabMapIter currIter(map,
positionMapped ? (VocabIndex)pos : wids[pos]);
VocabIndex currWid;
Prob currProb;
while (currIter.next(currWid, currProb)) {
LogP localProb = logMap ? currProb : ProbToLogP(currProb);
LogP unigramProb = lm.wordProb(currWid, emptyContext);
if (debug >= DEBUG_ZEROPROBS &&
unigramProb == LogP_Zero &&
currWid != map.vocab2.unkIndex())
{
cerr << "warning: map token " << map.vocab2.getWord(currWid)
<< " has prob zero\n";
}
if (scale && unigramProb != LogP_Zero) {
/*
* Scale the map probability p(tag|word)
* by the tag prior probability p(tag), so the
* local distance is a scaled version of the
* likelihood p(word|tag)
*/
localProb -= unigramProb;
}
/*
* Set up new context to transition to
*/
VocabIndex newContext[maxWordsPerLine + 2];
newContext[0] = currWid;
TrellisIter<VocabContext> prevIter(trellis, pos - 1);
VocabContext prevContext;
LogP prevProb;
while (prevIter.next(prevContext, prevProb)) {
LogP transProb = lm.wordProb(currWid, prevContext);
if (transProb == LogP_Zero && unigramProb == LogP_Zero) {
/*
* Zero probability due to an OOV
* would equalize the probabilities of all paths
* and make the Viterbi backtrace useless.
*/
transProb = LogP_PseudoZero;
}
		/*
		 * Determine the new (extended) context
		 */
unsigned i;
for (i = 0; i < maxWordsPerLine &&
prevContext[i] != Vocab_None; i ++)
{
newContext[i + 1] = prevContext[i];
}
newContext[i + 1] = Vocab_None;
/*
* Truncate context to what is actually used by LM,
* but keep at least one word so we can recover words later.
*/
unsigned usedLength;
lm.contextID(newContext, usedLength);
newContext[usedLength > 0 ? usedLength : 1] = Vocab_None;
trellis.updateBack(prevContext, newContext,
weightLogP(lmw, transProb) +
weightLogP(mapw, localProb));
if (debug >= DEBUG_TRANSITIONS) {
cerr << "backward position = " << pos
<< " from: " << (map.vocab2.use(), prevContext)
<< " to: " << (map.vocab2.use(), newContext)
<< " trans = " << transProb
<< " local = " << localProb
<< endl;
}
}
}
pos --;
}
/*
* Compute posterior symbol probabilities and extract most
* probable symbol for each position
*/
for (pos = 0; pos < len; pos ++) {
/*
* Compute symbol probabilities by summing posterior probs
* of all states corresponding to the same symbol
*/
LHash<VocabIndex,LogP2> symbolProbs;
TrellisIter<VocabContext> iter(trellis, pos);
VocabContext context;
LogP forwProb;
while (iter.next(context, forwProb)) {
LogP2 posterior;
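		/*
		 * Three scoring modes: -fwOnly keeps the forward score only;
		 * -fb combines forward and backward scores (the usual HMM
		 * posterior); otherwise (-posteriors alone) the maximal
		 * forward and backward path scores through this state are
		 * combined, a Viterbi-style approximation.
		 */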
if (fwOnly) {
posterior = forwProb;
} else if (fb) {
posterior = forwProb + trellis.getBackLogP(context, pos);
} else {
LogP backmax;
posterior = trellis.getMax(context, pos, backmax);
posterior += backmax;
}
Boolean foundP;
LogP2 *symbolProb = symbolProbs.insert(context[0], foundP);
if (!foundP) {
*symbolProb = posterior;
} else {
*symbolProb = AddLogP(*symbolProb, posterior);
}
}
/*
* Find symbol with highest posterior
*/
LogP2 totalPosterior = LogP_Zero;
LogP2 maxPosterior = LogP_Zero;
VocabIndex bestSymbol = Vocab_None;
LHashIter<VocabIndex,LogP2> symbolIter(symbolProbs);
LogP2 *symbolProb;
VocabIndex symbol;
	    while ((symbolProb = symbolIter.next(symbol))) {
		totalPosterior = AddLogP(totalPosterior, *symbolProb);

		if (bestSymbol == Vocab_None || *symbolProb > maxPosterior) {
		    bestSymbol = symbol;
		    maxPosterior = *symbolProb;
		}
	    }