/*
 * hidden-ngram.cc --
 *	Recover hidden word-level events using a hidden-event N-gram model.
 *	(Excerpt; non-source viewer text removed during transcription.)
 */
TrellisIter<VocabContext> iter(trellis, pos);
VocabContext context;
LogP prob;
while (iter.next(context, prob)) {
trellis.setBackProb(context, LogP_One);
}
}
while (pos > 0) {
trellis.stepBack();
/*
* Set up new context to transition to
* (allow enough room for one hidden event per word)
*/
VocabIndex newContext[2 * maxWordsPerLine + 3];
/*
* Iterate over all contexts for the previous position in trellis
*/
TrellisIter<VocabContext> prevIter(trellis, pos - 1);
VocabContext prevContext;
LogP prevProb;
while (prevIter.next(prevContext, prevProb)) {
/*
* A non-event token in the previous context is skipped for
* purposes of LM lookup
*/
VocabContext usedPrevContext =
(prevContext[0] == noEventIndex) ?
&prevContext[1] : prevContext;
/*
* transition prob out of previous context to current word;
* skip probability of first word, since it's a constant offset
* for all states and usually infinity if the first word is <s>
*/
LogP wordProb = (pos == 1) ?
LogP_One : lm.wordProb(wids[pos-1], usedPrevContext);
if (wordProb == LogP_Zero) {
/*
* Zero probability due to an OOV
* would equalize the probabilities of all paths
* and make the Viterbi backtrace useless.
*/
wordProb = unkProb;
}
/*
* Set up the extended context. Allow room for adding
* the current word and a new hidden event.
*/
unsigned i;
for (i = 0; i < 2 * maxWordsPerLine &&
usedPrevContext[i] != Vocab_None; i ++)
{
newContext[i + 2] = usedPrevContext[i];
}
newContext[i + 2] = Vocab_None;
newContext[1] = wids[pos-1]; /* prepend current word */
/*
* Iterate over all hidden events
*/
VocabMapIter currIter(map, positionMapped ? pos - 1 : 0);
VocabIndex currEvent;
Prob currProb;
while (currIter.next(currEvent, currProb)) {
LogP localProb = logMap ? currProb : ProbToLogP(currProb);
/*
* Prepend current event to context.
* Note: noEvent is represented here (but no earlier in
* the context).
*/
newContext[0] = currEvent;
/*
* Event probability
*/
LogP eventProb = (currEvent == noEventIndex) ? LogP_One
: lm.wordProb(currEvent, &newContext[1]);
/*
* Truncate context to what is actually used by LM,
* but keep at least one word so we can recover words later.
*/
VocabIndex *usedNewContext =
(currEvent == noEventIndex) ? &newContext[1]
: newContext;
unsigned usedLength;
lm.contextID(usedNewContext, usedLength);
if (usedLength == 0) {
usedLength = 1;
}
VocabIndex truncatedContextWord =
usedNewContext[usedLength];
usedNewContext[usedLength] = Vocab_None;
if (debug >= DEBUG_TRANSITIONS) {
cerr << "BACKWARD POSITION = " << pos
<< " FROM: " << (lm.vocab.use(), prevContext)
<< " TO: " << (lm.vocab.use(), newContext)
<< " WORD = " << lm.vocab.getWord(wids[pos - 1])
<< " WORDPROB = " << wordProb
<< " EVENTPROB = " << eventProb
<< " LOCALPROB = " << localProb
<< endl;
}
trellis.updateBack(prevContext, newContext,
weightLogP(lmw, wordProb + eventProb) +
weightLogP(mapw, localProb));
/*
* Restore newContext
*/
usedNewContext[usedLength] = truncatedContextWord;
}
}
pos --;
}
if (hiddenCounts) {
incrementCounts(*hiddenCounts, wids[0], Vocab_None,
emptyContext, 1.0);
}
/*
* Compute posterior symbol probabilities and extract most
* probable symbol for each position
*/
for (pos = 1; pos <= len; pos ++) {
/*
* Compute symbol probabilities by summing posterior probs
* of all states corresponding to the same symbol
*/
LHash<VocabIndex,LogP2> symbolProbs;
TrellisIter<VocabContext> iter(trellis, pos);
VocabContext context;
LogP forwProb;
while (iter.next(context, forwProb)) {
LogP2 posterior;
if (fwOnly) {
posterior = forwProb;
} else if (fb) {
posterior = forwProb + trellis.getBackLogP(context, pos);
} else {
LogP backmax;
posterior = trellis.getMax(context, pos, backmax);
posterior += backmax;
}
Boolean foundP;
LogP2 *symbolProb = symbolProbs.insert(context[0], foundP);
if (!foundP) {
*symbolProb = posterior;
} else {
*symbolProb = AddLogP(*symbolProb, posterior);
}
}
/*
* Find symbol with highest posterior
*/
LogP2 totalPosterior = LogP_Zero;
LogP2 maxPosterior = LogP_Zero;
VocabIndex bestSymbol = Vocab_None;
LHashIter<VocabIndex,LogP2> symbolIter(symbolProbs);
LogP2 *symbolProb;
VocabIndex symbol;
while (symbolProb = symbolIter.next(symbol)) {
if (bestSymbol == Vocab_None || *symbolProb > maxPosterior) {
bestSymbol = symbol;
maxPosterior = *symbolProb;
}
totalPosterior = AddLogP(totalPosterior, *symbolProb);
}
if (bestSymbol == Vocab_None) {
cerr << "no forward-backward state for position "
<< pos << endl;
return 0;
}
hiddenWids[0][pos - 1] = bestSymbol;
/*
* Print posterior probabilities
*/
if (posteriors) {
cout << lm.vocab.getWord(wids[pos - 1]) << "\t";
/*
* Print events in sorted order
*/
VocabIter symbolIter(map.vocab2, true);
VocabString symbolName;
while (symbolName = symbolIter.next(symbol)) {
LogP2 *symbolProb = symbolProbs.find(symbol);
if (symbolProb != 0) {
LogP2 posterior = *symbolProb - totalPosterior;
cout << " " << symbolName
<< " " << LogPtoProb(posterior);
}
}
cout << endl;
}
/*
* Accumulate hidden posterior counts, if requested
*/
if (hiddenCounts) {
iter.init();
while (iter.next(context, forwProb)) {
LogP2 posterior;
if (fwOnly) {
posterior = forwProb;
} else if (fb) {
posterior = forwProb +
trellis.getBackLogP(context, pos);
} else {
LogP backmax;
posterior = trellis.getMax(context, pos, backmax);
posterior += backmax;
}
posterior -= totalPosterior; /* normalize */
incrementCounts(*hiddenCounts, wids[pos],
findPrevWord(pos, wids, context),
context, LogPtoProb(posterior));
}
}
}
/*
* Return total string probability summing over all paths
*/
totalProb[0] = trellis.sumLogP(len);
hiddenWids[0][len] = Vocab_None;
return 1;
}
}
/*
* create dummy PosVocabMap to enumerate hiddenVocab
*/
/*
 * Fill a dummy PosVocabMap that simply enumerates every event in
 * hiddenVocab at position 0, each with a neutral weight
 * (LogP_One in log-map mode, probability 1 otherwise).
 */
void
makeDummyMap(PosVocabMap &map, Vocab &hiddenVocab)
{
    VocabIndex event;
    VocabIter eventIter(hiddenVocab);

    while (eventIter.next(event)) {
	/* all events live at position 0; the weight is a no-op factor */
	map.put(0, event, logMap ? LogP_One : 1.0);
    }
}
/*
 * Get one input sentence at a time, map it to wids,
 * disambiguate it, and print out the result
 */
/*
 * Read the input one sentence per line, map each sentence to word
 * indices, run hidden-event disambiguation on it, and print the
 * requested output (totals, posteriors, or the decoded event stream).
 *
 * file         -- input stream, one sentence per line
 * hiddenVocab  -- vocabulary of hidden event tokens
 * lm           -- language model used for scoring
 * hiddenCounts -- optional accumulator for fractional event counts
 * numNbest     -- number of N-best hypotheses to produce per sentence
 */
void
disambiguateFile(File &file, SubVocab &hiddenVocab, LM &lm,
		 NgramCounts<NgramFractCount> *hiddenCounts, unsigned numNbest)
{
    /* dummy map: every hidden event allowed at every position */
    PosVocabMap dummyMap(hiddenVocab);
    makeDummyMap(dummyMap, hiddenVocab);

    VocabString sentence[maxWordsPerLine];
    unsigned escapeLen = escape ? strlen(escape) : 0;

    char *line;
    while ((line = file.getline()) != 0) {
	/*
	 * Escaped lines bypass all processing
	 */
	if (escape && strncmp(line, escape, escapeLen) == 0) {
	    cout << line;
	    continue;
	}

	unsigned numWords =
			Vocab::parseWords(line, sentence, maxWordsPerLine);
	if (numWords == maxWordsPerLine) {
	    file.position() << "too many words per sentence\n";
	    continue;
	}

	makeArray(VocabIndex, wids, maxWordsPerLine + 1);
	makeArray(VocabIndex *, hiddenWids, numNbest);
	makeArray(LogP, totalProb, numNbest);

	/* one output buffer per requested hypothesis */
	for (unsigned hyp = 0; hyp < numNbest; hyp++) {
	    hiddenWids[hyp] = new VocabIndex[maxWordsPerLine + 1];
	    assert(hiddenWids[hyp] != 0);
	}

	/* unknown words map to <unk> */
	lm.vocab.getIndices(sentence, wids, maxWordsPerLine,
						    lm.vocab.unkIndex());

	unsigned numHyps =
		disambiguateSentence(wids, hiddenWids, totalProb,
				     dummyMap, lm, hiddenCounts, numNbest);

	if (numHyps == 0) {
	    file.position() << "Disambiguation failed\n";
	} else if (totals) {
	    /* only the total string probability was requested */
	    cout << totalProb[0] << endl;
	} else if (!posteriors) {
	    /* print each hypothesis: words interleaved with events */
	    for (unsigned hyp = 0; hyp < numHyps; hyp++) {
		if (numNbest > 1) {
		    cout << "NBEST_" << hyp << " " << totalProb[hyp] << " ";
		}
		for (unsigned w = 0; hiddenWids[hyp][w] != Vocab_None; w++) {
		    if (w > 0) {
			cout << " ";
		    }
		    /* -keep-unk echoes the original token instead of <unk> */
		    if (keepUnk) {
			cout << sentence[w];
		    } else {
			cout << lm.vocab.getWord(wids[w]);
		    }
		    /* emit the hidden event unless it is the no-event token */
		    if (hiddenWids[hyp][w] != noEventIndex) {
			cout << " " << lm.vocab.getWord(hiddenWids[hyp][w]);
		    }
		}
		cout << endl;
	    }
	}

	for (unsigned hyp = 0; hyp < numNbest; hyp++) {
	    delete [] hiddenWids[hyp];
	}
    }
}
/*
* Read entire file ignoring line breaks, map it to wids,
* disambiguate it, and print out the result
*/
void
disambiguateFileContinuous(File &file, SubVocab &hiddenVocab, LM &lm,
NgramCounts<NgramFractCount> *hiddenCounts,
unsigned numNbest)
{
PosVocabMap dummyMap(hiddenVocab);
makeDummyMap(dummyMap, hiddenVocab);
char *line;
Array<VocabIndex> wids;
unsigned escapeLen = escape ? strlen(escape) : 0;
unsigned lineStart = 0; // index into the above to mark the offset for the
// current line's data
while (line = file.getline()) {
/*
* Pass escaped lines through unprocessed
* (although this is pretty useless in continuous mode)
*/
if (escape && strncmp(line, escape, escapeLen) == 0) {
cout << line;
continue;
}
VocabString words[maxWordsPerLine];
unsigned numWords =
Vocab::parseWords(line, words, maxWordsPerLine);
if (numWords == maxWordsPerLine) {
file.position() << "too many words per line\n";
} else {
// This effectively allocates more space
wids[lineStart + numWords] = Vocab_None;
lm.vocab.getIndices(words, &wids[lineStart], numWords,
lm.vocab.unkIndex());
lineStart += numWords;
}
}
if (lineStart == 0) {
// empty input -- nothing to do
return;
}
makeArray(VocabIndex *, hiddenWids, numNbest);
makeArray(LogP, totalProb, numNbest);
for (unsigned n = 0; n < numNbest; n++) {
hiddenWids[n] = new VocabIndex[lineStart + 1];
assert(hiddenWids[n] != 0);
}
unsigned numHyps =
disambiguateSentence(&wids[0], hiddenWids, totalProb,
dummyMap, lm, hiddenCounts, numNbest);
if (!numHyps) {
file.position() << "Disambiguation failed\n";
} else if (totals) {
cout << totalProb << endl;
} else if (!posteriors) {
for (unsigned n = 0; n < numHyps; n++) {
if (numNbest > 1) {
cout << "NBEST_" << n << " " << totalProb[n] << " ";
}
for (unsigned i = 0; hiddenWids[n][i] != Vocab_None; i ++) {
// XXX: keepUnk not implemented yet.
	    /* [transcription note: the remainder of
	     * disambiguateFileContinuous was truncated here; the
	     * code-viewer page text (keyboard-shortcut help) that
	     * followed has been removed.]
	     */