/* hmmofngrams.cc */
/*
 * Save these for possible recomputation with a different
 * word argument, using the same context.
 */
prevContext = context;
prevPos = 0;
/*
* Initialization:
* The 0 column corresponds to the <s> prefix, and we are in the
* INITIAL state.
*/
if (len > 0 && context[len - 1] == vocab.ssIndex()) {
prefix = len - 1;
} else {
prefix = len;
}
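/*
 * Example (for illustration): contexts are most-recent-word-first, so
 * scoring "c" in "<s> a b c" passes context = { "b", "a", "<s>" }.
 * context[len - 1] is then <s> and prefix points at it, matching the
 * 0 column of the DP; without an explicit <s>, prefix == len and the
 * DP starts from an empty conditioning context instead.
 */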
/*
* Start the DP from scratch if the context has fewer than two items
* (including <s>). This prevents the trellis from accumulating states
* over multiple sentences (which computes the right thing, but
* slowly).
*/
if (len > 1 &&
savedLength > 0 && savedContext[0] == context[prefix])
{
/*
* Skip to the position where the saved
* context diverges from the new context.
*/
for (pos = 1;
pos < savedLength && prefix > 0 &&
context[prefix - 1] == savedContext[pos];
pos ++, prefix --)
{
prevPos = pos;
}
savedLength = pos;
trellis.init(pos);
} else {
/*
* Start a DP from scratch
*/
trellis.init();
trellis.setProb(initialState, LogP_One);
trellis.step();
savedContext[0] = context[prefix];
savedLength = 1;
pos = 1;
}
}
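/*
 * Example of the shortcut (for illustration): when wordProb() is
 * called word by word over "<s> a b c", the call for "c" arrives with
 * context = { "b", "a", "<s>" }.  The context saved by the previous
 * call already covers "<s> a", so the loop in the cached branch above
 * advances pos past those columns and the DP below recomputes only
 * the columns for "b" and "c" rather than the whole prefix.
 */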
for ( ; prefix >= 0; pos++, prefix--) {
VocabIndex currWord;
if (prefix == 0) {
currWord = word;
} else {
currWord = context[prefix - 1];
/*
* Cache the context words for later shortcut processing
*/
savedContext[savedLength ++] = currWord;
}
const VocabIndex *currContext = &context[prefix];
/*
* Keep track of the fact that at least one state has positive
* probability, needed for handling of OOVs below.
*/
Boolean havePosProb = false;
TrellisIter<HMMIndex> prevIter(trellis, pos - 1);
HMMIndex prevIndex;
LogP prevProb;
while (prevIter.next(prevIndex, prevProb)) {
HMMState *prevState = states.find(prevIndex);
assert(prevState != 0);
/*
* Stay in the same state, just emit a new word.
* (except in the initial and final states, which don't have
* LMs attached).
* Also, the </s> token cannot be emitted this way; it has
* to be matched by the FINAL state.
*/
if (currWord != vocab.seIndex() && prevState->ngram) {
LogP wProb =
prevState->ngram->wordProb(currWord, currContext);
trellis.update(prevIndex, prevIndex, wProb);
if (wProb != LogP_Zero) {
havePosProb = true;
}
if (debug(DEBUG_TRANSITIONS)) {
cerr << "position = " << pos
<< " from: " << stateVocab.getWord(prevIndex)
<< " to: " << stateVocab.getWord(prevIndex)
<< " word: " << vocab.getWord(currWord)
<< " wprob = " << wProb
<< endl;
}
}
/*
* Probability of exiting the current state is the
* </s> probability in the associated model.
* NOTE: We defer computing this until we actually need it
* to avoid work for states that don't permit transitions.
*/
LogP eosProb;
Boolean haveEosProb = false;
LHashIter<HMMIndex,Prob> transIter(prevState->transitions);
HMMIndex toIndex;
Prob *transProb;
while ((transProb = transIter.next(toIndex)) != 0) {
/*
* Emit </s> in the current state LM,
* transition to next state,
* and emit the current word from that state.
*/
LogP wProb; // word probability in new state
/*
* The </s> token can only be matched by the FINAL state
* and vice-versa.
*/
if (currWord == vocab.seIndex()) {
if (toIndex == finalState) {
wProb = LogP_One;
} else {
continue; // </s> only matches FINAL
}
} else {
if (toIndex == finalState) {
continue; // FINAL only matches </s>
} else {
HMMState *toState = states.find(toIndex);
assert(toState != 0);
/*
* For the overall model to be normalized we
* force each state to emit at least one word.
* This means that the first word emitted by each
* state has its probability scaled by
* 1/(1 - P(</s>)), where P(</s>) is the probability of the empty string.
* (This also means that the only way an empty string
* can be generated is by a direct transition from
* INITIAL to FINAL.)
*/
wProb = toState->ngram->wordProb(currWord, currContext)
- (LogP)SubLogP(LogP_One,
toState->ngram->wordProb(vocab.seIndex(),
currContext));
}
}
if (!haveEosProb) {
eosProb = prevState->ngram ?
prevState->ngram->wordProb(vocab.seIndex(), currContext) :
LogP_One;
haveEosProb = true;
}
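/*
 * In log space, the transition score computed below is
 *
 *   localProb = log P(</s> | prev-state LM)
 *             + log P(prev-state -> to-state)
 *             + log [ P(w | to-state LM) / (1 - P(</s> | to-state LM)) ]
 *
 * where the last term is wProb from above (LogP_One when to-state is
 * FINAL), the first term is LogP_One for the INITIAL state (no LM
 * attached), and all word probabilities are conditioned on currContext.
 */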
LogP localProb = eosProb + ProbToLogP(*transProb) + wProb;
trellis.update(prevIndex, toIndex, localProb);
if (localProb != LogP_Zero) {
havePosProb = true;
}
if (debug(DEBUG_TRANSITIONS)) {
cerr << "position = " << pos
<< " from: " << stateVocab.getWord(prevIndex)
<< " to: " << stateVocab.getWord(toIndex)
<< " word: " << vocab.getWord(currWord)
<< " eosprob = " << eosProb
<< " transprob = " << *transProb
<< " wprob = " << wProb
<< endl;
}
}
}
/*
* Preserve the previous state probs if this iteration has yielded
* prob zero and we are not yet at the end of the prefix.
* This allows us to compute conditional probs based on
* truncated contexts, and to compute the total sentence probability
* leaving out the OOVs, as required by sentenceProb().
*/
if (prefix > 0 && !havePosProb) {
TrellisIter<HMMIndex> sIter(trellis, pos - 1);
HMMIndex index;
LogP prob;
while (sIter.next(index, prob)) {
trellis.update(index, index, LogP_One);
}
if (currWord == vocab.unkIndex()) {
stats.numOOVs ++;
} else {
stats.zeroProbs ++;
}
}
trellis.step();
prevPos = pos;
}
if (debug(DEBUG_STATE_PROBS)) {
TrellisIter<HMMIndex> sIter(trellis, prevPos);
HMMIndex index;
LogP prob;
dout() << endl;
while (sIter.next(index, prob)) {
dout() << "P(" << stateVocab.getWord(index) <<") = "
<< LogPtoProb(prob) << endl;
}
}
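/*
 * The value returned below is the total probability (summed over
 * states) at the final trellis position; contextProb is the same sum
 * one position earlier.  The two thus differ by exactly the last word,
 * and wordProb() below forms the conditional probability as their
 * log difference.
 */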
if (prevPos > 0) {
contextProb = trellis.sumLogP(prevPos - 1);
} else {
contextProb = LogP_One;
}
return trellis.sumLogP(prevPos);
}
/*
* The conditional word probability is computed as
* p(w1 ... wk) / p(w1 ... w(k-1))
*/
LogP
HMMofNgrams::wordProb(VocabIndex word, const VocabIndex *context)
{
LogP cProb;
TextStats stats;
LogP pProb = prefixProb(word, context, cProb, stats);
return pProb - cProb;
}
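/*
 * Illustrative call (hypothetical caller, not part of this file):
 *
 *   VocabIndex context[] = { w2, w1, vocab.ssIndex(), Vocab_None };
 *   LogP lp = hmm.wordProb(w3, context);   // log P(w3 | <s> w1 w2)
 *
 * The context array is most-recent-word-first and Vocab_None-terminated,
 * as assumed by prefixProb() above.
 */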
LogP
HMMofNgrams::wordProbRecompute(VocabIndex word, const VocabIndex *context)
{
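/*
 * A null context tells prefixProb() to reuse the trellis saved by the
 * previous call, recomputing only the last word with the same context.
 */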
LogP cProb;
TextStats stats;
LogP pProb = prefixProb(word, 0, cProb, stats);
return pProb - cProb;
}
/*
* Sentence probabilities from indices
* This version computes the result directly using prefixProb to
* avoid recomputing prefix probs for each prefix.
*/
LogP
HMMofNgrams::sentenceProb(const VocabIndex *sentence, TextStats &stats)
{
unsigned int len = vocab.length(sentence);
LogP totalProb;
/*
* The debugging machinery is not duplicated here, so just fall back
* on the general code for that.
*/
if (debug(DEBUG_PRINT_WORD_PROBS)) {
totalProb = LM::sentenceProb(sentence, stats);
} else {
makeArray(VocabIndex, reversed, len + 2 + 1);
/*
* Contexts are represented most-recent-word-first.
* Also, we have to prepend the sentence-begin token,
* and append the sentence-end token.
*/
len = prepareSentence(sentence, reversed, len);
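/*
 * reversed[] now holds the sentence-end token followed by the sentence
 * in reverse order and the sentence-start token, so the single
 * prefixProb() call below scores the entire sentence including the
 * final </s>.
 */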
/*
* Invalidate cache (for efficiency only)
*/
savedLength = 0;
LogP contextProb;
totalProb = prefixProb(reversed[0], reversed + 1, contextProb, stats);
/*
* OOVs and zeroProbs are updated by prefixProb()
*/
stats.numSentences ++;
stats.prob += totalProb;
stats.numWords += len;
}
if (debug(DEBUG_PRINT_VITERBI)) {
makeArray(HMMIndex, bestStates, len + 2);
if (trellis.viterbi(bestStates, len + 2) == 0) {
dout() << "Viterbi backtrace failed\n";
} else {
dout() << "HMMstates:";
for (unsigned i = 1; i <= len + 1; i ++) {
dout() << " " << stateVocab.getWord(bestStates[i]);
}
dout() << endl;
}
}
return totalProb;
}