hiddenngram.cc
/*
* Cache the context words for later shortcut processing
*/
savedContext[savedLength ++] = currWord;
}
const HiddenVocabProps &currWordProps = getProps(currWord);
const VocabIndex *currContext = &context[prefix];
/*
* Set up new context to transition to
* (allow enough room for one hidden event per word)
*/
VocabIndex newContext[2 * maxWordsPerLine + 1];
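/*
* Layout sketch (contexts are stored most-recent-word-first):
*   newContext[0]   - slot for the current word
*   newContext[1]   - slot for a hidden event or inserted token
*   newContext[2..] - copy of the previous state's context (Vocab_None-terminated)
*/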
/*
* Iterate over all contexts for the previous position in trellis
*/
TrellisIter<HiddenNgramState> prevIter(trellis, pos - 1);
HiddenNgramState prevState;
LogP prevProb;
while (prevIter.next(prevState, prevProb)) {
VocabContext prevContext = prevState.context;
/*
* Set up the extended context. Allow room for adding
* the current word and a new hidden event.
*/
unsigned i;
for (i = 0; i < 2 * maxWordsPerLine &&
prevContext[i] != Vocab_None; i ++)
{
newContext[i + 2] = prevContext[i];
}
newContext[i + 2] = Vocab_None;
unsigned prevContextLength = i;
/*
* Iterate over all hidden events
*/
LHashIter<VocabIndex, HiddenVocabProps> eventIter(vocabProps);
VocabIndex currEvent;
HiddenVocabProps *currEventProps;
while ((currEventProps = eventIter.next(currEvent))) {
/*
* Observable events are dealt with as regular words in
* the input string
*/
if (currEventProps->isObserved) {
continue;
}
/*
* While repeating words, all events other than the no-event are disallowed
*/
if (prevState.repeatFrom > 0 && currEvent != noEventIndex) {
continue;
}
/*
* Prepend current event and word to context
*/
VocabIndex *startNewContext = &newContext[2];
/*
* Add event to context unless it's omissible
*/
if (!(currEventProps->omitFromContext)) {
startNewContext --;
startNewContext[0] = currEvent;
}
VocabIndex *startWordContext = startNewContext;
LogP eventProb;
LogP wordProb;
unsigned repeatFrom;
VocabIndex savedContextWord = Vocab_None;
/*
* Check if we're repeating words, either by virtue of
* the current event or a pending repeat
*/
if (prevState.repeatFrom > 0) {
repeatFrom = prevState.repeatFrom;
} else if (currEventProps->repeatWords > 0) {
repeatFrom = currEventProps->repeatWords;
} else {
repeatFrom = 0;
}
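/*
* repeatFrom is the distance back into the previous context of the
* next word expected to be repeated; it counts down as repeated
* words are matched. A pending repeat from the previous state takes
* precedence over one newly triggered by the current event.
*/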
/*
* Manipulate context for current word for special
* disfluency-type events
*/
if (repeatFrom > 0) {
/*
* If the repeated word doesn't match the current word
* we can skip extending this state.
* Note: we don't allow repeats to apply to <unk>!
*/
if (repeatFrom > prevContextLength ||
currWord != prevContext[repeatFrom - 1] ||
currWord == vocab.unkIndex())
{
continue;
}
/*
* If we're extending a previous repeat event there is no further
* charge; otherwise, use the prob of the repeat event itself.
*/
if (prevState.repeatFrom > 0) {
eventProb = LogP_One;
} else {
eventProb = Ngram::wordProb(currEvent, prevContext);
}
/*
* There is never a charge for the repeated word
*/
wordProb = LogP_One;
repeatFrom --;
} else {
if (currEventProps->insertWord != Vocab_None) {
/*
* We cannot leave both the event itself and an inserted tag
* in the context. Overwrite the former if necessary.
*/
if (currEventProps->omitFromContext) {
startNewContext --;
}
/*
* Insert designated token
*/
startNewContext[0] = currEventProps->insertWord;
}
/*
* Delete specified number of words from context
*/
unsigned i;
for (i = 0; i < currEventProps->deleteWords; i ++) {
if (startNewContext[0] == Vocab_None ||
startNewContext[0] == vocab.ssIndex())
{
break;
}
startNewContext ++;
}
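/*
* Advancing the start pointer drops the most recent context words,
* since contexts are stored most-recent-word-first.
*/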
/*
* Eliminate deletion events that would go past the
* start of the sentence
*/
if (i < currEventProps->deleteWords) {
continue;
}
/*
* Add the current word to the new context unless it's omissible.
* Since the position we're storing the new word in may actually be
* part of the old context, we need to save the old contents at that
* position so we can restore them later (for the next run through
* this loop).
*/
if (!(currWordProps.omitFromContext)) {
startNewContext --;
savedContextWord = startNewContext[0];
startNewContext[0] = currWord;
}
/*
* Event probability
*/
eventProb = (currEvent == noEventIndex) ? LogP_One
: Ngram::wordProb(currEvent, prevContext);
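/*
* Note that hidden events are scored by the same underlying ngram
* model, conditioned on the unmodified previous context.
*/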
/*
* Transition prob out of previous context to current word.
* Put underlying Ngram in "running" state (for debugging)
* only when processing a "no-event" context.
*/
if (prefix == 0 && prevState.event == noEventIndex &&
currEvent == noEventIndex)
{
running(wasRunning);
}
wordProb = Ngram::wordProb(currWord, startWordContext);
if (prefix == 0 && prevState.event == noEventIndex &&
currEvent == noEventIndex)
{
running(false);
}
if (wordProb != LogP_Zero) {
havePosProb = true;
}
}
/*
* Truncate context to what is actually used by LM,
* but keep at least one word so we can recover words later.
* When inside a repeat make sure we retain the words to be
* repeated.
*/
unsigned usedLength;
Ngram::contextID(Vocab_None, startNewContext, usedLength);
if (usedLength < repeatFrom) {
assert(repeatFrom < prevContextLength);
usedLength = repeatFrom;
} else if (usedLength == 0) {
usedLength = 1;
}
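/*
* Remember the word overwritten by the Vocab_None terminator so the
* buffer can be restored after the trellis update below.
*/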
VocabIndex truncatedContextWord = startNewContext[usedLength];
startNewContext[usedLength] = Vocab_None;
HiddenNgramState newState;
newState.context = startNewContext;
newState.repeatFrom = repeatFrom;
newState.event = currEvent;
if (debug(DEBUG_TRANSITIONS)) {
cerr << "POSITION = " << pos
<< " FROM: " << (vocab.use(), prevState)
<< " TO: " << (vocab.use(), newState)
<< " WORD = " << vocab.getWord(currWord)
<< " EVENT = " << vocab.getWord(currEvent)
<< " EVENTPROB = " << eventProb
<< " WORDPROB = " << wordProb
<< endl;
}
/*
* For efficiency reasons we don't update the trellis
* when at the final word. In that case we just record
* the total probability.
*/
if (prefix > 0 || debug(DEBUG_PRINT_VITERBI)) {
trellis.update(prevState, newState, eventProb + wordProb);
}
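/*
* At the final position (prefix == 0) accumulate the total prefix
* probability by summing over all incoming states and events.
*/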
if (prefix == 0) {
logSum = (LogP)AddLogP(logSum, prevProb + eventProb + wordProb);
}
/*
* Restore newContext
*/
if (savedContextWord != Vocab_None) {
startNewContext[0] = savedContextWord;
}
startNewContext[usedLength] = truncatedContextWord;
}
}
/*
* Set noevent state probability to the previous total prefix
* probability if the current word had probability zero in all
* states, and we are not yet at the end of the prefix.
* This allows us to compute conditional probs based on
* truncated contexts, and to compute the total sentence probability
* leaving out the OOVs, as required by sentenceProb().
*/
if (prefix > 0 && !havePosProb) {
VocabIndex emptyContext[3];
emptyContext[0] = noEventIndex;
emptyContext[1] = currWord;
emptyContext[2] = Vocab_None;
HiddenNgramState emptyState;
emptyState.context = emptyContext;
emptyState.repeatFrom = 0;
emptyState.event = noEventIndex;
trellis.init(pos);
trellis.setProb(emptyState, trellis.sumLogP(pos - 1));
if (currWord == vocab.unkIndex()) {
stats.numOOVs ++;
} else {
stats.zeroProbs ++;
}
}
trellis.step();
prevPos = pos;
}
running(wasRunning);
if (prevPos > 0) {
contextProb = trellis.sumLogP(prevPos - 1);
} else {
contextProb = LogP_One;
}
return logSum;
}
/*
* The conditional word probability is computed as
*	p(w1 ... wk) / p(w1 ... w(k-1))
*/
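/*
* (In log space this is simply prefixProb(w1 ... wk) minus
* prefixProb(w1 ... w(k-1)), computed below as pProb - cProb.)
*/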
LogP
HiddenNgram::wordProb(VocabIndex word, const VocabIndex *context)
{
if (notHidden) {
/*
* In "nothidden" we assume that we are processing a token stream
* that contains overt event tokens. Hence we give event tokens
* probability zero (so they don't contribute to the perplexity),
* and we scale non-event token probabilities by
* 1/(1-sum of all event probs)
*/
if (hiddenVocab.getWord(word) != 0) {
if (running() && debug(DEBUG_NGRAM_HITS)) {
dout() << "[event]";
}
return LogP_Zero;
} else {
LogP totalWordProb = LogP_One;
VocabIter eventIter(hiddenVocab);
VocabIndex event;
Boolean wasRunning = running(false);
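/*
* In log space the scaling amounts to
*	log p(w | context) - log(1 - sum over events e of p(e | context)),
* with the sum accumulated via SubLogP starting from LogP_One.
*/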
while (eventIter.next(event)) {
totalWordProb = (LogP)SubLogP(totalWordProb,
Ngram::wordProb(event, context));
}
running(wasRunning);
return Ngram::wordProb(word, context) - totalWordProb;
}
} else {
/*
* Standard hidden event mode: do that dynamic programming thing...
*/
LogP cProb;
TextStats stats;
LogP pProb = prefixProb(word, context, cProb, stats);
return pProb - cProb;
}
}
LogP
HiddenNgram::wordProbRecompute(VocabIndex word, const VocabIndex *context)
{
if (notHidden) {
return wordProb(word, context);
} else {
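/*
* Passing a null context lets prefixProb() fall back on the context
* words cached for "shortcut processing" (see above), i.e., the
* probability is recomputed for the same context as the preceding call.
*/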
LogP cProb;
TextStats stats;
LogP pProb = prefixProb(word, 0, cProb, stats);
return pProb - cProb;
}
}
/*
* Sentence probabilities from indices
* This version computes the result directly using prefixProb to
* avoid recomputing prefix probs for each prefix.
*/
LogP
HiddenNgram::sentenceProb(const VocabIndex *sentence, TextStats &stats)
{
unsigned int len = vocab.length(sentence);
LogP totalProb;
/*
* The debugging machinery is not duplicated here, so just fall back
* on the general code for that.
*/
if (notHidden || debug(DEBUG_PRINT_WORD_PROBS)) {
totalProb = Ngram::sentenceProb(sentence, stats);
} else {
/*
* Contexts are represented most-recent-word-first.
* Also, we have to prepend the sentence-begin token,
* and append the sentence-end token.
*/
makeArray(VocabIndex, reversed, len + 2 + 1);
len = prepareSentence(sentence, reversed, len);
/*
* Invalidate cache (for efficiency only)
*/
savedLength = 0;
LogP contextProb;
totalProb = prefixProb(reversed[0], reversed + 1, contextProb, stats);
/*
* OOVs and zeroProbs are updated by prefixProb()
*/
stats.numSentences ++;
stats.prob += totalProb;
stats.numWords += len;
}
if (debug(DEBUG_PRINT_VITERBI)) {
len = trellis.where();
makeArray(HiddenNgramState, bestStates, len);
if (trellis.viterbi(bestStates, len) == 0) {
dout() << "Viterbi backtrace failed\n";
} else {
dout() << "Hidden events:";
for (unsigned i = 1; i < len; i ++) {
dout() << " " << vocab.getWord(bestStates[i].event);
}
dout() << endl;
}
}
return totalProb;
}