/* hmmofngrams.cc */
/*
 * Save these for possible recomputation with a different
 * word argument, using the same context.
 */
prevContext = context;
prevPos = 0;
/*
* Initialization:
* The 0 column corresponds to the <s> prefix, and we are in the
* INITIAL state.
*/
if (len > 0 && context[len - 1] == vocab.ssIndex()) {
prefix = len - 1;
} else {
prefix = len;
}
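/*
 * Example (for illustration): contexts are most-recent-word-first, so
 * scoring "c" in "<s> a b c" passes context = { "b", "a", "<s>" }.
 * context[len - 1] is then <s> and prefix points at it, matching the
 * 0 column of the DP; without an explicit <s>, prefix == len and the
 * DP starts from an empty conditioning context instead.
 */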
/*
* Start the DP from scratch if the context has fewer than two items
* (including <s>). This prevents the trellis from accumulating states
* over multiple sentences (which computes the right thing, but
* slowly).
*/
if (len > 1 &&
savedLength > 0 && savedContext[0] == context[prefix])
{
/*
* Skip to the position where the saved
* context diverges from the new context.
*/
for (pos = 1;
pos < savedLength && prefix > 0 &&
context[prefix - 1] == savedContext[pos];
pos ++, prefix --)
{
prevPos = pos;
}
savedLength = pos;
trellis.init(pos);
} else {
/*
* Start a DP from scratch
*/
trellis.init();
trellis.setProb(initialState, LogP_One);
trellis.step();
savedContext[0] = context[prefix];
savedLength = 1;
pos = 1;
}
}
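/*
 * Example of the shortcut (for illustration): when wordProb() is
 * called word by word over "<s> a b c", the call for "c" arrives with
 * context = { "b", "a", "<s>" }.  The context saved by the previous
 * call already covers "<s> a", so the loop in the cached branch above
 * advances pos past those columns and the DP below recomputes only
 * the columns for "b" and "c" rather than the whole prefix.
 */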
for ( ; prefix >= 0; pos++, prefix--) {
VocabIndex currWord;
if (prefix == 0) {
currWord = word;
} else {
currWord = context[prefix - 1];
/*
* Cache the context words for later shortcut processing
*/
savedContext[savedLength ++] = currWord;
}
const VocabIndex *currContext = &context[prefix];
/*
* Keep track of the fact that at least one state has positive
* probability, needed for handling of OOVs below.
*/
Boolean havePosProb = false;
TrellisIter<HMMIndex> prevIter(trellis, pos - 1);
HMMIndex prevIndex;
LogP prevProb;
while (prevIter.next(prevIndex, prevProb)) {
HMMState *prevState = states.find(prevIndex);
assert(prevState != 0);
/*
* Stay in the same state, just emit a new word.
* (except in the initial and final states, which don't have
* LMs attached).
* Also, the </s> token cannot be emitted this way; it has
* to be matched by the FINAL state.
*/
if (currWord != vocab.seIndex() && prevState->ngram) {
LogP wProb =
prevState->ngram->wordProb(currWord, currContext);
trellis.update(prevIndex, prevIndex, wProb);
if (wProb != LogP_Zero) {
havePosProb = true;
}
if (debug(DEBUG_TRANSITIONS)) {
cerr << "position = " << pos
<< " from: " << stateVocab.getWord(prevIndex)
<< " to: " << stateVocab.getWord(prevIndex)
<< " word: " << vocab.getWord(currWord)
<< " wprob = " << wProb
<< endl;
}
}
/*
* Probability of exiting the current state is the
* </s> probability in the associated model.
* NOTE: We defer computing this until we actually need it
* to avoid work for states that don't permit transitions.
*/
LogP eosProb;
Boolean haveEosProb = false;
LHashIter<HMMIndex,Prob> transIter(prevState->transitions);
HMMIndex toIndex;
Prob *transProb;
while ((transProb = transIter.next(toIndex)) != 0) {
/*
* Emit </s> in the current state LM,
* transition to next state,
* and emit the current word from that state.
*/
LogP wProb; // word probability in new state
/*
* The </s> token can only be matched by the FINAL state
* and vice-versa.
*/
if (currWord == vocab.seIndex()) {
if (toIndex == finalState) {
wProb = LogP_One;
} else {
continue; // </s> only matches FINAL
}
} else {
if (toIndex == finalState) {
continue; // FINAL only matches </s>
} else {
HMMState *toState = states.find(toIndex);
assert(toState != 0);
/*
* For the overall model to be normalized we
* force each state to emit at least one word.
* This means that the first word emitted by each
* state has its probability scaled by
* 1/(1 - P(</s>)), where P(</s>) is the probability of the empty string.
* (This also means that the only way an empty string
* can be generated is by a direct transition from
* INITIAL to FINAL.)
*/
wProb = toState->ngram->wordProb(currWord, currContext)
- (LogP)SubLogP(LogP_One,
toState->ngram->wordProb(vocab.seIndex(),
currContext));
}
}
if (!haveEosProb) {
eosProb = prevState->ngram ?
prevState->ngram->wordProb(vocab.seIndex(), currContext) :
LogP_One;
haveEosProb = true;
}
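/*
 * In log space, the transition score computed below is
 *
 *   localProb = log P(</s> | prev-state LM)
 *             + log P(prev-state -> to-state)
 *             + log [ P(w | to-state LM) / (1 - P(</s> | to-state LM)) ]
 *
 * where the last term is wProb from above (LogP_One when to-state is
 * FINAL), the first term is LogP_One for the INITIAL state (no LM
 * attached), and all word probabilities are conditioned on currContext.
 */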
LogP localProb = eosProb + ProbToLogP(*transProb) + wProb;
trellis.update(prevIndex, toIndex, localProb);
if (localProb != LogP_Zero) {
havePosProb = true;
}
if (debug(DEBUG_TRANSITIONS)) {
cerr << "position = " << pos
<< " from: " << stateVocab.getWord(prevIndex)
<< " to: " << stateVocab.getWord(toIndex)
<< " word: " << vocab.getWord(currWord)
<< " eosprob = " << eosProb
<< " transprob = " << *transProb
<< " wprob = " << wProb
<< endl;
}
}
}
/*
* Preserve the previous state probs if this iteration has yielded
* prob zero and we are not yet at the end of the prefix.
* This allows us to compute conditional probs based on
* truncated contexts, and to compute the total sentence probability
* leaving out the OOVs, as required by sentenceProb().
*/
if (prefix > 0 && !havePosProb) {
TrellisIter<HMMIndex> sIter(trellis, pos - 1);
HMMIndex index;
LogP prob;
while (sIter.next(index, prob)) {
trellis.update(index, index, LogP_One);
}
if (currWord == vocab.unkIndex()) {
stats.numOOVs ++;
} else {
stats.zeroProbs ++;
}
}
trellis.step();
prevPos = pos;
}
if (debug(DEBUG_STATE_PROBS)) {
TrellisIter<HMMIndex> sIter(trellis, prevPos);
HMMIndex index;
LogP prob;
dout() << endl;
while (sIter.next(index, prob)) {
dout() << "P(" << stateVocab.getWord(index) <<") = "
<< LogPtoProb(prob) << endl;
}
}
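/*
 * The value returned below is the total probability (summed over
 * states) at the final trellis position; contextProb is the same sum
 * one position earlier.  The two thus differ by exactly the last word,
 * and wordProb() below forms the conditional probability as their
 * log difference.
 */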
if (prevPos > 0) {
contextProb = trellis.sumLogP(prevPos - 1);
} else {
contextProb = LogP_One;
}
return trellis.sumLogP(prevPos);
}
/*
* The conditional word probability is computed as
* p(w1 ... wk) / p(w1 ... w(k-1))
*/
LogP
HMMofNgrams::wordProb(VocabIndex word, const VocabIndex *context)
{
LogP cProb;
TextStats stats;
LogP pProb = prefixProb(word, context, cProb, stats);
return pProb - cProb;
}
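/*
 * Illustrative call (hypothetical caller, not part of this file):
 *
 *   VocabIndex context[] = { w2, w1, vocab.ssIndex(), Vocab_None };
 *   LogP lp = hmm.wordProb(w3, context);   // log P(w3 | <s> w1 w2)
 *
 * The context array is most-recent-word-first and Vocab_None-terminated,
 * as assumed by prefixProb() above.
 */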
LogP
HMMofNgrams::wordProbRecompute(VocabIndex word, const VocabIndex *context)
{
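/*
 * A null context tells prefixProb() to reuse the trellis saved by the
 * previous call, recomputing only the last word with the same context.
 */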
LogP cProb;
TextStats stats;
LogP pProb = prefixProb(word, 0, cProb, stats);
return pProb - cProb;
}
/*
* Sentence probabilities from indices
* This version computes the result directly using prefixProb to
* avoid recomputing prefix probs for each prefix.
*/
LogP
HMMofNgrams::sentenceProb(const VocabIndex *sentence, TextStats &stats)
{
unsigned int len = vocab.length(sentence);
LogP totalProb;
/*
* The debugging machinery is not duplicated here, so just fall back
* on the general code for that.
*/
if (debug(DEBUG_PRINT_WORD_PROBS)) {
totalProb = LM::sentenceProb(sentence, stats);
} else {
makeArray(VocabIndex, reversed, len + 2 + 1);
/*
* Contexts are represented most-recent-word-first.
* Also, we have to prepend the sentence-begin token,
* and append the sentence-end token.
*/
len = prepareSentence(sentence, reversed, len);
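/*
 * reversed[] now holds the sentence-end token followed by the sentence
 * in reverse order and the sentence-start token, so the single
 * prefixProb() call below scores the entire sentence including the
 * final </s>.
 */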
/*
* Invalidate cache (for efficiency only)
*/
savedLength = 0;
LogP contextProb;
totalProb = prefixProb(reversed[0], reversed + 1, contextProb, stats);
/*
* OOVs and zeroProbs are updated by prefixProb()
*/
stats.numSentences ++;
stats.prob += totalProb;
stats.numWords += len;
}
if (debug(DEBUG_PRINT_VITERBI)) {
makeArray(HMMIndex, bestStates, len + 2);
if (trellis.viterbi(bestStates, len + 2) == 0) {
dout() << "Viterbi backtrace failed\n";
} else {
dout() << "HMMstates:";
for (unsigned i = 1; i <= len + 1; i ++) {
dout() << " " << stateVocab.getWord(bestStates[i]);
}
dout() << endl;
}
}
return totalProb;
}