hiddenngram.cc
/*
* Cache the context words for later shortcut processing
*/
savedContext[savedLength ++] = currWord;
}
const HiddenVocabProps &currWordProps = getProps(currWord);
const VocabIndex *currContext = &context[prefix];
/*
* Set up new context to transition to
* (allow enough room for one hidden event per word)
*/
VocabIndex newContext[2 * maxWordsPerLine + 1];
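/*
* Layout sketch (contexts are stored most-recent-word-first):
*   newContext[0]   - slot for the current word
*   newContext[1]   - slot for a hidden event or inserted token
*   newContext[2..] - copy of the previous state's context (Vocab_None-terminated)
*/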
/*
* Iterate over all contexts for the previous position in trellis
*/
TrellisIter<HiddenNgramState> prevIter(trellis, pos - 1);
HiddenNgramState prevState;
LogP prevProb;
while (prevIter.next(prevState, prevProb)) {
VocabContext prevContext = prevState.context;
/*
* Set up the extended context. Allow room for adding
* the current word and a new hidden event.
*/
unsigned i;
for (i = 0; i < 2 * maxWordsPerLine &&
prevContext[i] != Vocab_None; i ++)
{
newContext[i + 2] = prevContext[i];
}
newContext[i + 2] = Vocab_None;
unsigned prevContextLength = i;
/*
* Iterate over all hidden events
*/
LHashIter<VocabIndex, HiddenVocabProps> eventIter(vocabProps);
VocabIndex currEvent;
HiddenVocabProps *currEventProps;
while ((currEventProps = eventIter.next(currEvent))) {
/*
* Observable events are dealt with as regular words in
* the input string
*/
if (currEventProps->isObserved) {
continue;
}
/*
* While repeating words, all events other than the no-event are disallowed
*/
if (prevState.repeatFrom > 0 && currEvent != noEventIndex) {
continue;
}
/*
* Prepend current event and word to context
*/
VocabIndex *startNewContext = &newContext[2];
/*
* Add event to context unless it's omissible
*/
if (!(currEventProps->omitFromContext)) {
startNewContext --;
startNewContext[0] = currEvent;
}
VocabIndex *startWordContext = startNewContext;
LogP eventProb;
LogP wordProb;
unsigned repeatFrom;
VocabIndex savedContextWord = Vocab_None;
/*
* Check if we're repeating words, either by virtue of
* the current event or a pending repeat
*/
if (prevState.repeatFrom > 0) {
repeatFrom = prevState.repeatFrom;
} else if (currEventProps->repeatWords > 0) {
repeatFrom = currEventProps->repeatWords;
} else {
repeatFrom = 0;
}
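/*
* repeatFrom is the distance back into the previous context of the
* next word expected to be repeated; it counts down as repeated
* words are matched. A pending repeat from the previous state takes
* precedence over one newly triggered by the current event.
*/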
/*
* Manipulate context for current word for special
* disfluency-type events
*/
if (repeatFrom > 0) {
/*
* If the repeated word doesn't match the current word
* we can skip extending this state.
* Note: we don't allow repeats to apply to <unk>!
*/
if (repeatFrom > prevContextLength ||
currWord != prevContext[repeatFrom - 1] ||
currWord == vocab.unkIndex())
{
continue;
}
/*
* If we're extending a previous repeat event there is no further
* charge; otherwise, use the prob of the repeat event itself.
*/
if (prevState.repeatFrom > 0) {
eventProb = LogP_One;
} else {
eventProb = Ngram::wordProb(currEvent, prevContext);
}
/*
* There is never a charge for the repeated word
*/
wordProb = LogP_One;
repeatFrom --;
} else {
if (currEventProps->insertWord != Vocab_None) {
/*
* We cannot leave both the event itself and an inserted tag
* in the context. Overwrite the former if necessary.
*/
if (currEventProps->omitFromContext) {
startNewContext --;
}
/*
* Insert designated token
*/
startNewContext[0] = currEventProps->insertWord;
}
/*
* Delete specified number of words from context
*/
unsigned i;
for (i = 0; i < currEventProps->deleteWords; i ++) {
if (startNewContext[0] == Vocab_None ||
startNewContext[0] == vocab.ssIndex())
{
break;
}
startNewContext ++;
}
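/*
* Advancing the start pointer drops the most recent context words,
* since contexts are stored most-recent-word-first.
*/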
/*
* Eliminate deletion events that would go past the
* start of the sentence
*/
if (i < currEventProps->deleteWords) {
continue;
}
/*
* Add the current word to the new context unless it's omissible.
* Since the position we're storing the new word in may actually be
* part of the old context, we need to save the old contents at that
* position so we can restore them later (for the next run through
* this loop).
*/
if (!(currWordProps.omitFromContext)) {
startNewContext --;
savedContextWord = startNewContext[0];
startNewContext[0] = currWord;
}
/*
* Event probability
*/
eventProb = (currEvent == noEventIndex) ? LogP_One
: Ngram::wordProb(currEvent, prevContext);
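/*
* Note that hidden events are scored by the same underlying ngram
* model, conditioned on the unmodified previous context.
*/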
/*
* Transition prob out of previous context to current word.
* Put underlying Ngram in "running" state (for debugging)
* only when processing a "no-event" context.
*/
if (prefix == 0 && prevState.event == noEventIndex &&
currEvent == noEventIndex)
{
running(wasRunning);
}
wordProb = Ngram::wordProb(currWord, startWordContext);
if (prefix == 0 && prevState.event == noEventIndex &&
currEvent == noEventIndex)
{
running(false);
}
if (wordProb != LogP_Zero) {
havePosProb = true;
}
}
/*
* Truncate context to what is actually used by LM,
* but keep at least one word so we can recover words later.
* When inside a repeat make sure we retain the words to be
* repeated.
*/
unsigned usedLength;
Ngram::contextID(Vocab_None, startNewContext, usedLength);
if (usedLength < repeatFrom) {
assert(repeatFrom < prevContextLength);
usedLength = repeatFrom;
} else if (usedLength == 0) {
usedLength = 1;
}
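/*
* Remember the word overwritten by the Vocab_None terminator so the
* buffer can be restored after the trellis update below.
*/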
VocabIndex truncatedContextWord = startNewContext[usedLength];
startNewContext[usedLength] = Vocab_None;
HiddenNgramState newState;
newState.context = startNewContext;
newState.repeatFrom = repeatFrom;
newState.event = currEvent;
if (debug(DEBUG_TRANSITIONS)) {
cerr << "POSITION = " << pos
<< " FROM: " << (vocab.use(), prevState)
<< " TO: " << (vocab.use(), newState)
<< " WORD = " << vocab.getWord(currWord)
<< " EVENT = " << vocab.getWord(currEvent)
<< " EVENTPROB = " << eventProb
<< " WORDPROB = " << wordProb
<< endl;
}
/*
* For efficiency reasons we don't update the trellis
* when at the final word. In that case we just record
* the total probability.
*/
if (prefix > 0 || debug(DEBUG_PRINT_VITERBI)) {
trellis.update(prevState, newState, eventProb + wordProb);
}
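/*
* At the final position (prefix == 0) accumulate the total prefix
* probability by summing over all incoming states and events.
*/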
if (prefix == 0) {
logSum = (LogP)AddLogP(logSum, prevProb + eventProb + wordProb);
}
/*
* Restore newContext
*/
if (savedContextWord != Vocab_None) {
startNewContext[0] = savedContextWord;
}
startNewContext[usedLength] = truncatedContextWord;
}
}
/*
* Set noevent state probability to the previous total prefix
* probability if the current word had probability zero in all
* states, and we are not yet at the end of the prefix.
* This allows us to compute conditional probs based on
* truncated contexts, and to compute the total sentence probability
* leaving out the OOVs, as required by sentenceProb().
*/
if (prefix > 0 && !havePosProb) {
VocabIndex emptyContext[3];
emptyContext[0] = noEventIndex;
emptyContext[1] = currWord;
emptyContext[2] = Vocab_None;
HiddenNgramState emptyState;
emptyState.context = emptyContext;
emptyState.repeatFrom = 0;
emptyState.event = noEventIndex;
trellis.init(pos);
trellis.setProb(emptyState, trellis.sumLogP(pos - 1));
if (currWord == vocab.unkIndex()) {
stats.numOOVs ++;
} else {
stats.zeroProbs ++;
}
}
trellis.step();
prevPos = pos;
}
running(wasRunning);
if (prevPos > 0) {
contextProb = trellis.sumLogP(prevPos - 1);
} else {
contextProb = LogP_One;
}
return logSum;
}
/*
* The conditional word probability is computed as
*	p(w1 ... wk) / p(w1 ... w(k-1))
*/
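/*
* (In log space this is simply prefixProb(w1 ... wk) minus
* prefixProb(w1 ... w(k-1)), computed below as pProb - cProb.)
*/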
LogP
HiddenNgram::wordProb(VocabIndex word, const VocabIndex *context)
{
if (notHidden) {
/*
* In "nothidden" we assume that we are processing a token stream
* that contains overt event tokens. Hence we give event tokens
* probability zero (so they don't contribute to the perplexity),
* and we scale non-event token probabilities by
* 1/(1-sum of all event probs)
*/
if (hiddenVocab.getWord(word) != 0) {
if (running() && debug(DEBUG_NGRAM_HITS)) {
dout() << "[event]";
}
return LogP_Zero;
} else {
LogP totalWordProb = LogP_One;
VocabIter eventIter(hiddenVocab);
VocabIndex event;
Boolean wasRunning = running(false);
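/*
* In log space the scaling amounts to
*	log p(w | context) - log(1 - sum over events e of p(e | context)),
* with the sum accumulated via SubLogP starting from LogP_One.
*/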
while (eventIter.next(event)) {
totalWordProb = (LogP)SubLogP(totalWordProb,
Ngram::wordProb(event, context));
}
running(wasRunning);
return Ngram::wordProb(word, context) - totalWordProb;
}
} else {
/*
* Standard hidden event mode: do that dynamic programming thing...
*/
LogP cProb;
TextStats stats;
LogP pProb = prefixProb(word, context, cProb, stats);
return pProb - cProb;
}
}
LogP
HiddenNgram::wordProbRecompute(VocabIndex word, const VocabIndex *context)
{
if (notHidden) {
return wordProb(word, context);
} else {
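/*
* Passing a null context lets prefixProb() fall back on the context
* words cached for "shortcut processing" (see above), i.e., the
* probability is recomputed for the same context as the preceding call.
*/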
LogP cProb;
TextStats stats;
LogP pProb = prefixProb(word, 0, cProb, stats);
return pProb - cProb;
}
}
/*
* Sentence probabilities from indices
* This version computes the result directly using prefixProb to
* avoid recomputing prefix probs for each prefix.
*/
LogP
HiddenNgram::sentenceProb(const VocabIndex *sentence, TextStats &stats)
{
unsigned int len = vocab.length(sentence);
LogP totalProb;
/*
* The debugging machinery is not duplicated here, so just fall back
* on the general code for that.
*/
if (notHidden || debug(DEBUG_PRINT_WORD_PROBS)) {
totalProb = Ngram::sentenceProb(sentence, stats);
} else {
/*
* Contexts are represented most-recent-word-first.
* Also, we have to prepend the sentence-begin token,
* and append the sentence-end token.
*/
makeArray(VocabIndex, reversed, len + 2 + 1);
len = prepareSentence(sentence, reversed, len);
/*
* Invalidate cache (for efficiency only)
*/
savedLength = 0;
LogP contextProb;
totalProb = prefixProb(reversed[0], reversed + 1, contextProb, stats);
/*
* OOVs and zeroProbs are updated by prefixProb()
*/
stats.numSentences ++;
stats.prob += totalProb;
stats.numWords += len;
}
if (debug(DEBUG_PRINT_VITERBI)) {
len = trellis.where();
makeArray(HiddenNgramState, bestStates, len);
if (trellis.viterbi(bestStates, len) == 0) {
dout() << "Viterbi backtrace failed\n";
} else {
dout() << "Hidden events:";
for (unsigned i = 1; i < len; i ++) {
dout() << " " << vocab.getWord(bestStates[i].event);
}
dout() << endl;
}
}
return totalProb;
}