/*
 * hidden-ngram.cc --
 *	Recover hidden word-level events using a hidden-event N-gram model.
 *	(Excerpt; non-source viewer text removed during transcription.)
 */
TrellisIter<VocabContext> iter(trellis, pos);
VocabContext context;
LogP prob;
while (iter.next(context, prob)) {
trellis.setBackProb(context, LogP_One);
}
}
while (pos > 0) {
trellis.stepBack();
/*
* Set up new context to transition to
* (allow enough room for one hidden event per word)
*/
VocabIndex newContext[2 * maxWordsPerLine + 3];
/*
* Iterate over all contexts for the previous position in trellis
*/
TrellisIter<VocabContext> prevIter(trellis, pos - 1);
VocabContext prevContext;
LogP prevProb;
while (prevIter.next(prevContext, prevProb)) {
/*
* A non-event token in the previous context is skipped for
* purposes of LM lookup
*/
VocabContext usedPrevContext =
(prevContext[0] == noEventIndex) ?
&prevContext[1] : prevContext;
/*
* transition prob out of previous context to current word;
* skip probability of first word, since it's a constant offset
* for all states and usually infinity if the first word is <s>
*/
LogP wordProb = (pos == 1) ?
LogP_One : lm.wordProb(wids[pos-1], usedPrevContext);
if (wordProb == LogP_Zero) {
/*
* Zero probability due to an OOV
* would equalize the probabilities of all paths
* and make the Viterbi backtrace useless.
*/
wordProb = unkProb;
}
/*
* Set up the extended context. Allow room for adding
* the current word and a new hidden event.
*/
unsigned i;
for (i = 0; i < 2 * maxWordsPerLine &&
usedPrevContext[i] != Vocab_None; i ++)
{
newContext[i + 2] = usedPrevContext[i];
}
newContext[i + 2] = Vocab_None;
newContext[1] = wids[pos-1]; /* prepend current word */
/*
* Iterate over all hidden events
*/
VocabMapIter currIter(map, positionMapped ? pos - 1 : 0);
VocabIndex currEvent;
Prob currProb;
while (currIter.next(currEvent, currProb)) {
LogP localProb = logMap ? currProb : ProbToLogP(currProb);
/*
* Prepend current event to context.
* Note: noEvent is represented here (but no earlier in
* the context).
*/
newContext[0] = currEvent;
/*
* Event probability
*/
LogP eventProb = (currEvent == noEventIndex) ? LogP_One
: lm.wordProb(currEvent, &newContext[1]);
/*
* Truncate context to what is actually used by LM,
* but keep at least one word so we can recover words later.
*/
VocabIndex *usedNewContext =
(currEvent == noEventIndex) ? &newContext[1]
: newContext;
unsigned usedLength;
lm.contextID(usedNewContext, usedLength);
if (usedLength == 0) {
usedLength = 1;
}
VocabIndex truncatedContextWord =
usedNewContext[usedLength];
usedNewContext[usedLength] = Vocab_None;
if (debug >= DEBUG_TRANSITIONS) {
cerr << "BACKWARD POSITION = " << pos
<< " FROM: " << (lm.vocab.use(), prevContext)
<< " TO: " << (lm.vocab.use(), newContext)
<< " WORD = " << lm.vocab.getWord(wids[pos - 1])
<< " WORDPROB = " << wordProb
<< " EVENTPROB = " << eventProb
<< " LOCALPROB = " << localProb
<< endl;
}
trellis.updateBack(prevContext, newContext,
weightLogP(lmw, wordProb + eventProb) +
weightLogP(mapw, localProb));
/*
* Restore newContext
*/
usedNewContext[usedLength] = truncatedContextWord;
}
}
pos --;
}
if (hiddenCounts) {
incrementCounts(*hiddenCounts, wids[0], Vocab_None,
emptyContext, 1.0);
}
/*
* Compute posterior symbol probabilities and extract most
* probable symbol for each position
*/
for (pos = 1; pos <= len; pos ++) {
/*
* Compute symbol probabilities by summing posterior probs
* of all states corresponding to the same symbol
*/
LHash<VocabIndex,LogP2> symbolProbs;
TrellisIter<VocabContext> iter(trellis, pos);
VocabContext context;
LogP forwProb;
while (iter.next(context, forwProb)) {
LogP2 posterior;
if (fwOnly) {
posterior = forwProb;
} else if (fb) {
posterior = forwProb + trellis.getBackLogP(context, pos);
} else {
LogP backmax;
posterior = trellis.getMax(context, pos, backmax);
posterior += backmax;
}
Boolean foundP;
LogP2 *symbolProb = symbolProbs.insert(context[0], foundP);
if (!foundP) {
*symbolProb = posterior;
} else {
*symbolProb = AddLogP(*symbolProb, posterior);
}
}
/*
* Find symbol with highest posterior
*/
LogP2 totalPosterior = LogP_Zero;
LogP2 maxPosterior = LogP_Zero;
VocabIndex bestSymbol = Vocab_None;
LHashIter<VocabIndex,LogP2> symbolIter(symbolProbs);
LogP2 *symbolProb;
VocabIndex symbol;
while (symbolProb = symbolIter.next(symbol)) {
if (bestSymbol == Vocab_None || *symbolProb > maxPosterior) {
bestSymbol = symbol;
maxPosterior = *symbolProb;
}
totalPosterior = AddLogP(totalPosterior, *symbolProb);
}
if (bestSymbol == Vocab_None) {
cerr << "no forward-backward state for position "
<< pos << endl;
return 0;
}
hiddenWids[0][pos - 1] = bestSymbol;
/*
* Print posterior probabilities
*/
if (posteriors) {
cout << lm.vocab.getWord(wids[pos - 1]) << "\t";
/*
* Print events in sorted order
*/
VocabIter symbolIter(map.vocab2, true);
VocabString symbolName;
while (symbolName = symbolIter.next(symbol)) {
LogP2 *symbolProb = symbolProbs.find(symbol);
if (symbolProb != 0) {
LogP2 posterior = *symbolProb - totalPosterior;
cout << " " << symbolName
<< " " << LogPtoProb(posterior);
}
}
cout << endl;
}
/*
* Accumulate hidden posterior counts, if requested
*/
if (hiddenCounts) {
iter.init();
while (iter.next(context, forwProb)) {
LogP2 posterior;
if (fwOnly) {
posterior = forwProb;
} else if (fb) {
posterior = forwProb +
trellis.getBackLogP(context, pos);
} else {
LogP backmax;
posterior = trellis.getMax(context, pos, backmax);
posterior += backmax;
}
posterior -= totalPosterior; /* normalize */
incrementCounts(*hiddenCounts, wids[pos],
findPrevWord(pos, wids, context),
context, LogPtoProb(posterior));
}
}
}
/*
* Return total string probability summing over all paths
*/
totalProb[0] = trellis.sumLogP(len);
hiddenWids[0][len] = Vocab_None;
return 1;
}
}
/*
* create dummy PosVocabMap to enumerate hiddenVocab
*/
/*
 * Fill a dummy PosVocabMap that simply enumerates every event in
 * hiddenVocab at position 0, each with a neutral weight
 * (LogP_One in log-map mode, probability 1 otherwise).
 */
void
makeDummyMap(PosVocabMap &map, Vocab &hiddenVocab)
{
    VocabIndex event;
    VocabIter eventIter(hiddenVocab);

    while (eventIter.next(event)) {
	/* all events live at position 0; the weight is a no-op factor */
	map.put(0, event, logMap ? LogP_One : 1.0);
    }
}
/*
 * Get one input sentence at a time, map it to wids,
 * disambiguate it, and print out the result
 */
/*
 * Read the input one sentence per line, map each sentence to word
 * indices, run hidden-event disambiguation on it, and print the
 * requested output (totals, posteriors, or the decoded event stream).
 *
 * file         -- input stream, one sentence per line
 * hiddenVocab  -- vocabulary of hidden event tokens
 * lm           -- language model used for scoring
 * hiddenCounts -- optional accumulator for fractional event counts
 * numNbest     -- number of N-best hypotheses to produce per sentence
 */
void
disambiguateFile(File &file, SubVocab &hiddenVocab, LM &lm,
		 NgramCounts<NgramFractCount> *hiddenCounts, unsigned numNbest)
{
    /* dummy map: every hidden event allowed at every position */
    PosVocabMap dummyMap(hiddenVocab);
    makeDummyMap(dummyMap, hiddenVocab);

    VocabString sentence[maxWordsPerLine];
    unsigned escapeLen = escape ? strlen(escape) : 0;

    char *line;
    while ((line = file.getline()) != 0) {
	/*
	 * Escaped lines bypass all processing
	 */
	if (escape && strncmp(line, escape, escapeLen) == 0) {
	    cout << line;
	    continue;
	}

	unsigned numWords =
			Vocab::parseWords(line, sentence, maxWordsPerLine);
	if (numWords == maxWordsPerLine) {
	    file.position() << "too many words per sentence\n";
	    continue;
	}

	makeArray(VocabIndex, wids, maxWordsPerLine + 1);
	makeArray(VocabIndex *, hiddenWids, numNbest);
	makeArray(LogP, totalProb, numNbest);

	/* one output buffer per requested hypothesis */
	for (unsigned hyp = 0; hyp < numNbest; hyp++) {
	    hiddenWids[hyp] = new VocabIndex[maxWordsPerLine + 1];
	    assert(hiddenWids[hyp] != 0);
	}

	/* unknown words map to <unk> */
	lm.vocab.getIndices(sentence, wids, maxWordsPerLine,
						    lm.vocab.unkIndex());

	unsigned numHyps =
		disambiguateSentence(wids, hiddenWids, totalProb,
				     dummyMap, lm, hiddenCounts, numNbest);

	if (numHyps == 0) {
	    file.position() << "Disambiguation failed\n";
	} else if (totals) {
	    /* only the total string probability was requested */
	    cout << totalProb[0] << endl;
	} else if (!posteriors) {
	    /* print each hypothesis: words interleaved with events */
	    for (unsigned hyp = 0; hyp < numHyps; hyp++) {
		if (numNbest > 1) {
		    cout << "NBEST_" << hyp << " " << totalProb[hyp] << " ";
		}
		for (unsigned w = 0; hiddenWids[hyp][w] != Vocab_None; w++) {
		    if (w > 0) {
			cout << " ";
		    }
		    /* -keep-unk echoes the original token instead of <unk> */
		    if (keepUnk) {
			cout << sentence[w];
		    } else {
			cout << lm.vocab.getWord(wids[w]);
		    }
		    /* emit the hidden event unless it is the no-event token */
		    if (hiddenWids[hyp][w] != noEventIndex) {
			cout << " " << lm.vocab.getWord(hiddenWids[hyp][w]);
		    }
		}
		cout << endl;
	    }
	}

	for (unsigned hyp = 0; hyp < numNbest; hyp++) {
	    delete [] hiddenWids[hyp];
	}
    }
}
/*
* Read entire file ignoring line breaks, map it to wids,
* disambiguate it, and print out the result
*/
void
disambiguateFileContinuous(File &file, SubVocab &hiddenVocab, LM &lm,
NgramCounts<NgramFractCount> *hiddenCounts,
unsigned numNbest)
{
PosVocabMap dummyMap(hiddenVocab);
makeDummyMap(dummyMap, hiddenVocab);
char *line;
Array<VocabIndex> wids;
unsigned escapeLen = escape ? strlen(escape) : 0;
unsigned lineStart = 0; // index into the above to mark the offset for the
// current line's data
while (line = file.getline()) {
/*
* Pass escaped lines through unprocessed
* (although this is pretty useless in continuous mode)
*/
if (escape && strncmp(line, escape, escapeLen) == 0) {
cout << line;
continue;
}
VocabString words[maxWordsPerLine];
unsigned numWords =
Vocab::parseWords(line, words, maxWordsPerLine);
if (numWords == maxWordsPerLine) {
file.position() << "too many words per line\n";
} else {
// This effectively allocates more space
wids[lineStart + numWords] = Vocab_None;
lm.vocab.getIndices(words, &wids[lineStart], numWords,
lm.vocab.unkIndex());
lineStart += numWords;
}
}
if (lineStart == 0) {
// empty input -- nothing to do
return;
}
makeArray(VocabIndex *, hiddenWids, numNbest);
makeArray(LogP, totalProb, numNbest);
for (unsigned n = 0; n < numNbest; n++) {
hiddenWids[n] = new VocabIndex[lineStart + 1];
assert(hiddenWids[n] != 0);
}
unsigned numHyps =
disambiguateSentence(&wids[0], hiddenWids, totalProb,
dummyMap, lm, hiddenCounts, numNbest);
if (!numHyps) {
file.position() << "Disambiguation failed\n";
} else if (totals) {
cout << totalProb << endl;
} else if (!posteriors) {
for (unsigned n = 0; n < numHyps; n++) {
if (numNbest > 1) {
cout << "NBEST_" << n << " " << totalProb[n] << " ";
}
for (unsigned i = 0; hiddenWids[n][i] != Vocab_None; i ++) {
// XXX: keepUnk not implemented yet.
	    /* [transcription note: the remainder of
	     * disambiguateFileContinuous was truncated here; the
	     * code-viewer page text (keyboard-shortcut help) that
	     * followed has been removed.]
	     */