📄 lm.cc
/*
* LM.cc --
* Generic LM methods
*
*/
#ifndef lint
static char Copyright[] = "Copyright (c) 1995-2006 SRI International. All Rights Reserved.";
static char RcsId[] = "@(#)$Header: /home/srilm/devel/lm/src/RCS/LM.cc,v 1.49 2006/01/05 08:44:25 stolcke Exp $";
#endif
#include <string.h>
#include <stdlib.h>
#include <math.h>
#include <ctype.h>
#include <assert.h>
extern "C" {
double drand48(); /* might be missing from math.h or stdlib.h */
}
#include "LM.h"
#include "NgramStats.cc"
#include "NBest.h"
#include "Array.cc"
/*
* Debugging levels used in this file
*/
#define DEBUG_PRINT_DOC_PROBS 0
#define DEBUG_PRINT_SENT_PROBS 1
#define DEBUG_PRINT_WORD_PROBS 2
#define DEBUG_PRINT_PROB_SUMS 3
const char *defaultStateTag = "<LMstate>";
/*
* Initialization
* The LM is created with a reference to a Vocab, so various
* LMs and other objects can share one Vocab. The LM will typically
* add words to the Vocab as needed.
*/
LM::LM(Vocab &vocab)
: vocab(vocab), noiseVocab(vocab)
{
_running = false;
reverseWords = false;
stateTag = defaultStateTag;
}
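/*
 * Illustrative construction (a sketch, not part of this file; Ngram is
 * one concrete LM subclass and its (vocab, order) constructor is assumed
 * here):
 *
 *	Vocab vocab;
 *	Ngram lm(vocab, 3);	// trigram LM sharing the same Vocab
 *
 * Several models built this way share a single Vocab instance, so word
 * indices remain comparable across them.
 */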
LM::~LM()
{
}
/*
* Contextual word probabilities from strings
* The default method for word strings looks up the word indices
* for both the word and its context and gets the probability
* from the LM.
*/
LogP
LM::wordProb(VocabString word, const VocabString *context)
{
unsigned int len = vocab.length(context);
makeArray(VocabIndex, cids, len + 1);
vocab.getIndices(context, cids, len + 1, vocab.unkIndex());
LogP prob = wordProb(vocab.getIndex(word, vocab.unkIndex()), cids);
return prob;
}
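/*
 * Illustrative call (a sketch, not part of this file): contexts are
 * passed most-recent-word-first and terminated by a null entry, so
 * p(c | a b) for some LM object "lm" (hypothetical) would be queried as
 *
 *	VocabString context[] = { "b", "a", 0 };
 *	LogP lp = lm.wordProb("c", context);
 *
 * Words not in the vocabulary are mapped to vocab.unkIndex() before the
 * index-based wordProb() is consulted.
 */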
/*
 * Word probability with cached context
 * Recomputes the conditional probability of a word using a context
 * that is guaranteed to be identical to the one in the last call to wordProb.
 * This implementation computes the prob from scratch, but the idea is that
 * other language models can use caches that depend on the context.
 */
LogP
LM::wordProbRecompute(VocabIndex word, const VocabIndex *context)
{
return wordProb(word, context);
}
/*
* Non-word testing
* Returns true for pseudo-word tokens that don't correspond to
* observable events (e.g., context tags or hidden events).
*/
Boolean
LM::isNonWord(VocabIndex word)
{
return vocab.isNonEvent(word);
}
/*
* Total probabilities
* For debugging purposes, compute the sum of all word probs
* in a context.
*/
Prob
LM::wordProbSum(const VocabIndex *context)
{
double total = 0.0;
VocabIter iter(vocab);
VocabIndex wid;
Boolean first = true;
/*
* prob summing interrupts sequential processing mode
*/
Boolean wasRunning = running(false);
while (iter.next(wid)) {
if (!isNonWord(wid)) {
total += LogPtoProb(first ?
wordProb(wid, context) :
wordProbRecompute(wid, context));
first = false;
}
}
running(wasRunning);
return total;
}
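/*
 * Example check (a sketch; "lm" and "context" are hypothetical): for a
 * properly normalized model the sum over all events should be close to 1.
 *
 *	Prob s = lm.wordProbSum(context);	// context as in wordProb()
 *	assert(fabs(s - 1.0) < 0.0001);		// same tolerance as used below
 *
 * sentenceProb() and countsProb() perform exactly this check when debug
 * level DEBUG_PRINT_PROB_SUMS is active.
 */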
/*
* Sentence probabilities from strings
* The default method for sentences of word strings is to translate
* them to word index sequences and get their probability from the LM.
*/
LogP
LM::sentenceProb(const VocabString *sentence, TextStats &stats)
{
unsigned int len = vocab.length(sentence);
makeArray(VocabIndex, wids, len + 1);
vocab.getIndices(sentence, wids, len + 1, vocab.unkIndex());
LogP prob = sentenceProb(wids, stats);
return prob;
}
/*
* Convenience function that reverses a sentence (for wordProb computation),
* adds begin/end sentence tokens, and removes pause and noise tokens.
* It returns the number of words excluding these special tokens.
*/
unsigned
LM::prepareSentence(const VocabIndex *sentence, VocabIndex *reversed,
unsigned len)
{
unsigned i, j = 0;
/*
* Add </s> token if not already there.
*/
if (sentence[reverseWords ? 0 : len - 1] != vocab.seIndex()) {
reversed[j++] = vocab.seIndex();
}
for (i = 1; i <= len; i++) {
VocabIndex word = sentence[reverseWords ? i - 1 : len - i];
if (word == vocab.pauseIndex() || noiseVocab.getWord(word)) {
continue;
}
reversed[j++] = word;
}
/*
* Add <s> token if not already there
*/
if (sentence[reverseWords ? len - 1 : 0] != vocab.ssIndex()) {
reversed[j++] = vocab.ssIndex();
}
reversed[j] = Vocab_None;
return j - 2;
}
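/*
 * Worked example (reverseWords == false): the input sentence
 *
 *	w1 w2 w3
 *
 * becomes the reversed, delimited sequence
 *
 *	</s> w3 w2 w1 <s>
 *
 * and the return value is 3, the number of words excluding <s> and </s>.
 */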
/*
* Convenience function that strips noise and pause tags from a word string
*/
VocabIndex *
LM::removeNoise(VocabIndex *words)
{
unsigned from, to;
for (from = 0, to = 0; words[from] != Vocab_None; from ++) {
if (words[from] != vocab.pauseIndex() &&
!noiseVocab.getWord(words[from]))
{
words[to++] = words[from];
}
}
words[to] = Vocab_None;
return words;
}
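/*
 * Example (sketch): a sequence such as
 *
 *	w1 -pau- w2 [noise] w3
 *
 * is compacted in place to
 *
 *	w1 w2 w3
 *
 * where -pau- stands for vocab.pauseIndex() and [noise] for any word
 * contained in noiseVocab. The same array pointer is returned.
 */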
/*
* Sentence probabilities from indices
* The default method is to accumulate the contextual word
* probabilities including that of the sentence end.
*/
LogP
LM::sentenceProb(const VocabIndex *sentence, TextStats &stats)
{
unsigned int len = vocab.length(sentence);
makeArray(VocabIndex, reversed, len + 2 + 1);
int i;
/*
* Indicate to lm methods that we're in sequential processing
* mode.
*/
Boolean wasRunning = running(true);
/*
* Contexts are represented most-recent-word-first.
* Also, we have to prepend the sentence-begin token,
* and append the sentence-end token.
*/
len = prepareSentence(sentence, reversed, len);
LogP totalProb = 0.0;
unsigned totalOOVs = 0;
unsigned totalZeros = 0;
for (i = len; i >= 0; i--) {
LogP probSum;
if (debug(DEBUG_PRINT_WORD_PROBS)) {
dout() << "\tp( " << vocab.getWord(reversed[i]) << " | "
<< vocab.getWord(reversed[i+1])
<< (i < (int)len ? " ..." : " ") << ") \t= " ;
if (debug(DEBUG_PRINT_PROB_SUMS)) {
/*
* XXX: because wordProb can change the state of the LM
* we need to compute wordProbSum first.
*/
probSum = (LogP)wordProbSum(&reversed[i + 1]);
}
}
LogP prob = wordProb(reversed[i], &reversed[i + 1]);
if (debug(DEBUG_PRINT_WORD_PROBS)) {
dout() << " " << LogPtoProb(prob) << " [ " << prob << " ]";
if (debug(DEBUG_PRINT_PROB_SUMS)) {
dout() << " / " << probSum;
if (fabs(probSum - 1.0) > 0.0001) {
cerr << "\nwarning: word probs for this context sum to "
<< probSum << " != 1 : "
<< (vocab.use(), &reversed[i + 1]) << endl;
}
}
dout() << endl;
}
/*
* If the probability returned is zero but the
* word in question is <unk> we assume this is a closed-vocab
* model and count it as an OOV. (This allows open-vocab
* models to return regular probabilities for <unk>.)
* If this happens and the word is not <unk> then we are
* dealing with a broken language model that returns
* zero probabilities for known words, and we count them
* as "zeroProbs".
*/
if (prob == LogP_Zero) {
if (reversed[i] == vocab.unkIndex()) {
totalOOVs ++;
} else {
totalZeros ++;
}
} else {
totalProb += prob;
}
}
running(wasRunning);
/*
* Update stats with this sentence
*/
stats.numSentences ++;
stats.numWords += len;
stats.numOOVs += totalOOVs;
stats.zeroProbs += totalZeros;
stats.prob += totalProb;
return totalProb;
}
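/*
 * From the accumulated TextStats one can derive the usual perplexity
 * figure (a sketch; the denominator convention below is an assumption,
 * matching how OOVs and zero-prob words are excluded above):
 *
 *	double denom = stats.numWords - stats.numOOVs
 *			- stats.zeroProbs + stats.numSentences;
 *	double ppl = pow(10.0, - stats.prob / denom);
 */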
/*
* Compute joint probability of a word context (a reversed word sequence)
*/
LogP
LM::contextProb(const VocabIndex *context, unsigned clength)
{
unsigned useLength = Vocab::length(context);
LogP jointProb = LogP_One;
if (clength < useLength) {
useLength = clength;
}
/*
* If the context is empty there is nothing left to do: return LogP_One
*/
if (useLength > 0) {
/*
* Context prob computation interrupts sequential processing mode
*/
Boolean wasRunning = running(false);
/*
* The usual hack: truncate the context temporarily
*/
VocabIndex saved = context[useLength];
((VocabIndex *)context)[useLength] = Vocab_None;
/*
* Accumulate conditional probs for all words in context
*/
for (unsigned i = useLength; i > 0; i--) {
VocabIndex word = context[i - 1];
/*
* If we're computing the marginal probability of the unigram
* <s> context we have to look up </s> instead since the former
* has prob = 0.
*/
if (i == useLength && word == vocab.ssIndex()) {
word = vocab.seIndex();
}
LogP wprob = wordProb(word, &context[i]);
/*
* If word is a non-event it has probability zero in the model,
* so the best we can do is to skip it.
* Note that the mapping above replaces <s> (a non-event) with </s>,
* so it will be included.
*/
if (wprob != LogP_Zero || !vocab.isNonEvent(word)) {
jointProb += wprob;
}
}
((VocabIndex *)context)[useLength] = saved;
running(wasRunning);
}
return jointProb;
}
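/*
 * Example (sketch): for a trigram context stored most-recent-word-first,
 * context = { c, b, a, Vocab_None }, the call contextProb(context, 3)
 * accumulates
 *
 *	log p(a) + log p(b | a) + log p(c | a b)
 *
 * i.e., the chain-rule joint probability of "a b c", with <s> in the
 * oldest position looked up as </s> as described above.
 */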
/*
* Compute an aggregate log probability, perplexity, etc., much like
* sentenceProb, except that it uses counts instead of actual
* sentences.
*/
LogP
LM::countsProb(NgramStats &counts, TextStats &stats, unsigned countorder,
Boolean entropy)
{
makeArray(VocabIndex, ngram, countorder + 1);
LogP totalProb = 0.0;
/*
* Indicate to lm methods that we're in sequential processing
* mode.
*/
Boolean wasRunning = running(true);
/*
* Enumerate all counts up to the order indicated
*/
for (unsigned i = 1; i <= countorder; i++ ) {
// use sorted enumeration in debug mode only
NgramsIter ngramIter(counts, ngram, i,
!debug(DEBUG_PRINT_WORD_PROBS) ? 0 :
vocab.compareIndex());
NgramCount *count;
/*
* This enumerates all ngrams of the given order
*/
while ((count = ngramIter.next())) {
TextStats ngramStats;
/*
* Skip zero counts since they don't contribute anything to
* the probability
*/
if (*count == 0) {
continue;
}
/*
* reverse ngram for lookup
*/
Vocab::reverse(ngram);
/*
* The rest of this loop is patterned after LM::sentenceProb()
*/
if (debug(DEBUG_PRINT_WORD_PROBS)) {
dout() << "\tp( " << vocab.getWord(ngram[0]) << " | "
<< (vocab.use(), &ngram[1])
<< " ) \t= " ;
}
LogP prob = wordProb(ngram[0], &ngram[1]);
LogP jointProb = !entropy ? LogP_One :
contextProb(ngram, countorder);
Prob weight = *count * LogPtoProb(jointProb);
if (debug(DEBUG_PRINT_WORD_PROBS)) {
dout() << " " << LogPtoProb(prob) << " [ " << prob;
/*
* Include the ngram weight (the count, possibly scaled by the context
* probability) if not unity, so we can compute the
* aggregate log probability from the output
*/
if (weight != 1.0) {
dout() << " *" << weight;
}
dout() << " ]";
if (debug(DEBUG_PRINT_PROB_SUMS)) {
Prob probSum = wordProbSum(&ngram[1]);
dout() << " / " << probSum;
if (fabs(probSum - 1.0) > 0.0001) {
cerr << "\nwarning: word probs for this context sum to "
<< probSum << " != 1 : "
<< (vocab.use(), &ngram[1]) << endl;
}
}
dout() << endl;
}
/*
* ngrams ending in </s> are counted as sentences, all others
* as words. This keeps the output compatible with that of
* LM::pplFile().
*/
if (ngram[0] == vocab.seIndex()) {
ngramStats.numSentences = *count;
} else {
ngramStats.numWords = *count;
}
/*
* If the probability returned is zero but the
* word in question is <unk> we assume this is a closed-vocab
* model and count it as an OOV. (This allows open-vocab
* models to return regular probabilities for <unk>.)
* If this happens and the word is not <unk> then we are
* dealing with a broken language model that returns
* zero probabilities for known words, and we count them
* as "zeroProbs".
*/
if (prob == LogP_Zero) {
if (ngram[0] == vocab.unkIndex()) {
ngramStats.numOOVs = *count;
} else {
ngramStats.zeroProbs = *count;