📄 lm.cc
/*
* LM.cc --
* Generic LM methods
*
*/
#ifndef lint
static char Copyright[] = "Copyright (c) 1995-2006 SRI International. All Rights Reserved.";
static char RcsId[] = "@(#)$Header: /home/srilm/devel/lm/src/RCS/LM.cc,v 1.49 2006/01/05 08:44:25 stolcke Exp $";
#endif
#include <string.h>
#include <stdlib.h>
#include <math.h>
#include <ctype.h>
#include <assert.h>
extern "C" {
double drand48(); /* might be missing from math.h or stdlib.h */
}
#include "LM.h"
#include "NgramStats.cc"
#include "NBest.h"
#include "Array.cc"
/*
* Debugging levels used in this file
*/
#define DEBUG_PRINT_DOC_PROBS 0
#define DEBUG_PRINT_SENT_PROBS 1
#define DEBUG_PRINT_WORD_PROBS 2
#define DEBUG_PRINT_PROB_SUMS 3
const char *defaultStateTag = "<LMstate>";
/*
* Initialization
* The LM is created with a reference to a Vocab, so various
* LMs and other objects can share one Vocab. The LM will typically
* add words to the Vocab as needed.
*/
LM::LM(Vocab &vocab)
: vocab(vocab), noiseVocab(vocab)
{
_running = false;
reverseWords = false;
stateTag = defaultStateTag;
}
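/*
 * Illustrative construction (a sketch, not part of this file; Ngram is
 * one concrete LM subclass and its (vocab, order) constructor is assumed
 * here):
 *
 *	Vocab vocab;
 *	Ngram lm(vocab, 3);	// trigram LM sharing the same Vocab
 *
 * Several models built this way share a single Vocab instance, so word
 * indices remain comparable across them.
 */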
LM::~LM()
{
}
/*
* Contextual word probabilities from strings
* The default method for word strings looks up the word indices
* for both the word and its context and gets the probability
* from the LM.
*/
LogP
LM::wordProb(VocabString word, const VocabString *context)
{
unsigned int len = vocab.length(context);
makeArray(VocabIndex, cids, len + 1);
vocab.getIndices(context, cids, len + 1, vocab.unkIndex());
LogP prob = wordProb(vocab.getIndex(word, vocab.unkIndex()), cids);
return prob;
}
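/*
 * Illustrative call (a sketch, not part of this file): contexts are
 * passed most-recent-word-first and terminated by a null entry, so
 * p(c | a b) for some LM object "lm" (hypothetical) would be queried as
 *
 *	VocabString context[] = { "b", "a", 0 };
 *	LogP lp = lm.wordProb("c", context);
 *
 * Words not in the vocabulary are mapped to vocab.unkIndex() before the
 * index-based wordProb() is consulted.
 */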
/*
 * Word probability with cached context
 * Recomputes the conditional probability of a word using a context
 * that is guaranteed to be identical to the one in the last call to wordProb.
 * This implementation computes the prob from scratch, but the idea is that
 * other language models can use caches that depend on the context.
 */
LogP
LM::wordProbRecompute(VocabIndex word, const VocabIndex *context)
{
return wordProb(word, context);
}
/*
* Non-word testing
* Returns true for pseudo-word tokens that don't correspond to
* observable events (e.g., context tags or hidden events).
*/
Boolean
LM::isNonWord(VocabIndex word)
{
return vocab.isNonEvent(word);
}
/*
* Total probabilities
* For debugging purposes, compute the sum of all word probs
* in a context.
*/
Prob
LM::wordProbSum(const VocabIndex *context)
{
double total = 0.0;
VocabIter iter(vocab);
VocabIndex wid;
Boolean first = true;
/*
* prob summing interrupts sequential processing mode
*/
Boolean wasRunning = running(false);
while (iter.next(wid)) {
if (!isNonWord(wid)) {
total += LogPtoProb(first ?
wordProb(wid, context) :
wordProbRecompute(wid, context));
first = false;
}
}
running(wasRunning);
return total;
}
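/*
 * Example check (a sketch; "lm" and "context" are hypothetical): for a
 * properly normalized model the sum over all events should be close to 1.
 *
 *	Prob s = lm.wordProbSum(context);	// context as in wordProb()
 *	assert(fabs(s - 1.0) < 0.0001);		// same tolerance as used below
 *
 * sentenceProb() and countsProb() perform exactly this check when debug
 * level DEBUG_PRINT_PROB_SUMS is active.
 */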
/*
* Sentence probabilities from strings
* The default method for sentences of word strings is to translate
* them to word index sequences and get their probability from the LM.
*/
LogP
LM::sentenceProb(const VocabString *sentence, TextStats &stats)
{
unsigned int len = vocab.length(sentence);
makeArray(VocabIndex, wids, len + 1);
vocab.getIndices(sentence, wids, len + 1, vocab.unkIndex());
LogP prob = sentenceProb(wids, stats);
return prob;
}
/*
* Convenience function that reverses a sentence (for wordProb computation),
* adds begin/end sentence tokens, and removes pause and noise tokens.
* It returns the number of words excluding these special tokens.
*/
unsigned
LM::prepareSentence(const VocabIndex *sentence, VocabIndex *reversed,
unsigned len)
{
unsigned i, j = 0;
/*
* Add </s> token if not already there.
*/
if (sentence[reverseWords ? 0 : len - 1] != vocab.seIndex()) {
reversed[j++] = vocab.seIndex();
}
for (i = 1; i <= len; i++) {
VocabIndex word = sentence[reverseWords ? i - 1 : len - i];
if (word == vocab.pauseIndex() || noiseVocab.getWord(word)) {
continue;
}
reversed[j++] = word;
}
/*
* Add <s> token if not already there
*/
if (sentence[reverseWords ? len - 1 : 0] != vocab.ssIndex()) {
reversed[j++] = vocab.ssIndex();
}
reversed[j] = Vocab_None;
return j - 2;
}
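/*
 * Worked example (reverseWords == false): the input sentence
 *
 *	w1 w2 w3
 *
 * becomes the reversed, delimited sequence
 *
 *	</s> w3 w2 w1 <s>
 *
 * and the return value is 3, the number of words excluding <s> and </s>.
 */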
/*
* Convenience function that strips noise and pause tags from a word string
*/
VocabIndex *
LM::removeNoise(VocabIndex *words)
{
unsigned from, to;
for (from = 0, to = 0; words[from] != Vocab_None; from ++) {
if (words[from] != vocab.pauseIndex() &&
!noiseVocab.getWord(words[from]))
{
words[to++] = words[from];
}
}
words[to] = Vocab_None;
return words;
}
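/*
 * Example (sketch): a sequence such as
 *
 *	w1 -pau- w2 [noise] w3
 *
 * is compacted in place to
 *
 *	w1 w2 w3
 *
 * where -pau- stands for vocab.pauseIndex() and [noise] for any word
 * contained in noiseVocab. The same array pointer is returned.
 */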
/*
* Sentence probabilities from indices
* The default method is to accumulate the contextual word
* probabilities including that of the sentence end.
*/
LogP
LM::sentenceProb(const VocabIndex *sentence, TextStats &stats)
{
unsigned int len = vocab.length(sentence);
makeArray(VocabIndex, reversed, len + 2 + 1);
int i;
/*
* Indicate to lm methods that we're in sequential processing
* mode.
*/
Boolean wasRunning = running(true);
/*
* Contexts are represented most-recent-word-first.
* Also, we have to prepend the sentence-begin token,
* and append the sentence-end token.
*/
len = prepareSentence(sentence, reversed, len);
LogP totalProb = 0.0;
unsigned totalOOVs = 0;
unsigned totalZeros = 0;
for (i = len; i >= 0; i--) {
LogP probSum;
if (debug(DEBUG_PRINT_WORD_PROBS)) {
dout() << "\tp( " << vocab.getWord(reversed[i]) << " | "
<< vocab.getWord(reversed[i+1])
<< (i < (int)len ? " ..." : " ") << ") \t= " ;
if (debug(DEBUG_PRINT_PROB_SUMS)) {
/*
* XXX: because wordProb can change the state of the LM
* we need to compute wordProbSum first.
*/
probSum = (LogP)wordProbSum(&reversed[i + 1]);
}
}
LogP prob = wordProb(reversed[i], &reversed[i + 1]);
if (debug(DEBUG_PRINT_WORD_PROBS)) {
dout() << " " << LogPtoProb(prob) << " [ " << prob << " ]";
if (debug(DEBUG_PRINT_PROB_SUMS)) {
dout() << " / " << probSum;
if (fabs(probSum - 1.0) > 0.0001) {
cerr << "\nwarning: word probs for this context sum to "
<< probSum << " != 1 : "
<< (vocab.use(), &reversed[i + 1]) << endl;
}
}
dout() << endl;
}
/*
* If the probability returned is zero but the
* word in question is <unk> we assume this is a closed-vocab
* model and count it as an OOV. (This allows open-vocab
* models to return regular probabilities for <unk>.)
* If this happens and the word is not <unk> then we are
* dealing with a broken language model that returns
* zero probabilities for known words, and we count them
* as "zeroProbs".
*/
if (prob == LogP_Zero) {
if (reversed[i] == vocab.unkIndex()) {
totalOOVs ++;
} else {
totalZeros ++;
}
} else {
totalProb += prob;
}
}
running(wasRunning);
/*
* Update stats with this sentence
*/
stats.numSentences ++;
stats.numWords += len;
stats.numOOVs += totalOOVs;
stats.zeroProbs += totalZeros;
stats.prob += totalProb;
return totalProb;
}
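/*
 * From the accumulated TextStats one can derive the usual perplexity
 * figure (a sketch; the denominator convention below is an assumption,
 * matching how OOVs and zero-prob words are excluded above):
 *
 *	double denom = stats.numWords - stats.numOOVs
 *			- stats.zeroProbs + stats.numSentences;
 *	double ppl = pow(10.0, - stats.prob / denom);
 */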
/*
* Compute joint probability of a word context (a reversed word sequence)
*/
LogP
LM::contextProb(const VocabIndex *context, unsigned clength)
{
unsigned useLength = Vocab::length(context);
LogP jointProb = LogP_One;
if (clength < useLength) {
useLength = clength;
}
/*
* If the context is empty there is nothing left to do: return LogP_One
*/
if (useLength > 0) {
/*
* Context prob computation interrupts sequential processing mode
*/
Boolean wasRunning = running(false);
/*
* The usual hack: truncate the context temporarily
*/
VocabIndex saved = context[useLength];
((VocabIndex *)context)[useLength] = Vocab_None;
/*
* Accumulate conditional probs for all words in context
*/
for (unsigned i = useLength; i > 0; i--) {
VocabIndex word = context[i - 1];
/*
* If we're computing the marginal probability of the unigram
* <s> context we have to look up </s> instead since the former
* has prob = 0.
*/
if (i == useLength && word == vocab.ssIndex()) {
word = vocab.seIndex();
}
LogP wprob = wordProb(word, &context[i]);
/*
* If word is a non-event it has probability zero in the model,
* so the best we can do is to skip it.
* Note that the mapping above replaces <s> (a non-event) with </s>,
* so it will be included.
*/
if (wprob != LogP_Zero || !vocab.isNonEvent(word)) {
jointProb += wprob;
}
}
((VocabIndex *)context)[useLength] = saved;
running(wasRunning);
}
return jointProb;
}
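/*
 * Example (sketch): for a trigram context stored most-recent-word-first,
 * context = { c, b, a, Vocab_None }, the call contextProb(context, 3)
 * accumulates
 *
 *	log p(a) + log p(b | a) + log p(c | a b)
 *
 * i.e., the chain-rule joint probability of "a b c", with <s> in the
 * oldest position looked up as </s> as described above.
 */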
/*
* Compute an aggregate log probability, perplexity, etc., much like
* sentenceProb, except that it uses counts instead of actual
* sentences.
*/
LogP
LM::countsProb(NgramStats &counts, TextStats &stats, unsigned countorder,
Boolean entropy)
{
makeArray(VocabIndex, ngram, countorder + 1);
LogP totalProb = 0.0;
/*
* Indicate to lm methods that we're in sequential processing
* mode.
*/
Boolean wasRunning = running(true);
/*
* Enumerate all counts up to the order indicated
*/
for (unsigned i = 1; i <= countorder; i++ ) {
// use sorted enumeration in debug mode only
NgramsIter ngramIter(counts, ngram, i,
!debug(DEBUG_PRINT_WORD_PROBS) ? 0 :
vocab.compareIndex());
NgramCount *count;
/*
* This enumerates all ngrams of the given order
*/
while ((count = ngramIter.next())) {
TextStats ngramStats;
/*
* Skip zero counts since they don't contribute anything to
* the probability
*/
if (*count == 0) {
continue;
}
/*
* reverse ngram for lookup
*/
Vocab::reverse(ngram);
/*
* The rest of this loop is patterned after LM::sentenceProb()
*/
if (debug(DEBUG_PRINT_WORD_PROBS)) {
dout() << "\tp( " << vocab.getWord(ngram[0]) << " | "
<< (vocab.use(), &ngram[1])
<< " ) \t= " ;
}
LogP prob = wordProb(ngram[0], &ngram[1]);
LogP jointProb = !entropy ? LogP_One :
contextProb(ngram, countorder);
Prob weight = *count * LogPtoProb(jointProb);
if (debug(DEBUG_PRINT_WORD_PROBS)) {
dout() << " " << LogPtoProb(prob) << " [ " << prob;
/*
* Include the ngram weight (the count, possibly scaled by the context
* probability) if not unity, so we can compute the
* aggregate log probability from the output
*/
if (weight != 1.0) {
dout() << " *" << weight;
}
dout() << " ]";
if (debug(DEBUG_PRINT_PROB_SUMS)) {
Prob probSum = wordProbSum(&ngram[1]);
dout() << " / " << probSum;
if (fabs(probSum - 1.0) > 0.0001) {
cerr << "\nwarning: word probs for this context sum to "
<< probSum << " != 1 : "
<< (vocab.use(), &ngram[1]) << endl;
}
}
dout() << endl;
}
/*
* ngrams ending in </s> are counted as sentences, all others
* as words. This keeps the output compatible with that of
* LM::pplFile().
*/
if (ngram[0] == vocab.seIndex()) {
ngramStats.numSentences = *count;
} else {
ngramStats.numWords = *count;
}
/*
* If the probability returned is zero but the
* word in question is <unk> we assume this is a closed-vocab
* model and count it as an OOV. (This allows open-vocab
* models to return regular probabilities for <unk>.)
* If this happens and the word is not <unk> then we are
* dealing with a broken language model that returns
* zero probabilities for known words, and we count them
* as "zeroProbs".
*/
if (prob == LogP_Zero) {
if (ngram[0] == vocab.unkIndex()) {
ngramStats.numOOVs = *count;
} else {
ngramStats.zeroProbs = *count;