lm.cc
}
} else {
totalProb += (LogP)
(ngramStats.prob = (LogP) weight * prob);
}
stats.increment(ngramStats);
Vocab::reverse(ngram);
}
}
running(wasRunning);
/*
* If computing entropy, set the total number of events to 1 so that
* the ppl computation reflects entropy.
*/
if (entropy) {
stats.numSentences = 0;
stats.numWords = 1;
}
return totalProb;
}
/*
* Perplexity from counts
* The escapeString is an optional line prefix that marks information
* that should be passed through unchanged. This is useful in
* constructing rescoring filters that feed hypothesis strings to
* pplCountsFile(), but also need to pass other information to downstream
* processing.
* If the entropy flag is true, the count log probabilities will be
* weighted by the joint probabilities of the ngrams, i.e., the
* output will be p(w,h) log p(w|h) for each ngram, and the overall
* result will be the entropy of the conditional N-gram distribution.
*/
unsigned int
LM::pplCountsFile(File &file, unsigned order, TextStats &stats,
const char *escapeString, Boolean entropy)
{
char *line;
unsigned escapeLen = escapeString ? strlen(escapeString) : 0;
unsigned stateTagLen = stateTag ? strlen(stateTag) : 0;
VocabString words[maxNgramOrder + 1];
makeArray(VocabIndex, wids, order + 1);
NgramStats *counts = 0;
TextStats sentenceStats;
while (line = file.getline()) {
if (escapeString && strncmp(line, escapeString, escapeLen) == 0) {
/*
* Output sentence-level statistics before each escaped line
*/
if (counts) {
countsProb(*counts, sentenceStats, order, entropy);
if (debug(DEBUG_PRINT_SENT_PROBS)) {
dout() << sentenceStats << endl;
}
stats.increment(sentenceStats);
sentenceStats.reset();
delete counts;
counts = 0;
}
dout() << line;
continue;
}
/*
* check for directives to change the global LM state
*/
if (stateTag && strncmp(line, stateTag, stateTagLen) == 0) {
/*
* pass the state info to the LM to let it do whatever
* it wants with it
*/
setState(&line[stateTagLen]);
continue;
}
if (!counts) {
counts = new NgramStats(vocab, order);
assert(counts != 0);
}
NgramCount count;
unsigned howmany =
counts->parseNgram(line, words, maxNgramOrder + 1, count);
/*
* Skip this entry if the length of the ngram exceeds our
* maximum order
*/
if (howmany == 0) {
file.position() << "malformed N-gram count or more than "
<< maxNgramOrder << " words per line\n";
continue;
} else if (howmany > order) {
continue;
}
/*
* Map words to indices
*/
vocab.getIndices(words, wids, order + 1, vocab.unkIndex());
/*
* Update the counts
*/
*counts->insertCount(wids) += count;
}
/*
* Output and update final sentence-level statistics
*/
if (counts) {
countsProb(*counts, sentenceStats, order, entropy);
if (debug(DEBUG_PRINT_SENT_PROBS)) {
dout() << sentenceStats << endl;
}
stats.increment(sentenceStats);
delete counts;
}
return stats.numWords;
}
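/*
 * Illustrative driver (a sketch, not part of the LM API; the helper
 * name and error handling are assumptions): compute the conditional
 * N-gram entropy described above by running a counts file through
 * pplCountsFile() with the entropy flag set.
 */
static void
countsEntropy(LM &lm, const char *countsFileName, unsigned order)
{
    File countsFile(countsFileName, "r");
    TextStats stats;

    /* entropy = true: count log probs are weighted by joint ngram probs */
    lm.pplCountsFile(countsFile, order, stats, 0, true);

    /* countsProb() adjusts the event counts so the reported ppl reflects entropy */
    cerr << countsFileName << ": " << stats << endl;
}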
/*
* Perplexity from text
* The escapeString is an optional line prefix that marks information
* that should be passed through unchanged. This is useful in
* constructing rescoring filters that feed hypothesis strings to
* pplFile(), but also need to pass other information to downstream
* processing.
*/
unsigned int
LM::pplFile(File &file, TextStats &stats, const char *escapeString)
{
char *line;
unsigned escapeLen = escapeString ? strlen(escapeString) : 0;
unsigned stateTagLen = stateTag ? strlen(stateTag) : 0;
VocabString sentence[maxWordsPerLine + 1];
unsigned totalWords = 0;
unsigned sentNo = 0;
TextStats documentStats;
Boolean printDocumentStats = false;
while (line = file.getline()) {
if (escapeString && strncmp(line, escapeString, escapeLen) == 0) {
if (sentNo > 0 && debuglevel() == DEBUG_PRINT_DOC_PROBS) {
dout() << documentStats << endl;
documentStats.reset();
printDocumentStats = true;
}
dout() << line;
continue;
}
/*
* check for directives to change the global LM state
*/
if (stateTag && strncmp(line, stateTag, stateTagLen) == 0) {
/*
* pass the state info to the LM to let it do whatever
* it wants with it
*/
setState(&line[stateTagLen]);
continue;
}
sentNo ++;
unsigned int numWords =
vocab.parseWords(line, sentence, maxWordsPerLine + 1);
if (numWords == maxWordsPerLine + 1) {
file.position() << "too many words per sentence\n";
} else {
TextStats sentenceStats;
if (debug(DEBUG_PRINT_SENT_PROBS)) {
dout() << sentence << endl;
}
LogP prob = sentenceProb(sentence, sentenceStats);
totalWords += numWords;
if (debug(DEBUG_PRINT_SENT_PROBS)) {
dout() << sentenceStats << endl;
}
stats.increment(sentenceStats);
documentStats.increment(sentenceStats);
}
}
if (printDocumentStats) {
dout() << documentStats << endl;
}
return totalWords;
}
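/*
 * Illustrative use of the escape mechanism (a sketch; the "***" prefix
 * and the helper name are made up for this example): lines beginning
 * with the escape prefix are echoed unchanged, all other lines are
 * scored and accumulated into stats.
 */
static unsigned
pplWithPassthrough(LM &lm, File &textFile, TextStats &stats)
{
    return lm.pplFile(textFile, stats, "***");
}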
unsigned
LM::rescoreFile(File &file, double lmScale, double wtScale,
LM &oldLM, double oldLmScale, double oldWtScale,
const char *escapeString)
{
char *line;
unsigned escapeLen = escapeString ? strlen(escapeString) : 0;
unsigned stateTagLen = stateTag ? strlen(stateTag) : 0;
unsigned sentNo = 0;
while (line = file.getline()) {
if (escapeString && strncmp(line, escapeString, escapeLen) == 0) {
fputs(line, stdout);
continue;
}
/*
* check for directives to change the global LM state
*/
if (stateTag && strncmp(line, stateTag, stateTagLen) == 0) {
/*
* pass the state info to the LM to let it do whatever
* it wants with it
*/
setState(&line[stateTagLen]);
continue;
}
sentNo ++;
/*
* parse an n-best hyp from this line
*/
NBestHyp hyp;
if (!hyp.parse(line, vocab)) {
file.position() << "bad n-best hyp format\n";
} else {
hyp.decipherFix(oldLM, oldLmScale, oldWtScale);
hyp.rescore(*this, lmScale, wtScale);
// hyp.write((File)stdout, vocab);
/*
* Instead of writing only the total score back to output,
* keep all three scores: acoustic, LM, word transition penalty.
* Also, write this in straight log probs, not bytelog.
*/
fprintf(stdout, "%g %g %d",
hyp.acousticScore, hyp.languageScore, hyp.numWords);
for (unsigned i = 0; hyp.words[i] != Vocab_None; i++) {
fprintf(stdout, " %s", vocab.getWord(hyp.words[i]));
}
fprintf(stdout, "\n");
}
}
return sentNo;
}
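/*
 * Illustrative rescoring driver (a sketch; the helper name and the
 * scale values are example assumptions): strip the old LM contribution
 * from each N-best hypothesis and replace it with scores from a new LM.
 * Output lines have the form "acousticScore lmScore numWords words...",
 * in straight log probs, as described above.
 */
static unsigned
rescoreNbestList(File &nbestFile, LM &newLM, LM &oldLM)
{
    double lmScale = 8.0, wtScale = 0.0;        /* example new-LM weights */
    double oldLmScale = 8.0, oldWtScale = 0.0;  /* example old-LM weights */

    return newLM.rescoreFile(nbestFile, lmScale, wtScale,
                             oldLM, oldLmScale, oldWtScale, 0);
}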
/*
* Random sample generation
*/
VocabIndex
LM::generateWord(const VocabIndex *context)
{
/*
* Algorithm: generate a random number between 0 and 1, and partition
* the interval 0..1 into pieces corresponding to the word probs.
* Choose the word whose interval contains the random value.
*/
Prob rval = drand48();
Prob totalProb = 0.0;
VocabIter iter(vocab);
VocabIndex wid;
while (totalProb <= rval && iter.next(wid)) {
if (!isNonWord(wid)) {
totalProb += LogPtoProb(wordProb(wid, context));
}
}
return wid;
}
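/*
 * A minimal standalone sketch of the same interval-partition idea
 * (hypothetical helper, not part of the LM class), written over a
 * plain array of probabilities so it can be read in isolation.
 */
static unsigned
samplePartition(const double *probs, unsigned numProbs)
{
    double rval = drand48();            /* uniform value in [0,1) */
    double total = 0.0;
    unsigned i;

    for (i = 0; i + 1 < numProbs; i++) {
        total += probs[i];              /* right edge of interval i */
        if (rval < total) {
            break;                      /* rval falls inside interval i */
        }
    }
    return i;                           /* last index absorbs any rounding gap */
}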
VocabIndex *
LM::generateSentence(unsigned maxWords, VocabIndex *sentence)
{
static unsigned defaultResultSize = 0;
static VocabIndex *defaultResult = 0;
/*
* If no result buffer is supplied use our own.
*/
if (sentence == 0) {
if (maxWords + 1 > defaultResultSize) {
defaultResultSize = maxWords + 1;
if (defaultResult) {
delete [] defaultResult;
}
defaultResult = new VocabIndex[defaultResultSize];
assert(defaultResult != 0);
}
sentence = defaultResult;
}
/*
* Since we need to add the begin/end sentence tokens, and
* partial contexts are represented in reverse, we use a second
* buffer for partial sentences.
*/
makeArray(VocabIndex, genBuffer, maxWords + 3);
unsigned last = maxWords + 2;
genBuffer[last] = Vocab_None;
genBuffer[--last] = vocab.ssIndex();
/*
* Generate words one-by-one until hitting an end-of-sentence.
*/
while (last > 0 && genBuffer[last] != vocab.seIndex()) {
last --;
genBuffer[last] = generateWord(&genBuffer[last + 1]);
}
/*
* Copy reversed sentence to output buffer
*/
unsigned i, j;
for (i = 0, j = maxWords; j > last; i++, j--) {
sentence[i] = genBuffer[j];
}
sentence[i] = Vocab_None;
return sentence;
}
VocabString *
LM::generateSentence(unsigned maxWords, VocabString *sentence)
{
static unsigned defaultResultSize = 0;
static VocabString *defaultResult = 0;
/*
* If no result buffer is supplied use our own.
*/
if (sentence == 0) {
if (maxWords + 1 > defaultResultSize) {
defaultResultSize = maxWords + 1;
if (defaultResult) {
delete [] defaultResult;
}
defaultResult = new VocabString[defaultResultSize];
assert(defaultResult != 0);
}
sentence = defaultResult;
}
/*
* Generate word indices, then map them to strings
*/
vocab.getWords(generateSentence(maxWords, (VocabIndex *)0),
sentence, maxWords + 1);
return sentence;
}
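/*
 * Illustrative use (a sketch; the helper name and the word limit are
 * arbitrary): draw random sentences from an LM and print them, one per
 * line.  The begin/end sentence tokens are handled internally, so the
 * returned array holds only the sampled words.
 */
static void
printRandomSentences(LM &lm, unsigned numSentences, unsigned maxWords)
{
    for (unsigned i = 0; i < numSentences; i++) {
        VocabString *sentence = lm.generateSentence(maxWords, (VocabString *)0);

        for (unsigned j = 0; sentence[j] != 0; j++) {
            fprintf(stdout, "%s%s", (j > 0 ? " " : ""), sentence[j]);
        }
        fprintf(stdout, "\n");
    }
}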
/*
* Context identification
* This returns a unique ID for the portion of a context used in
* computing follow-word probabilities. Used for path merging in
* lattice search (see the HTK interface).
* The length parameter returns the number of words used in context.
* The default is to return 0, to indicate all contexts are unique.
*/
void *
LM::contextID(VocabIndex word, const VocabIndex *context, unsigned &length)
{
length = Vocab::length(context);
return 0;
}
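/*
 * Sketch of how a lattice search might use contextID() for path merging
 * (hypothetical caller; passing Vocab_None for the follow word is an
 * assumption meaning "no particular word").  Two partial paths can be
 * merged when the LM maps their histories to the same non-null ID; the
 * default implementation above returns 0, so nothing is ever merged.
 */
static Boolean
canMergePaths(LM &lm, const VocabIndex *context1, const VocabIndex *context2)
{
    unsigned len1, len2;
    void *id1 = lm.contextID(Vocab_None, context1, len1);
    void *id2 = lm.contextID(Vocab_None, context2, len2);

    /* a null ID marks a unique context, so it never matches anything */
    return id1 != 0 && id1 == id2;
}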
/*
* Back-off weight
* Computes the backoff weight applied to probabilities that are
* computed from a truncated context. Used for weight computation in
* lattice expansion (see Lattice::expandNodeToLM()).
*/
LogP
LM::contextBOW(const VocabIndex *context, unsigned length)
{
return LogP_One;
}
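/*
 * Intended relation (a sketch, not a guarantee about any particular
 * subclass): if only the first `length` words of `context` are kept,
 * the probability obtained from the truncated context is scaled by the
 * accumulated backoff weight, roughly
 *
 *      p(w | context) ~= contextBOW(context, length) * p(w | truncated context)
 */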
/*
* Global state changes (ignored)
*/
void
LM::setState(const char *state)
{
}
/*
* LM reading/writing (dummy)
*/
Boolean
LM::read(File &file, Boolean limitVocab)
{
cerr << "read() method not implemented\n";
return false;
}
void
LM::write(File &file)
{
cerr << "write() method not implemented\n";
}
/*
* Memory statistics
*/
void
LM::memStats(MemStats &stats)
{
stats.total += sizeof(*this);
}
/*
* Iteration over follow words
* The generic follow-word iterator enumerates all of vocab.
*/
_LM_FollowIter::_LM_FollowIter(LM &lm, const VocabIndex *context)
: myLM(lm), myContext(context), myIter(lm.vocab)
{
}
void
_LM_FollowIter::init()
{
myIter.init();
}
VocabIndex
_LM_FollowIter::next()
{
VocabIndex index = Vocab_None;
(void)myIter.next(index);
return index;
}
VocabIndex
_LM_FollowIter::next(LogP &prob)
{
VocabIndex index = Vocab_None;
(void)myIter.next(index);
if (index != Vocab_None) {
prob = myLM.wordProb(index, myContext);
}
return index;
}
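/*
 * Illustrative check (a sketch; the helper name is an assumption): for
 * a properly normalized model the probabilities enumerated by the
 * follow-word iterator over a fixed context should sum to roughly 1.
 */
static Prob
followProbMass(LM &lm, const VocabIndex *context)
{
    _LM_FollowIter iter(lm, context);
    VocabIndex wid;
    LogP prob;
    Prob total = 0.0;

    while ((wid = iter.next(prob)) != Vocab_None) {
        total += LogPtoProb(prob);
    }
    return total;
}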