lm.cc
}
} else {
totalProb += (LogP)
(ngramStats.prob = (LogP) weight * prob);
}
stats.increment(ngramStats);
Vocab::reverse(ngram);
}
}
running(wasRunning);
/*
* If computing entropy, set the total number of events to 1 so that
* the ppl computation reflects entropy.
*/
if (entropy) {
stats.numSentences = 0;
stats.numWords = 1;
}
return totalProb;
}
/*
* Perplexity from counts
* The escapeString is an optional line prefix that marks information
* that should be passed through unchanged. This is useful in
* constructing rescoring filters that feed hypothesis strings to
* pplCountsFile(), but also need to pass other information to downstream
* processing.
* If the entropy flag is true, the count log probabilities will be
* weighted by the joint probabilities of the ngrams, i.e., the
* output will be p(w,h) log p(w|h) for each ngram, and the overall
* result will be the entropy of the conditional N-gram distribution.
*/
unsigned int
LM::pplCountsFile(File &file, unsigned order, TextStats &stats,
const char *escapeString, Boolean entropy)
{
char *line;
unsigned escapeLen = escapeString ? strlen(escapeString) : 0;
unsigned stateTagLen = stateTag ? strlen(stateTag) : 0;
VocabString words[maxNgramOrder + 1];
makeArray(VocabIndex, wids, order + 1);
NgramStats *counts = 0;
TextStats sentenceStats;
while (line = file.getline()) {
if (escapeString && strncmp(line, escapeString, escapeLen) == 0) {
/*
* Output sentence-level statistics before each escaped line
*/
if (counts) {
countsProb(*counts, sentenceStats, order, entropy);
if (debug(DEBUG_PRINT_SENT_PROBS)) {
dout() << sentenceStats << endl;
}
stats.increment(sentenceStats);
sentenceStats.reset();
delete counts;
counts = 0;
}
dout() << line;
continue;
}
/*
* check for directives to change the global LM state
*/
if (stateTag && strncmp(line, stateTag, stateTagLen) == 0) {
/*
* pass the state info to the LM to let it do whatever
* it wants with it
*/
setState(&line[stateTagLen]);
continue;
}
if (!counts) {
counts = new NgramStats(vocab, order);
assert(counts != 0);
}
NgramCount count;
unsigned howmany =
counts->parseNgram(line, words, maxNgramOrder + 1, count);
/*
* Skip this entry if the length of the ngram exceeds our
* maximum order
*/
if (howmany == 0) {
file.position() << "malformed N-gram count or more than "
<< maxNgramOrder << " words per line\n";
continue;
} else if (howmany > order) {
continue;
}
/*
* Map words to indices
*/
vocab.getIndices(words, wids, order + 1, vocab.unkIndex());
/*
* Update the counts
*/
*counts->insertCount(wids) += count;
}
/*
* Output and update final sentence-level statistics
*/
if (counts) {
countsProb(*counts, sentenceStats, order, entropy);
if (debug(DEBUG_PRINT_SENT_PROBS)) {
dout() << sentenceStats << endl;
}
stats.increment(sentenceStats);
delete counts;
}
return stats.numWords;
}
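/*
 * Illustrative driver (a sketch, not part of the LM API; the helper
 * name and error handling are assumptions): compute the conditional
 * N-gram entropy described above by running a counts file through
 * pplCountsFile() with the entropy flag set.
 */
static void
countsEntropy(LM &lm, const char *countsFileName, unsigned order)
{
    File countsFile(countsFileName, "r");
    TextStats stats;

    /* entropy = true: count log probs are weighted by joint ngram probs */
    lm.pplCountsFile(countsFile, order, stats, 0, true);

    /* countsProb() adjusts the event counts so the reported ppl reflects entropy */
    cerr << countsFileName << ": " << stats << endl;
}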
/*
* Perplexity from text
* The escapeString is an optional line prefix that marks information
* that should be passed through unchanged. This is useful in
* constructing rescoring filters that feed hypothesis strings to
* pplFile(), but also need to pass other information to downstream
* processing.
*/
unsigned int
LM::pplFile(File &file, TextStats &stats, const char *escapeString)
{
char *line;
unsigned escapeLen = escapeString ? strlen(escapeString) : 0;
unsigned stateTagLen = stateTag ? strlen(stateTag) : 0;
VocabString sentence[maxWordsPerLine + 1];
unsigned totalWords = 0;
unsigned sentNo = 0;
TextStats documentStats;
Boolean printDocumentStats = false;
while (line = file.getline()) {
if (escapeString && strncmp(line, escapeString, escapeLen) == 0) {
if (sentNo > 0 && debuglevel() == DEBUG_PRINT_DOC_PROBS) {
dout() << documentStats << endl;
documentStats.reset();
printDocumentStats = true;
}
dout() << line;
continue;
}
/*
* check for directives to change the global LM state
*/
if (stateTag && strncmp(line, stateTag, stateTagLen) == 0) {
/*
* pass the state info to the LM to let it do whatever
* it wants with it
*/
setState(&line[stateTagLen]);
continue;
}
sentNo ++;
unsigned int numWords =
vocab.parseWords(line, sentence, maxWordsPerLine + 1);
if (numWords == maxWordsPerLine + 1) {
file.position() << "too many words per sentence\n";
} else {
TextStats sentenceStats;
if (debug(DEBUG_PRINT_SENT_PROBS)) {
dout() << sentence << endl;
}
LogP prob = sentenceProb(sentence, sentenceStats);
totalWords += numWords;
if (debug(DEBUG_PRINT_SENT_PROBS)) {
dout() << sentenceStats << endl;
}
stats.increment(sentenceStats);
documentStats.increment(sentenceStats);
}
}
if (printDocumentStats) {
dout() << documentStats << endl;
}
return totalWords;
}
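/*
 * Illustrative use of the escape mechanism (a sketch; the "***" prefix
 * and the helper name are made up for this example): lines beginning
 * with the escape prefix are echoed unchanged, all other lines are
 * scored and accumulated into stats.
 */
static unsigned
pplWithPassthrough(LM &lm, File &textFile, TextStats &stats)
{
    return lm.pplFile(textFile, stats, "***");
}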
unsigned
LM::rescoreFile(File &file, double lmScale, double wtScale,
LM &oldLM, double oldLmScale, double oldWtScale,
const char *escapeString)
{
char *line;
unsigned escapeLen = escapeString ? strlen(escapeString) : 0;
unsigned stateTagLen = stateTag ? strlen(stateTag) : 0;
unsigned sentNo = 0;
while (line = file.getline()) {
if (escapeString && strncmp(line, escapeString, escapeLen) == 0) {
fputs(line, stdout);
continue;
}
/*
* check for directives to change the global LM state
*/
if (stateTag && strncmp(line, stateTag, stateTagLen) == 0) {
/*
* pass the state info to the LM to let it do whatever
* it wants with it
*/
setState(&line[stateTagLen]);
continue;
}
sentNo ++;
/*
* parse an n-best hyp from this line
*/
NBestHyp hyp;
if (!hyp.parse(line, vocab)) {
file.position() << "bad n-best hyp format\n";
} else {
hyp.decipherFix(oldLM, oldLmScale, oldWtScale);
hyp.rescore(*this, lmScale, wtScale);
// hyp.write((File)stdout, vocab);
/*
* Instead of writing only the total score back to output,
* keep all three scores: acoustic, LM, word transition penalty.
* Also, write this in straight log probs, not bytelog.
*/
fprintf(stdout, "%g %g %d",
hyp.acousticScore, hyp.languageScore, hyp.numWords);
for (unsigned i = 0; hyp.words[i] != Vocab_None; i++) {
fprintf(stdout, " %s", vocab.getWord(hyp.words[i]));
}
fprintf(stdout, "\n");
}
}
return sentNo;
}
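/*
 * Illustrative rescoring driver (a sketch; the helper name and the
 * scale values are example assumptions): strip the old LM contribution
 * from each N-best hypothesis and replace it with scores from a new LM.
 * Output lines have the form "acousticScore lmScore numWords words...",
 * in straight log probs, as described above.
 */
static unsigned
rescoreNbestList(File &nbestFile, LM &newLM, LM &oldLM)
{
    double lmScale = 8.0, wtScale = 0.0;        /* example new-LM weights */
    double oldLmScale = 8.0, oldWtScale = 0.0;  /* example old-LM weights */

    return newLM.rescoreFile(nbestFile, lmScale, wtScale,
                             oldLM, oldLmScale, oldWtScale, 0);
}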
/*
* Random sample generation
*/
VocabIndex
LM::generateWord(const VocabIndex *context)
{
/*
* Algorithm: generate a random number between 0 and 1, and partition
* the interval 0..1 into pieces corresponding to the word probs.
* Choose the word whose interval contains the random value.
*/
Prob rval = drand48();
Prob totalProb = 0.0;
VocabIter iter(vocab);
VocabIndex wid;
while (totalProb <= rval && iter.next(wid)) {
if (!isNonWord(wid)) {
totalProb += LogPtoProb(wordProb(wid, context));
}
}
return wid;
}
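/*
 * A minimal standalone sketch of the same interval-partition idea
 * (hypothetical helper, not part of the LM class), written over a
 * plain array of probabilities so it can be read in isolation.
 */
static unsigned
samplePartition(const double *probs, unsigned numProbs)
{
    double rval = drand48();            /* uniform value in [0,1) */
    double total = 0.0;
    unsigned i;

    for (i = 0; i + 1 < numProbs; i++) {
        total += probs[i];              /* right edge of interval i */
        if (rval < total) {
            break;                      /* rval falls inside interval i */
        }
    }
    return i;                           /* last index absorbs any rounding gap */
}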
VocabIndex *
LM::generateSentence(unsigned maxWords, VocabIndex *sentence)
{
static unsigned defaultResultSize = 0;
static VocabIndex *defaultResult = 0;
/*
* If no result buffer is supplied use our own.
*/
if (sentence == 0) {
if (maxWords + 1 > defaultResultSize) {
defaultResultSize = maxWords + 1;
if (defaultResult) {
delete [] defaultResult;
}
defaultResult = new VocabIndex[defaultResultSize];
assert(defaultResult != 0);
}
sentence = defaultResult;
}
/*
* Since we need to add the begin/end sentence tokens, and
* partial contexts are represented in reverse, we use a second
* buffer for partial sentences.
*/
makeArray(VocabIndex, genBuffer, maxWords + 3);
unsigned last = maxWords + 2;
genBuffer[last] = Vocab_None;
genBuffer[--last] = vocab.ssIndex();
/*
* Generate words one-by-one until hitting an end-of-sentence.
*/
while (last > 0 && genBuffer[last] != vocab.seIndex()) {
last --;
genBuffer[last] = generateWord(&genBuffer[last + 1]);
}
/*
* Copy reversed sentence to output buffer
*/
unsigned i, j;
for (i = 0, j = maxWords; j > last; i++, j--) {
sentence[i] = genBuffer[j];
}
sentence[i] = Vocab_None;
return sentence;
}
VocabString *
LM::generateSentence(unsigned maxWords, VocabString *sentence)
{
static unsigned defaultResultSize = 0;
static VocabString *defaultResult = 0;
/*
* If no result buffer is supplied use our own.
*/
if (sentence == 0) {
if (maxWords + 1 > defaultResultSize) {
defaultResultSize = maxWords + 1;
if (defaultResult) {
delete [] defaultResult;
}
defaultResult = new VocabString[defaultResultSize];
assert(defaultResult != 0);
}
sentence = defaultResult;
}
/*
* Generate word indices, then map them to strings
*/
vocab.getWords(generateSentence(maxWords, (VocabIndex *)0),
sentence, maxWords + 1);
return sentence;
}
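/*
 * Illustrative use (a sketch; the helper name and the word limit are
 * arbitrary): draw random sentences from an LM and print them, one per
 * line.  The begin/end sentence tokens are handled internally, so the
 * returned array holds only the sampled words.
 */
static void
printRandomSentences(LM &lm, unsigned numSentences, unsigned maxWords)
{
    for (unsigned i = 0; i < numSentences; i++) {
        VocabString *sentence = lm.generateSentence(maxWords, (VocabString *)0);

        for (unsigned j = 0; sentence[j] != 0; j++) {
            fprintf(stdout, "%s%s", (j > 0 ? " " : ""), sentence[j]);
        }
        fprintf(stdout, "\n");
    }
}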
/*
* Context identification
* This returns a unique ID for the portion of a context used in
* computing follow-word probabilities. Used for path merging in
* lattice search (see the HTK interface).
* The length parameter returns the number of words used in context.
* The default is to return 0, to indicate all contexts are unique.
*/
void *
LM::contextID(VocabIndex word, const VocabIndex *context, unsigned &length)
{
length = Vocab::length(context);
return 0;
}
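/*
 * Sketch of how a lattice search might use contextID() for path merging
 * (hypothetical caller; passing Vocab_None for the follow word is an
 * assumption meaning "no particular word").  Two partial paths can be
 * merged when the LM maps their histories to the same non-null ID; the
 * default implementation above returns 0, so nothing is ever merged.
 */
static Boolean
canMergePaths(LM &lm, const VocabIndex *context1, const VocabIndex *context2)
{
    unsigned len1, len2;
    void *id1 = lm.contextID(Vocab_None, context1, len1);
    void *id2 = lm.contextID(Vocab_None, context2, len2);

    /* a null ID marks a unique context, so it never matches anything */
    return id1 != 0 && id1 == id2;
}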
/*
* Back-off weight
* Computes the backoff weight applied to probabilities that are
* computed from a truncated context. Used for weight computation in
* lattice expansion (see Lattice::expandNodeToLM()).
*/
LogP
LM::contextBOW(const VocabIndex *context, unsigned length)
{
return LogP_One;
}
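/*
 * Intended relation (a sketch, not a guarantee about any particular
 * subclass): if only the first `length` words of `context` are kept,
 * the probability obtained from the truncated context is scaled by the
 * accumulated backoff weight, roughly
 *
 *      p(w | context) ~= contextBOW(context, length) * p(w | truncated context)
 */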
/*
* Global state changes (ignored)
*/
void
LM::setState(const char *state)
{
}
/*
* LM reading/writing (dummy)
*/
Boolean
LM::read(File &file, Boolean limitVocab)
{
cerr << "read() method not implemented\n";
return false;
}
void
LM::write(File &file)
{
cerr << "write() method not implemented\n";
}
/*
* Memory statistics
*/
void
LM::memStats(MemStats &stats)
{
stats.total += sizeof(*this);
}
/*
* Iteration over follow words
* The generic follow-word iterator enumerates all of vocab.
*/
_LM_FollowIter::_LM_FollowIter(LM &lm, const VocabIndex *context)
: myLM(lm), myContext(context), myIter(lm.vocab)
{
}
void
_LM_FollowIter::init()
{
myIter.init();
}
VocabIndex
_LM_FollowIter::next()
{
VocabIndex index = Vocab_None;
(void)myIter.next(index);
return index;
}
VocabIndex
_LM_FollowIter::next(LogP &prob)
{
VocabIndex index = Vocab_None;
(void)myIter.next(index);
if (index != Vocab_None) {
prob = myLM.wordProb(index, myContext);
}
return index;
}
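/*
 * Illustrative check (a sketch; the helper name is an assumption): for
 * a properly normalized model the probabilities enumerated by the
 * follow-word iterator over a fixed context should sum to roughly 1.
 */
static Prob
followProbMass(LM &lm, const VocabIndex *context)
{
    _LM_FollowIter iter(lm, context);
    VocabIndex wid;
    LogP prob;
    Prob total = 0.0;

    while ((wid = iter.next(prob)) != Vocab_None) {
        total += LogPtoProb(prob);
    }
    return total;
}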