/*
* ClassNgram.cc --
* N-gram model over word classes
*/
#ifndef lint
static char Copyright[] = "Copyright (c) 1999-2006 SRI International. All Rights Reserved.";
static char RcsId[] = "@(#)$Header: /home/srilm/devel/lm/src/RCS/ClassNgram.cc,v 1.26 2006/01/05 20:21:27 stolcke Exp $";
#endif
#include <iostream>
using namespace std;
#include <stdlib.h>
#include "ClassNgram.h"
#include "Trellis.cc"
#include "LHash.cc"
#include "Array.cc"
#include "Map2.cc"
#include "NgramStats.cc"
#define DEBUG_ESTIMATE_WARNINGS 1 /* from Ngram.cc */
#define DEBUG_PRINT_WORD_PROBS 2 /* from LM.cc */
#define DEBUG_NGRAM_HITS 2 /* from Ngram.cc */
#define DEBUG_TRANSITIONS 4
#define DEBUG_ESTIMATES 4 /* from Ngram.cc */
/*
* We use pairs of strings over VocabIndex (type ClassNgramState)
* as keys into the trellis. Define the necessary support functions.
*/
static inline unsigned
LHash_hashKey(const ClassNgramState &key, unsigned maxBits)
{
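/*
 * Combine the hashes of both state components by summing; the class
 * expansion may be absent (0), in which case only the context hash
 * contributes.
 */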
return (LHash_hashKey(key.classContext, maxBits) +
(key.classExpansion == 0 ? 0
: LHash_hashKey(key.classExpansion, maxBits)))
& hashMask(maxBits);
}
static inline ClassNgramState
Map_copyKey(const ClassNgramState &key)
{
ClassNgramState copy;
/*
* We need to copy the class context since it is created dynamically,
* but not the class expansion strings, since they reside statically in
* the LM.
*/
copy.classContext = Map_copyKey(key.classContext);
copy.classExpansion = key.classExpansion;
return copy;
}
static inline void
#if __GNUC__ == 2 && __GNUC_MINOR__ <= 7
Map_freeKey(ClassNgramState key)
#else // gcc 2.7 doesn't match f(&T) with f(T) here
Map_freeKey(ClassNgramState &key) // gcc 2.7 has problem matching this
#endif
{
Map_freeKey(key.classContext);
}
static inline Boolean
#if __GNUC__ == 2 && __GNUC_MINOR__ <= 7
LHash_equalKey(const ClassNgramState key1, const ClassNgramState key2)
#else // gcc 2.7 doesn't match f(&T,&T) with f(T,T) here
LHash_equalKey(const ClassNgramState &key1, const ClassNgramState &key2)
#endif
{
/*
* Note class expansions point into a shared structure, so can be
* compared as pointers.
*/
return LHash_equalKey(key1.classContext, key2.classContext) &&
key1.classExpansion == key2.classExpansion;
}
static inline Boolean
Map_noKeyP(const ClassNgramState &key)
{
return (key.classContext == 0);
}
static inline void
Map_noKey(ClassNgramState &key)
{
key.classContext = 0;
}
/*
* output operator
*/
ostream &
operator<< (ostream &stream, const ClassNgramState &state)
{
stream << "<" << state.classContext << ",";
if (state.classExpansion != 0 && state.classExpansion[0] != Vocab_None) {
stream << state.classExpansion;
} else {
stream << "NULL";
}
stream << ">";
return stream;
}
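/*
 * Illustrative example (hypothetical class "CITY" expanding to
 * "new york"): midway through the expansion, after matching "new",
 * a state prints as <context,(york)>; a state that is not inside
 * any expansion prints as <context,NULL>.
 */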
/*
* LM code
*/
ClassNgram::ClassNgram(Vocab &vocab, SubVocab &classVocab, unsigned order)
: Ngram(vocab, order), classVocab(classVocab),
trellis(maxWordsPerLine + 2 + 1, 0), savedLength(0), simpleNgram(false)
{
/*
* Make sure the classes are subset of base vocabulary
*/
assert(&vocab == &classVocab.baseVocab());
}
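/*
 * Typical construction, as a hedged sketch (the variable names here
 * are illustrative, not part of this file):
 *
 *	Vocab vocab;
 *	SubVocab classVocab(vocab);
 *	ClassNgram lm(vocab, classVocab, 3);	// trigram over words and classes
 */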
ClassNgram::~ClassNgram()
{
clearClasses();
}
void *
ClassNgram::contextID(VocabIndex word, const VocabIndex *context,
unsigned &length)
{
if (simpleNgram) {
return Ngram::contextID(word, context, length);
} else {
/*
* Due to the DP algorithm, we always use the full context
* (don't inherit Ngram::contextID()).
*/
return LM::contextID(word, context, length);
}
}
LogP
ClassNgram::contextBOW(const VocabIndex *context, unsigned length)
{
if (simpleNgram) {
return Ngram::contextBOW(context, length);
} else {
/*
* Due to the DP algorithm, we always use the full context
* (don't inherit Ngram::contextBOW()).
*/
return LM::contextBOW(context, length);
}
}
Boolean
ClassNgram::isNonWord(VocabIndex word)
{
/*
* classes are not words: duh!
*/
return Ngram::isNonWord(word) ||
(!simpleNgram && classVocab.getWord(word) != 0);
}
/*
* Compute the prefix probability of word string (in reversed order)
* taking hidden events into account.
* This is done by dynamic programming, filling in the trellis[]
* array from left to right.
* Entry trellis[i][state].prob is the probability of generating the
* first i words and ending up in state.
* The total prefix probability is the column sum in trellis[],
* summing over all states.
* For efficiency reasons, we specify the last word separately.
* If context == 0, reuse the results from the previous call.
*/
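/*
 * Hypothetical illustration: with context "<s> w1" (passed in reversed
 * order) and current word w2, column 0 holds the single state
 * <(<s>),NULL> with probability 1; column 1 holds all states reachable
 * by consuming w1 (plain word extensions as well as class expansions
 * whose first word is w1); the total probability for w2 is then
 * accumulated by summing, over the states in column 1, the state
 * probability times P(w2 | state).
 */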
LogP
ClassNgram::prefixProb(VocabIndex word, const VocabIndex *context,
LogP &contextProb, TextStats &stats)
{
/*
* pos points to the column currently being computed (we skip over the
* initial <s> token)
* prefix points to the tail of context that is used for conditioning
* the current word.
*/
unsigned pos;
int prefix;
Boolean wasRunning = running(false);
if (context == 0) {
/*
* Reset the computation to the last iteration in the loop below
*/
pos = prevPos;
prefix = 0;
context = prevContext;
trellis.init(pos);
} else {
unsigned len = Vocab::length(context);
assert(len <= maxWordsPerLine);
/*
* Save these for possible recomputation with different
* word argument, using same context
*/
prevContext = context;
prevPos = 0;
/*
* Initialization: The 0 column corresponds to the <s> prefix.
*/
VocabIndex initialContext[2];
if (len > 0 && context[len - 1] == vocab.ssIndex()) {
initialContext[0] = context[len - 1];
initialContext[1] = Vocab_None;
prefix = len - 1;
} else {
initialContext[0] = Vocab_None;
prefix = len;
}
ClassNgramState initialState;
initialState.classContext = initialContext;
initialState.classExpansion = 0;
/*
* Start the DP from scratch if the context has less than two items
* (including <s>). This prevents the trellis from accumulating states
* over multiple sentences (which computes the right thing, but
* slowly).
*/
if (len > 1 &&
savedLength > 0 && savedContext[0] == initialContext[0])
{
/*
* Skip to the position where the saved
* context diverges from the new context.
*/
for (pos = 1;
pos < savedLength && prefix > 0 &&
context[prefix - 1] == savedContext[pos];
pos ++, prefix --)
{
prevPos = pos;
}
savedLength = pos;
trellis.init(pos);
} else {
/*
* Start a DP from scratch
*/
trellis.clear();
trellis.setProb(initialState, LogP_One);
trellis.step();
savedContext[0] = initialContext[0];
savedLength = 1;
pos = 1;
}
}
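/*
 * Example of the shortcut above (illustrative): if the previous call
 * processed "<s> a b" and the new context is "<s> a b c", the loop
 * finds that columns 0..2 still match, so the DP resumes at pos = 3
 * with only "c" (and then the query word) left to process.
 */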
LogP logSum = LogP_Zero;
for ( ; prefix >= 0; pos++, prefix--) {
/*
* Keep track of the fact that at least one state has positive
* probability, needed for handling of OOVs below.
*/
Boolean havePosProb = false;
VocabIndex currWord;
if (prefix == 0) {
currWord = word;
} else {
currWord = context[prefix - 1];
/*
* Cache the context words for later shortcut processing
*/
savedContext[savedLength ++] = currWord;
}
/*
* Put underlying Ngram in "running" state (for debugging etc.)
* only when processing the last (current) word to prevent
* redundant output.
*/
if (prefix == 0) {
running(wasRunning);
}
/*
* Iterate over all states for the previous position in trellis
*/
TrellisIter<ClassNgramState> prevIter(trellis, pos - 1);
ClassNgramState prevState;
LogP prevProb;
while (prevIter.next(prevState, prevProb)) {
ClassNgramState nextState;
VocabIndex newContext[maxWordsPerLine + 2];
/* + 2: room for the new word plus the Vocab_None terminator */
/*
* There are two ways to extend a previous state:
* - if the state is an incomplete class expansion, then the
* current word has to match the next word in the expansion,
* and the follow-state consists of the same class context and
* the current expansion with the current word removed.
* - if the state is a complete class context, then the class
* context is extended either with the current word, or by
* starting a class expansion that matches the current word.
*/
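/*
 * For instance (hypothetical class definition): if class CITY can
 * expand to "new york", a state whose pending expansion is (york)
 * accepts only currWord == "york" and advances to the completed
 * expansion (); a state with no pending expansion can instead append
 * currWord to its context directly, or enter an expansion of some
 * class whose string begins with currWord.
 */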
if (prevState.classExpansion != 0 &&
prevState.classExpansion[0] != Vocab_None)
{
/*
* Skip expansions that don't match
*/
if (prevState.classExpansion[0] != currWord) {
continue;
}
nextState.classContext = prevState.classContext;
nextState.classExpansion = prevState.classExpansion + 1;
if (debug(DEBUG_TRANSITIONS)) {
cerr << "POSITION = " << pos
<< " FROM: " << (vocab.use(), prevState)
<< " TO: " << (vocab.use(), nextState)
<< " WORD = " << vocab.getWord(currWord)
<< endl;
}
havePosProb = true;
/*
* For efficiency reasons we don't update the trellis
* when at the final word. In that case we just record
* the total probability.
*/
if (prefix > 0) {
trellis.update(prevState, nextState, LogP_One);
} else {
logSum = (LogP)AddLogP(logSum, prevProb);
}
} else {
/*
* Set up the extended context.
*/
unsigned i;
for (i = 0; i < maxWordsPerLine &&
prevState.classContext[i] != Vocab_None; i ++)
{
newContext[i + 1] = prevState.classContext[i];
}
newContext[i + 1] = Vocab_None;
nextState.classContext = newContext;
nextState.classExpansion = 0;
/*
* Extend context with current word
*/
newContext[0] = currWord;
/*
* Transition prob out of previous context to current word:
*/
LogP wordProb =
Ngram::wordProb(currWord, prevState.classContext);
if (wordProb != LogP_Zero) {
havePosProb = true;
}
/*
* Truncate context to what is actually used by LM.
*/
unsigned usedLength;
Ngram::contextID(Vocab_None, newContext, usedLength);
VocabIndex truncatedContextWord = newContext[usedLength];
newContext[usedLength] = Vocab_None;
if (debug(DEBUG_TRANSITIONS)) {
cerr << "POSITION = " << pos
<< " FROM: " << (vocab.use(), prevState)
<< " TO: " << (vocab.use(), nextState)
<< " WORD = " << vocab.getWord(currWord)
<< endl;
}