📄 hiddenngram.cc
字号:
/*
* HiddenNgram.cc --
* N-gram model with hidden between-word events
*/
#ifndef lint
static char Copyright[] = "Copyright (c) 1999-2006 SRI International. All Rights Reserved.";
static char RcsId[] = "@(#)$Header: /home/srilm/devel/lm/src/RCS/HiddenNgram.cc,v 1.16 2006/01/05 20:21:27 stolcke Exp $";
#endif
#include <iostream>
using namespace std;
#include <stdlib.h>
#include "option.h"
#include "HiddenNgram.h"
#include "Trellis.cc"
#include "LHash.cc"
#include "Array.cc"
#define DEBUG_PRINT_WORD_PROBS 2 /* from LM.cc */
#define DEBUG_NGRAM_HITS 2 /* from Ngram.cc */
#define DEBUG_PRINT_VITERBI 2
#define DEBUG_TRANSITIONS 4
/* Pseudo-event token denoting "no hidden event between words" */
const VocabString noHiddenEvent = "*noevent*";
/*
 * We use structures of type HiddenNgramState
 * as keys into the trellis. Define the necessary support functions.
 */
/*
 * Hash a trellis state: combine the context hash with the repeat
 * offset and pending event, then fold into maxBits bits.
 */
static inline unsigned
LHash_hashKey(const HiddenNgramState &key, unsigned maxBits)
{
    unsigned contextHash = LHash_hashKey(key.context, maxBits);

    return (contextHash + key.repeatFrom + key.event) & hashMask(maxBits);
}
/*
 * Deep-copy a trellis state: the context array is duplicated,
 * the scalar members are copied verbatim.
 */
static inline HiddenNgramState
Map_copyKey(const HiddenNgramState &key)
{
    HiddenNgramState result;

    result.context = Map_copyKey(key.context);
    result.repeatFrom = key.repeatFrom;
    result.event = key.event;

    return result;
}
/*
 * Release storage held by a trellis state key.
 * Only the context array is dynamically allocated (by Map_copyKey above);
 * the scalar members need no cleanup.
 */
static inline void
#if __GNUC__ == 2 && __GNUC_MINOR__ <= 7
Map_freeKey(HiddenNgramState key)
#else // gcc 2.7 doesn't match f(&T) with f(T) here
Map_freeKey(HiddenNgramState &key) // gcc 2.7 has problem matching this
#endif
{
    Map_freeKey(key.context);
}
/*
 * Two trellis states are equal iff their contexts compare equal and
 * both the repeat offset and the pending event match.
 */
static inline Boolean
#if __GNUC__ == 2 && __GNUC_MINOR__ <= 7
LHash_equalKey(const HiddenNgramState key1, const HiddenNgramState key2)
#else // gcc 2.7 doesn't match f(&T,&T) with f(T,T) here
LHash_equalKey(const HiddenNgramState &key1, const HiddenNgramState &key2)
#endif
{
    return LHash_equalKey(key1.context, key2.context) &&
	   key1.repeatFrom == key2.repeatFrom &&
	   key1.event == key2.event;
}
/*
 * Test for the "no key" sentinel state (marked by a null context).
 */
static inline Boolean
Map_noKeyP(const HiddenNgramState &key)
{
    return key.context == 0;
}
/*
 * Turn a state into the "no key" sentinel by nulling its context.
 */
static inline void
Map_noKey(HiddenNgramState &key)
{
    key.context = 0;
}
/*
* output operator
*/
/*
 * Debug output for a trellis state, formatted "<context,repeatFrom>".
 * Note: the pending event member is not included in the printed form.
 */
ostream &
operator<< (ostream &stream, const HiddenNgramState &state)
{
    return stream << "<" << state.context << "," << state.repeatFrom << ">";
}
/*
* LM code
*/
/*
 * Constructor: an N-gram LM of the given order over vocab, with hidden
 * between-word events drawn from hiddenVocab.  The trellis is sized for
 * the longest sentence plus sentence-boundary tokens.
 */
HiddenNgram::HiddenNgram(Vocab &vocab, SubVocab &hiddenVocab, unsigned order,
							Boolean notHidden)
    : Ngram(vocab, order), hiddenVocab(hiddenVocab),
      trellis(maxWordsPerLine + 2 + 1), notHidden(notHidden), savedLength(0)
{
    /* Hidden events must come from the base vocabulary */
    assert(&vocab == &hiddenVocab.baseVocab());

    /*
     * Reserve the noevent token: it may not already be in use by the LM,
     * and is added to the event vocabulary here.
     */
    assert(hiddenVocab.getIndex(noHiddenEvent) == Vocab_None);
    noEventIndex = hiddenVocab.addWord(noHiddenEvent);

    /*
     * Assign default vocab properties to every hidden event, so that a
     * later iteration over vocabProps enumerates all event tags.
     */
    VocabIter eventIter(hiddenVocab);
    VocabIndex eventIndex;

    while (eventIter.next(eventIndex)) {
	Boolean found;
	HiddenVocabProps *props = vocabProps.insert(eventIndex, found);

	if (!found) {
	    props->insertWord = Vocab_None;
	}
    }

    /* The "no-event" event never appears in N-gram contexts */
    vocabProps.insert(noEventIndex)->omitFromContext = true;
}
/*
 * Destructor: no explicit cleanup needed; members release their own storage.
 */
HiddenNgram::~HiddenNgram()
{
}
/*
 * Identify the context actually used to predict word.
 * Delegates to the generic LM implementation, which reports the full
 * context, because the DP over hidden events uses all of it.
 */
void *
HiddenNgram::contextID(VocabIndex word, const VocabIndex *context,
							unsigned &length)
{
    /*
     * Due to the DP algorithm, we always use the full context
     * (don't inherit Ngram::contextID()).
     */
    return LM::contextID(word, context, length);
}
/*
 * Back-off weight for truncating context to the given length.
 * Delegates to the generic LM implementation (which returns no back-off),
 * because the DP over hidden events always uses the full context.
 */
LogP
HiddenNgram::contextBOW(const VocabIndex *context, unsigned length)
{
    /*
     * Due to the DP algorithm, we always use the full context
     * (don't inherit Ngram::contextBOW()).
     */
    return LM::contextBOW(context, length);
}
/*
* Return the vocab properties for a word
* (return defaults if none are defined)
*/
/*
 * Return the vocab properties recorded for word, or a shared
 * all-defaults instance when none are defined.
 */
const HiddenVocabProps &
HiddenNgram::getProps(VocabIndex word)
{
    /* defaults: no delete/repeat, hidden, kept in context, no insertion */
    static HiddenVocabProps defaultProps =
			{ 0, 0, false, false, Vocab_None };

    HiddenVocabProps *found = vocabProps.find(word);

    return found ? *found : defaultProps;
}
/*
 * A token is a non-word if the base LM says so, or if it carries
 * vocab properties marking it as an unobserved (hidden) event.
 * Tokens without properties count as ordinary observed words.
 */
Boolean
HiddenNgram::isNonWord(VocabIndex word)
{
    if (LM::isNonWord(word)) {
	return true;
    }

    HiddenVocabProps *props = vocabProps.find(word);

    return props ? !props->isObserved : false;
}
/*
 * Read the LM from file: first the regular N-gram model, then optional
 * per-word vocabulary property lines of the form
 *	word [-delete N] [-repeat N] [-insert W] [-observed] [-omit]
 * Words whose properties do not mark them as observed are added to the
 * hidden event vocabulary.  Returns false (after printing a positioned
 * message on file) on any parse error.
 */
Boolean
HiddenNgram::read(File &file, Boolean limitVocab)
{
    /*
     * First, read the regular N-gram model
     */
    if (!Ngram::read(file, limitVocab)) {
	return false;
    } else {
	/*
	 * Read vocab property specs from the LM file
	 */
	char *line;

	while (line = file.getline()) {
	    VocabString argv[maxWordsPerLine + 1];
	    unsigned argc = Vocab::parseWords(line, argv, maxWordsPerLine + 1);

	    if (argc > maxWordsPerLine) {
		file.position() << "too many words in line\n";
		return false;
	    }

	    /* skip blank lines */
	    if (argc == 0) {
		continue;
	    }

	    /*
	     * First word on line defines the vocabulary item
	     */
	    VocabIndex word = vocab.addWord(argv[0]);

	    /*
	     * These are static because the options[] table below stores
	     * their addresses; values are re-initialized for every line.
	     */
	    static unsigned deleteWords;
	    static unsigned repeatWords;
	    static char *insertWord;
	    static int isObserved;
	    static int omitFromContext;

	    static Option options[] = {
# define PROP_DELETE "delete"
		{ OPT_UINT, PROP_DELETE, &deleteWords, "words to delete" },
# define PROP_REPEAT "repeat"
		{ OPT_UINT, PROP_REPEAT, &repeatWords, "words to repeat" },
# define PROP_INSERT "insert"
		{ OPT_STRING, PROP_INSERT, &insertWord, "insert word" },
# define PROP_OBSERVED "observed"
		{ OPT_TRUE, PROP_OBSERVED, &isObserved, "observed event" },
# define PROP_OMIT "omit"
		{ OPT_TRUE, PROP_OMIT, &omitFromContext, "omit from context" },
	    };

	    /* reset property values to their defaults for this line */
	    deleteWords = repeatWords = 0;
	    insertWord = 0;
	    isObserved = omitFromContext = 0;

	    /*
	     * Opt_Parse consumes the -option arguments; exactly one
	     * argument (the word itself) must remain afterwards.
	     */
	    if (Opt_Parse(argc, (char **)argv, options, Opt_Number(options), 0) != 1) {
		file.position() << "problems parsing vocabulary properties\n";
		return false;
	    }

	    HiddenVocabProps *props = vocabProps.insert(word);

	    props->deleteWords = deleteWords;
	    props->repeatWords = repeatWords;
	    props->isObserved = isObserved;
	    props->omitFromContext = omitFromContext;
	    if (insertWord) {
		props->insertWord = vocab.addWord(insertWord);
		props->isObserved = false;	/* implied */
	    } else {
		props->insertWord = Vocab_None;
	    }

	    /*
	     * Unless event is observable, add it to hidden vocabulary
	     */
	    if (!isObserved) {
		hiddenVocab.addWord(word);
	    }
	}

	return true;
    }
}
/*
 * Write the LM to file: the regular N-gram model first, followed by one
 * line per word with non-default vocabulary properties, using the same
 * -delete/-repeat/-insert/-observed/-omit syntax that read() parses.
 */
void
HiddenNgram::write(File &file)
{
    Ngram::write(file);

    fprintf(file, "\n");

    LHashIter<VocabIndex, HiddenVocabProps> iter(vocabProps);

    HiddenVocabProps *props;
    VocabIndex word;

    while ((props = iter.next(word))) {
	fprintf(file, "%s", vocab.getWord(word));

	if (props->deleteWords) {
	    fprintf(file, " -%s %u", PROP_DELETE, props->deleteWords);
	}
	if (props->repeatWords) {
	    /*
	     * Bug fix: this used to emit PROP_DELETE, so repeat properties
	     * were written as "-delete" and misread as deletions by read().
	     */
	    fprintf(file, " -%s %u", PROP_REPEAT, props->repeatWords);
	}
	if (props->insertWord != Vocab_None) {
	    fprintf(file, " -%s %s", PROP_INSERT,
					vocab.getWord(props->insertWord));
	}
	if (props->isObserved) {
	    fprintf(file, " -%s", PROP_OBSERVED);
	}
	if (props->omitFromContext) {
	    fprintf(file, " -%s", PROP_OMIT);
	}
	fprintf(file, "\n");
    }
}
/*
* Compute the prefix probability of word string (in reversed order)
* taking hidden events into account.
* This is done by dynamic programming, filling in the trellis[]
* array from left to right.
* Entry trellis[i][state].prob is the probability that of the first
* i words while being in state.
* The total prefix probability is the column sum in trellis[],
* summing over all states.
* For efficiency reasons, we specify the last word separately.
* If context == 0, reuse the results from the previous call.
*/
LogP
HiddenNgram::prefixProb(VocabIndex word, const VocabIndex *context,
LogP &contextProb, TextStats &stats)
{
/*
* pos points to the column currently being computed (we skip over the
* initial <s> token)
* prefix points to the tail of context that is used for conditioning
* the current word.
*/
unsigned pos;
int prefix;
Boolean wasRunning = running(false);
if (context == 0) {
/*
* Reset the computation to the last iteration in the loop below
*/
pos = prevPos;
prefix = 0;
context = prevContext;
trellis.init(pos);
} else {
unsigned len = Vocab::length(context);
assert(len <= maxWordsPerLine);
/*
* Save these for possible recomputation with different
* word argument, using same context
*/
prevContext = context;
prevPos = 0;
/*
* Initialization:
* The 0 column corresponds to the <s> prefix, and we are in the
* no-event state.
*/
VocabIndex initialContext[2];
if (len > 0 && context[len - 1] == vocab.ssIndex()) {
initialContext[0] = context[len - 1];
initialContext[1] = Vocab_None;
prefix = len - 1;
} else {
initialContext[0] = Vocab_None;
prefix = len;
}
/*
* Start the DP from scratch if the context has less than two items
* (including <s>). This prevents the trellis from accumulating states
* over multiple sentences (which computes the right thing, but
* slowly).
*/
if (len > 1 &&
savedLength > 0 && savedContext[0] == initialContext[0])
{
/*
* Skip to the position where the saved
* context diverges from the new context.
*/
for (pos = 1;
pos < savedLength && prefix > 0 &&
context[prefix - 1] == savedContext[pos];
pos ++, prefix --)
{
prevPos = pos;
}
savedLength = pos;
trellis.init(pos);
} else {
/*
* Start a DP from scratch
*/
HiddenNgramState initialState;
initialState.context = initialContext;
initialState.repeatFrom = 0;
initialState.event = noEventIndex;
trellis.clear();
trellis.setProb(initialState, LogP_One);
trellis.step();
savedContext[0] = initialContext[0];
savedLength = 1;
pos = 1;
}
}
LogP logSum = LogP_Zero;
for ( ; prefix >= 0; pos++, prefix--) {
/*
* Keep track of the fact that at least one state has positive
* probability, needed for handling of OOVs below.
*/
Boolean havePosProb = false;
VocabIndex currWord;
if (prefix == 0) {
currWord = word;
} else {
currWord = context[prefix - 1];
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -