📄 hiddenngram.cc
字号:
/*
* HiddenNgram.cc --
* N-gram model with hidden between-word events
*/
#ifndef lint
static char Copyright[] = "Copyright (c) 1999-2006 SRI International. All Rights Reserved.";
static char RcsId[] = "@(#)$Header: /home/srilm/devel/lm/src/RCS/HiddenNgram.cc,v 1.16 2006/01/05 20:21:27 stolcke Exp $";
#endif
#include <iostream>
using namespace std;
#include <stdlib.h>
#include "option.h"
#include "HiddenNgram.h"
#include "Trellis.cc"
#include "LHash.cc"
#include "Array.cc"
#define DEBUG_PRINT_WORD_PROBS 2 /* from LM.cc */
#define DEBUG_NGRAM_HITS 2 /* from Ngram.cc */
#define DEBUG_PRINT_VITERBI 2
#define DEBUG_TRANSITIONS 4
/* Pseudo-event token denoting "no hidden event between words" */
const VocabString noHiddenEvent = "*noevent*";
/*
 * We use structures of type HiddenNgramState
 * as keys into the trellis. Define the necessary support functions.
 */
/*
 * Hash a trellis state: combine the context hash with the repeat
 * offset and pending event, then fold into maxBits bits.
 */
static inline unsigned
LHash_hashKey(const HiddenNgramState &key, unsigned maxBits)
{
    unsigned contextHash = LHash_hashKey(key.context, maxBits);

    return (contextHash + key.repeatFrom + key.event) & hashMask(maxBits);
}
/*
 * Deep-copy a trellis state: the context array is duplicated,
 * the scalar members are copied verbatim.
 */
static inline HiddenNgramState
Map_copyKey(const HiddenNgramState &key)
{
    HiddenNgramState result;

    result.context = Map_copyKey(key.context);
    result.repeatFrom = key.repeatFrom;
    result.event = key.event;

    return result;
}
/*
 * Release storage held by a trellis state key.
 * Only the context array is dynamically allocated (by Map_copyKey above);
 * the scalar members need no cleanup.
 */
static inline void
#if __GNUC__ == 2 && __GNUC_MINOR__ <= 7
Map_freeKey(HiddenNgramState key)
#else // gcc 2.7 doesn't match f(&T) with f(T) here
Map_freeKey(HiddenNgramState &key) // gcc 2.7 has problem matching this
#endif
{
    Map_freeKey(key.context);
}
/*
 * Two trellis states are equal iff their contexts compare equal and
 * both the repeat offset and the pending event match.
 */
static inline Boolean
#if __GNUC__ == 2 && __GNUC_MINOR__ <= 7
LHash_equalKey(const HiddenNgramState key1, const HiddenNgramState key2)
#else // gcc 2.7 doesn't match f(&T,&T) with f(T,T) here
LHash_equalKey(const HiddenNgramState &key1, const HiddenNgramState &key2)
#endif
{
    return LHash_equalKey(key1.context, key2.context) &&
	   key1.repeatFrom == key2.repeatFrom &&
	   key1.event == key2.event;
}
/*
 * Test for the "no key" sentinel state (marked by a null context).
 */
static inline Boolean
Map_noKeyP(const HiddenNgramState &key)
{
    return key.context == 0;
}
/*
 * Turn a state into the "no key" sentinel by nulling its context.
 */
static inline void
Map_noKey(HiddenNgramState &key)
{
    key.context = 0;
}
/*
* output operator
*/
/*
 * Debug output for a trellis state, formatted "<context,repeatFrom>".
 * Note: the pending event member is not included in the printed form.
 */
ostream &
operator<< (ostream &stream, const HiddenNgramState &state)
{
    return stream << "<" << state.context << "," << state.repeatFrom << ">";
}
/*
* LM code
*/
/*
 * Constructor: an N-gram LM of the given order over vocab, with hidden
 * between-word events drawn from hiddenVocab.  The trellis is sized for
 * the longest sentence plus sentence-boundary tokens.
 */
HiddenNgram::HiddenNgram(Vocab &vocab, SubVocab &hiddenVocab, unsigned order,
							Boolean notHidden)
    : Ngram(vocab, order), hiddenVocab(hiddenVocab),
      trellis(maxWordsPerLine + 2 + 1), notHidden(notHidden), savedLength(0)
{
    /* Hidden events must come from the base vocabulary */
    assert(&vocab == &hiddenVocab.baseVocab());

    /*
     * Reserve the noevent token: it may not already be in use by the LM,
     * and is added to the event vocabulary here.
     */
    assert(hiddenVocab.getIndex(noHiddenEvent) == Vocab_None);
    noEventIndex = hiddenVocab.addWord(noHiddenEvent);

    /*
     * Assign default vocab properties to every hidden event, so that a
     * later iteration over vocabProps enumerates all event tags.
     */
    VocabIter eventIter(hiddenVocab);
    VocabIndex eventIndex;

    while (eventIter.next(eventIndex)) {
	Boolean found;
	HiddenVocabProps *props = vocabProps.insert(eventIndex, found);

	if (!found) {
	    props->insertWord = Vocab_None;
	}
    }

    /* The "no-event" event never appears in N-gram contexts */
    vocabProps.insert(noEventIndex)->omitFromContext = true;
}
/*
 * Destructor: no explicit cleanup needed; members release their own storage.
 */
HiddenNgram::~HiddenNgram()
{
}
/*
 * Identify the context actually used to predict word.
 * Delegates to the generic LM implementation, which reports the full
 * context, because the DP over hidden events uses all of it.
 */
void *
HiddenNgram::contextID(VocabIndex word, const VocabIndex *context,
							unsigned &length)
{
    /*
     * Due to the DP algorithm, we always use the full context
     * (don't inherit Ngram::contextID()).
     */
    return LM::contextID(word, context, length);
}
/*
 * Back-off weight for truncating context to the given length.
 * Delegates to the generic LM implementation (which returns no back-off),
 * because the DP over hidden events always uses the full context.
 */
LogP
HiddenNgram::contextBOW(const VocabIndex *context, unsigned length)
{
    /*
     * Due to the DP algorithm, we always use the full context
     * (don't inherit Ngram::contextBOW()).
     */
    return LM::contextBOW(context, length);
}
/*
* Return the vocab properties for a word
* (return defaults if none are defined)
*/
/*
 * Return the vocab properties recorded for word, or a shared
 * all-defaults instance when none are defined.
 */
const HiddenVocabProps &
HiddenNgram::getProps(VocabIndex word)
{
    /* defaults: no delete/repeat, hidden, kept in context, no insertion */
    static HiddenVocabProps defaultProps =
			{ 0, 0, false, false, Vocab_None };

    HiddenVocabProps *found = vocabProps.find(word);

    return found ? *found : defaultProps;
}
/*
 * A token is a non-word if the base LM says so, or if it carries
 * vocab properties marking it as an unobserved (hidden) event.
 * Tokens without properties count as ordinary observed words.
 */
Boolean
HiddenNgram::isNonWord(VocabIndex word)
{
    if (LM::isNonWord(word)) {
	return true;
    }

    HiddenVocabProps *props = vocabProps.find(word);

    return props ? !props->isObserved : false;
}
/*
 * Read the LM from file: first the regular N-gram model, then optional
 * per-word vocabulary property lines of the form
 *	word [-delete N] [-repeat N] [-insert W] [-observed] [-omit]
 * Words whose properties do not mark them as observed are added to the
 * hidden event vocabulary.  Returns false (after printing a positioned
 * message on file) on any parse error.
 */
Boolean
HiddenNgram::read(File &file, Boolean limitVocab)
{
    /*
     * First, read the regular N-gram model
     */
    if (!Ngram::read(file, limitVocab)) {
	return false;
    } else {
	/*
	 * Read vocab property specs from the LM file
	 */
	char *line;

	while (line = file.getline()) {
	    VocabString argv[maxWordsPerLine + 1];
	    unsigned argc = Vocab::parseWords(line, argv, maxWordsPerLine + 1);

	    if (argc > maxWordsPerLine) {
		file.position() << "too many words in line\n";
		return false;
	    }

	    /* skip blank lines */
	    if (argc == 0) {
		continue;
	    }

	    /*
	     * First word on line defines the vocabulary item
	     */
	    VocabIndex word = vocab.addWord(argv[0]);

	    /*
	     * These are static because the options[] table below stores
	     * their addresses; values are re-initialized for every line.
	     */
	    static unsigned deleteWords;
	    static unsigned repeatWords;
	    static char *insertWord;
	    static int isObserved;
	    static int omitFromContext;

	    static Option options[] = {
# define PROP_DELETE "delete"
		{ OPT_UINT, PROP_DELETE, &deleteWords, "words to delete" },
# define PROP_REPEAT "repeat"
		{ OPT_UINT, PROP_REPEAT, &repeatWords, "words to repeat" },
# define PROP_INSERT "insert"
		{ OPT_STRING, PROP_INSERT, &insertWord, "insert word" },
# define PROP_OBSERVED "observed"
		{ OPT_TRUE, PROP_OBSERVED, &isObserved, "observed event" },
# define PROP_OMIT "omit"
		{ OPT_TRUE, PROP_OMIT, &omitFromContext, "omit from context" },
	    };

	    /* reset property values to their defaults for this line */
	    deleteWords = repeatWords = 0;
	    insertWord = 0;
	    isObserved = omitFromContext = 0;

	    /*
	     * Opt_Parse consumes the -option arguments; exactly one
	     * argument (the word itself) must remain afterwards.
	     */
	    if (Opt_Parse(argc, (char **)argv, options, Opt_Number(options), 0) != 1) {
		file.position() << "problems parsing vocabulary properties\n";
		return false;
	    }

	    HiddenVocabProps *props = vocabProps.insert(word);

	    props->deleteWords = deleteWords;
	    props->repeatWords = repeatWords;
	    props->isObserved = isObserved;
	    props->omitFromContext = omitFromContext;
	    if (insertWord) {
		props->insertWord = vocab.addWord(insertWord);
		props->isObserved = false;	/* implied */
	    } else {
		props->insertWord = Vocab_None;
	    }

	    /*
	     * Unless event is observable, add it to hidden vocabulary
	     */
	    if (!isObserved) {
		hiddenVocab.addWord(word);
	    }
	}

	return true;
    }
}
/*
 * Write the LM to file: the regular N-gram model first, followed by one
 * line per word with non-default vocabulary properties, using the same
 * -delete/-repeat/-insert/-observed/-omit syntax that read() parses.
 */
void
HiddenNgram::write(File &file)
{
    Ngram::write(file);

    fprintf(file, "\n");

    LHashIter<VocabIndex, HiddenVocabProps> iter(vocabProps);

    HiddenVocabProps *props;
    VocabIndex word;

    while ((props = iter.next(word))) {
	fprintf(file, "%s", vocab.getWord(word));

	if (props->deleteWords) {
	    fprintf(file, " -%s %u", PROP_DELETE, props->deleteWords);
	}
	if (props->repeatWords) {
	    /*
	     * Bug fix: this used to emit PROP_DELETE, so repeat properties
	     * were written as "-delete" and misread as deletions by read().
	     */
	    fprintf(file, " -%s %u", PROP_REPEAT, props->repeatWords);
	}
	if (props->insertWord != Vocab_None) {
	    fprintf(file, " -%s %s", PROP_INSERT,
					vocab.getWord(props->insertWord));
	}
	if (props->isObserved) {
	    fprintf(file, " -%s", PROP_OBSERVED);
	}
	if (props->omitFromContext) {
	    fprintf(file, " -%s", PROP_OMIT);
	}
	fprintf(file, "\n");
    }
}
/*
* Compute the prefix probability of word string (in reversed order)
* taking hidden events into account.
* This is done by dynamic programming, filling in the trellis[]
* array from left to right.
* Entry trellis[i][state].prob is the probability that of the first
* i words while being in state.
* The total prefix probability is the column sum in trellis[],
* summing over all states.
* For efficiency reasons, we specify the last word separately.
* If context == 0, reuse the results from the previous call.
*/
LogP
HiddenNgram::prefixProb(VocabIndex word, const VocabIndex *context,
LogP &contextProb, TextStats &stats)
{
/*
* pos points to the column currently being computed (we skip over the
* initial <s> token)
* prefix points to the tail of context that is used for conditioning
* the current word.
*/
unsigned pos;
int prefix;
Boolean wasRunning = running(false);
if (context == 0) {
/*
* Reset the computation to the last iteration in the loop below
*/
pos = prevPos;
prefix = 0;
context = prevContext;
trellis.init(pos);
} else {
unsigned len = Vocab::length(context);
assert(len <= maxWordsPerLine);
/*
* Save these for possible recomputation with different
* word argument, using same context
*/
prevContext = context;
prevPos = 0;
/*
* Initialization:
* The 0 column corresponds to the <s> prefix, and we are in the
* no-event state.
*/
VocabIndex initialContext[2];
if (len > 0 && context[len - 1] == vocab.ssIndex()) {
initialContext[0] = context[len - 1];
initialContext[1] = Vocab_None;
prefix = len - 1;
} else {
initialContext[0] = Vocab_None;
prefix = len;
}
/*
* Start the DP from scratch if the context has less than two items
* (including <s>). This prevents the trellis from accumulating states
* over multiple sentences (which computes the right thing, but
* slowly).
*/
if (len > 1 &&
savedLength > 0 && savedContext[0] == initialContext[0])
{
/*
* Skip to the position where the saved
* context diverges from the new context.
*/
for (pos = 1;
pos < savedLength && prefix > 0 &&
context[prefix - 1] == savedContext[pos];
pos ++, prefix --)
{
prevPos = pos;
}
savedLength = pos;
trellis.init(pos);
} else {
/*
* Start a DP from scratch
*/
HiddenNgramState initialState;
initialState.context = initialContext;
initialState.repeatFrom = 0;
initialState.event = noEventIndex;
trellis.clear();
trellis.setProb(initialState, LogP_One);
trellis.step();
savedContext[0] = initialContext[0];
savedLength = 1;
pos = 1;
}
}
LogP logSum = LogP_Zero;
for ( ; prefix >= 0; pos++, prefix--) {
/*
* Keep track of the fact that at least one state has positive
* probability, needed for handling of OOVs below.
*/
Boolean havePosProb = false;
VocabIndex currWord;
if (prefix == 0) {
currWord = word;
} else {
currWord = context[prefix - 1];
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -