/*
* ClassNgram.cc --
* N-gram model over word classes
*/
#ifndef lint
static char Copyright[] = "Copyright (c) 1999-2006 SRI International. All Rights Reserved.";
static char RcsId[] = "@(#)$Header: /home/srilm/devel/lm/src/RCS/ClassNgram.cc,v 1.26 2006/01/05 20:21:27 stolcke Exp $";
#endif
#include <iostream>
using namespace std;
#include <stdlib.h>
#include "ClassNgram.h"
#include "Trellis.cc"
#include "LHash.cc"
#include "Array.cc"
#include "Map2.cc"
#include "NgramStats.cc"
#define DEBUG_ESTIMATE_WARNINGS 1 /* from Ngram.cc */
#define DEBUG_PRINT_WORD_PROBS 2 /* from LM.cc */
#define DEBUG_NGRAM_HITS 2 /* from Ngram.cc */
#define DEBUG_TRANSITIONS 4
#define DEBUG_ESTIMATES 4 /* from Ngram.cc */
/*
* We use pairs of strings over VocabIndex (type ClassNgramState)
* as keys into the trellis. Define the necessary support functions.
*/
static inline unsigned
LHash_hashKey(const ClassNgramState &key, unsigned maxBits)
{
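/*
 * Combine the hashes of both state components by summing; the class
 * expansion may be absent (0), in which case only the context hash
 * contributes.
 */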
return (LHash_hashKey(key.classContext, maxBits) +
(key.classExpansion == 0 ? 0
: LHash_hashKey(key.classExpansion, maxBits)))
& hashMask(maxBits);
}
static inline ClassNgramState
Map_copyKey(const ClassNgramState &key)
{
ClassNgramState copy;
/*
* We need to copy the class context since it is created dynamically,
* but not the class expansion strings, since they reside statically in
* the LM.
*/
copy.classContext = Map_copyKey(key.classContext);
copy.classExpansion = key.classExpansion;
return copy;
}
static inline void
#if __GNUC__ == 2 && __GNUC_MINOR__ <= 7
Map_freeKey(ClassNgramState key)
#else // gcc 2.7 doesn't match f(&T) with f(T) here
Map_freeKey(ClassNgramState &key) // gcc 2.7 has problem matching this
#endif
{
Map_freeKey(key.classContext);
}
static inline Boolean
#if __GNUC__ == 2 && __GNUC_MINOR__ <= 7
LHash_equalKey(const ClassNgramState key1, const ClassNgramState key2)
#else // gcc 2.7 doesn't match f(&T,&T) with f(T,T) here
LHash_equalKey(const ClassNgramState &key1, const ClassNgramState &key2)
#endif
{
/*
* Note class expansions point into a shared structure, so can be
* compared as pointers.
*/
return LHash_equalKey(key1.classContext, key2.classContext) &&
key1.classExpansion == key2.classExpansion;
}
static inline Boolean
Map_noKeyP(const ClassNgramState &key)
{
return (key.classContext == 0);
}
static inline void
Map_noKey(ClassNgramState &key)
{
key.classContext = 0;
}
/*
* output operator
*/
ostream &
operator<< (ostream &stream, const ClassNgramState &state)
{
stream << "<" << state.classContext << ",";
if (state.classExpansion != 0 && state.classExpansion[0] != Vocab_None) {
stream << state.classExpansion;
} else {
stream << "NULL";
}
stream << ">";
return stream;
}
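/*
 * Illustrative example (hypothetical class "CITY" expanding to
 * "new york"): midway through the expansion, after matching "new",
 * a state prints as <context,(york)>; a state that is not inside
 * any expansion prints as <context,NULL>.
 */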
/*
* LM code
*/
ClassNgram::ClassNgram(Vocab &vocab, SubVocab &classVocab, unsigned order)
: Ngram(vocab, order), classVocab(classVocab),
trellis(maxWordsPerLine + 2 + 1, 0), savedLength(0), simpleNgram(false)
{
/*
* Make sure the classes are subset of base vocabulary
*/
assert(&vocab == &classVocab.baseVocab());
}
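/*
 * Typical construction, as a hedged sketch (the variable names here
 * are illustrative, not part of this file):
 *
 *	Vocab vocab;
 *	SubVocab classVocab(vocab);
 *	ClassNgram lm(vocab, classVocab, 3);	// trigram over words and classes
 */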
ClassNgram::~ClassNgram()
{
clearClasses();
}
void *
ClassNgram::contextID(VocabIndex word, const VocabIndex *context,
unsigned &length)
{
if (simpleNgram) {
return Ngram::contextID(word, context, length);
} else {
/*
* Due to the DP algorithm, we always use the full context
* (don't inherit Ngram::contextID()).
*/
return LM::contextID(word, context, length);
}
}
LogP
ClassNgram::contextBOW(const VocabIndex *context, unsigned length)
{
if (simpleNgram) {
return Ngram::contextBOW(context, length);
} else {
/*
* Due to the DP algorithm, we always use the full context
* (don't inherit Ngram::contextBOW()).
*/
return LM::contextBOW(context, length);
}
}
Boolean
ClassNgram::isNonWord(VocabIndex word)
{
/*
* classes are not words: duh!
*/
return Ngram::isNonWord(word) ||
(!simpleNgram && classVocab.getWord(word) != 0);
}
/*
* Compute the prefix probability of word string (in reversed order)
* taking hidden events into account.
* This is done by dynamic programming, filling in the trellis[]
* array from left to right.
* Entry trellis[i][state].prob is the probability of generating the
* first i words and ending up in state.
* The total prefix probability is the column sum in trellis[],
* summing over all states.
* For efficiency reasons, we specify the last word separately.
* If context == 0, reuse the results from the previous call.
*/
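/*
 * Hypothetical illustration: with context "<s> w1" (passed in reversed
 * order) and current word w2, column 0 holds the single state
 * <(<s>),NULL> with probability 1; column 1 holds all states reachable
 * by consuming w1 (plain word extensions as well as class expansions
 * whose first word is w1); the total probability for w2 is then
 * accumulated by summing, over the states in column 1, the state
 * probability times P(w2 | state).
 */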
LogP
ClassNgram::prefixProb(VocabIndex word, const VocabIndex *context,
LogP &contextProb, TextStats &stats)
{
/*
* pos points to the column currently being computed (we skip over the
* initial <s> token)
* prefix points to the tail of context that is used for conditioning
* the current word.
*/
unsigned pos;
int prefix;
Boolean wasRunning = running(false);
if (context == 0) {
/*
* Reset the computation to the last iteration in the loop below
*/
pos = prevPos;
prefix = 0;
context = prevContext;
trellis.init(pos);
} else {
unsigned len = Vocab::length(context);
assert(len <= maxWordsPerLine);
/*
* Save these for possible recomputation with different
* word argument, using same context
*/
prevContext = context;
prevPos = 0;
/*
* Initialization: The 0 column corresponds to the <s> prefix.
*/
VocabIndex initialContext[2];
if (len > 0 && context[len - 1] == vocab.ssIndex()) {
initialContext[0] = context[len - 1];
initialContext[1] = Vocab_None;
prefix = len - 1;
} else {
initialContext[0] = Vocab_None;
prefix = len;
}
ClassNgramState initialState;
initialState.classContext = initialContext;
initialState.classExpansion = 0;
/*
* Start the DP from scratch if the context has less than two items
* (including <s>). This prevents the trellis from accumulating states
* over multiple sentences (which computes the right thing, but
* slowly).
*/
if (len > 1 &&
savedLength > 0 && savedContext[0] == initialContext[0])
{
/*
* Skip to the position where the saved
* context diverges from the new context.
*/
for (pos = 1;
pos < savedLength && prefix > 0 &&
context[prefix - 1] == savedContext[pos];
pos ++, prefix --)
{
prevPos = pos;
}
savedLength = pos;
trellis.init(pos);
} else {
/*
* Start a DP from scratch
*/
trellis.clear();
trellis.setProb(initialState, LogP_One);
trellis.step();
savedContext[0] = initialContext[0];
savedLength = 1;
pos = 1;
}
}
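/*
 * Example of the shortcut above (illustrative): if the previous call
 * processed "<s> a b" and the new context is "<s> a b c", the loop
 * finds that columns 0..2 still match, so the DP resumes at pos = 3
 * with only "c" (and then the query word) left to process.
 */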
LogP logSum = LogP_Zero;
for ( ; prefix >= 0; pos++, prefix--) {
/*
* Keep track of the fact that at least one state has positive
* probability, needed for handling of OOVs below.
*/
Boolean havePosProb = false;
VocabIndex currWord;
if (prefix == 0) {
currWord = word;
} else {
currWord = context[prefix - 1];
/*
* Cache the context words for later shortcut processing
*/
savedContext[savedLength ++] = currWord;
}
/*
* Put underlying Ngram in "running" state (for debugging etc.)
* only when processing the last (current) word to prevent
* redundant output.
*/
if (prefix == 0) {
running(wasRunning);
}
/*
* Iterate over all states for the previous position in trellis
*/
TrellisIter<ClassNgramState> prevIter(trellis, pos - 1);
ClassNgramState prevState;
LogP prevProb;
while (prevIter.next(prevState, prevProb)) {
ClassNgramState nextState;
VocabIndex newContext[maxWordsPerLine + 2];
/* + 2: room for the new word plus the Vocab_None terminator */
/*
* There are two ways to extend a previous state:
* - if the state is an incomplete class expansion, then the
* current word has to match the next word in the expansion,
* and the follow-state consists of the same class context and
* the current expansion with the current word removed.
* - if the state is a complete class context, then the class
* context is extended either with the current word, or by
* starting a class expansion that matches the current word.
*/
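/*
 * For instance (hypothetical class definition): if class CITY can
 * expand to "new york", a state whose pending expansion is (york)
 * accepts only currWord == "york" and advances to the completed
 * expansion (); a state with no pending expansion can instead append
 * currWord to its context directly, or enter an expansion of some
 * class whose string begins with currWord.
 */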
if (prevState.classExpansion != 0 &&
prevState.classExpansion[0] != Vocab_None)
{
/*
* Skip expansions that don't match
*/
if (prevState.classExpansion[0] != currWord) {
continue;
}
nextState.classContext = prevState.classContext;
nextState.classExpansion = prevState.classExpansion + 1;
if (debug(DEBUG_TRANSITIONS)) {
cerr << "POSITION = " << pos
<< " FROM: " << (vocab.use(), prevState)
<< " TO: " << (vocab.use(), nextState)
<< " WORD = " << vocab.getWord(currWord)
<< endl;
}
havePosProb = true;
/*
* For efficiency reasons we don't update the trellis
* when at the final word. In that case we just record
* the total probability.
*/
if (prefix > 0) {
trellis.update(prevState, nextState, LogP_One);
} else {
logSum = (LogP)AddLogP(logSum, prevProb);
}
} else {
/*
* Set up the extended context.
*/
unsigned i;
for (i = 0; i < maxWordsPerLine &&
prevState.classContext[i] != Vocab_None; i ++)
{
newContext[i + 1] = prevState.classContext[i];
}
newContext[i + 1] = Vocab_None;
nextState.classContext = newContext;
nextState.classExpansion = 0;
/*
* Extend context with current word
*/
newContext[0] = currWord;
/*
* Transition prob out of previous context to current word:
*/
LogP wordProb =
Ngram::wordProb(currWord, prevState.classContext);
if (wordProb != LogP_Zero) {
havePosProb = true;
}
/*
* Truncate context to what is actually used by LM.
*/
unsigned usedLength;
Ngram::contextID(Vocab_None, newContext, usedLength);
VocabIndex truncatedContextWord = newContext[usedLength];
newContext[usedLength] = Vocab_None;
if (debug(DEBUG_TRANSITIONS)) {
cerr << "POSITION = " << pos
<< " FROM: " << (vocab.use(), prevState)
<< " TO: " << (vocab.use(), nextState)
<< " WORD = " << vocab.getWord(currWord)
<< endl;
}