classngram.cc
*/
ClassNgramExpandIter expandIter(*this, context, wordNgram);
LogP expandProb;
unsigned firstLen, lastLen;
while (expandIter.next(expandProb, firstLen, lastLen)) {
unsigned expandedLen = Vocab::length(wordNgram);
if (expandedLen > maxNgramLength) {
maxNgramLength = expandedLen;
}
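/*
 * Note (added): summing log probs below multiplies
 * P(context) * P(class ngram) * P(expansion)
 */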
LogP newProb = jointContextProb + *ngramProb + expandProb;
/*
* Increment the total joint probability for all
* ngrams resulting from the last class expansion.
* (Shorter prefixes of the expansion are taken care
* of automatically as a result of expanding prefixes
* of the context.)
*/
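/*
 * Worked example (added): with firstLen == 2, lastLen == 2, and
 * expansion w1 w2 w3 w4, the entries updated are
 * w1..w4, w2..w4, w1..w3, and w2 w3.
 */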
for (unsigned j = 0; j < lastLen; j ++) {
for (unsigned k = 0; k < firstLen; k ++) {
/*
 * Truncate the ngram from the back (dropping up to
 * lastLen-1 words of the last expansion) and skip up to
 * firstLen-1 words of the first expansion at the front
 */
wordNgram[expandedLen - j] = Vocab_None;
LogP *oldProb =
ngramProbs.insertCount(&wordNgram[k]);
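/*
 * Note (added): a freshly inserted entry is zero-initialized,
 * so LogP 0.0 doubles as the "no value yet" marker here
 */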
if (*oldProb == 0.0) {
*oldProb = newProb;
} else {
*oldProb = (LogP)AddLogP(*oldProb, newProb);
}
}
}
}
}
context[i] = Vocab_None;
Vocab::reverse(context);
}
}
/*
 * Unless a specific order was requested, include all ngrams in the
 * new model; in no case exceed the longest expanded ngram.
 */
if (newOrder == 0 || newOrder > maxNgramLength) {
newOrder = maxNgramLength;
}
/*
 * By default, exact expansion is not used at all: a threshold above
 * newOrder means no ngram length ever triggers it.
 */
if (expandExact == 0) {
expandExact = newOrder + 1;
}
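/*
 * Illustration (added): with newOrder == 3 and expandExact left at 0,
 * the threshold becomes 4, so the test "i + 1 >= expandExact" below
 * never holds for ngram lengths 1..3, and the approximate expansion
 * is used throughout.
 */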
/*
* Compute joint probs for word ngrams that were expanded in the
* above step, but are not contained in the original model.
*/
simpleNgram = true;	/* evaluate word ngrams without class expansion */
for (i = 1; i < expandExact; i++) {
LogP *oldProb;
NgramCountsIter<LogP> ngramIter(ngramProbs, wordNgram, i);
/*
* This enumerates all i-grams.
*/
while ((oldProb = ngramIter.next())) {
/*
 * Reverse the ngram in place: findProb() expects the last
 * word first, followed by the context in reverse order
 */
Vocab::reverse(wordNgram);
if (findProb(wordNgram[0], &wordNgram[1]) == 0) {
/*
* ngram is not in old model:
* compute joint probability for this ngram, excluding classes
*/
LogP newProb = contextProb(wordNgram);
if (*oldProb == 0.0) {
*oldProb = newProb;
} else {
*oldProb = (LogP)AddLogP(*oldProb, newProb);
}
}
Vocab::reverse(wordNgram);
}
}
simpleNgram = false;
/*
* Copy all regular (non-class) words to the new vocabulary,
* including special tokens.
*/
SubVocab *newVocab = new SubVocab(vocab);
assert(newVocab);
VocabIter viter(vocab);
VocabIndex wordIndex;
VocabString wordString;
while ((wordString = viter.next(wordIndex))) {
if (!classVocab.getWord(wordIndex)) {
newVocab->addWord(wordString);
} else {
/*
* ensure all words in the class expansion are in the new vocab:
* this includes classes that occur in expansions of other classes,
* even though we currently don't support "context-free" rules
*/
Map2Iter2<VocabIndex, ClassExpansion, Prob>
iter(classDefs, wordIndex);
ClassExpansion expansion;
while (iter.next(expansion)) {
for (i = 0; expansion[i] != Vocab_None; i ++) {
VocabString className = classVocab.getWord(expansion[i]);
if (className) {
cerr << "warning: expansion of " << wordString
<< " -> " << (vocab.use(), expansion)
<< " refers to another class: " << className
<< endl;
newVocab->addWord(className);
}
}
}
}
}
/*
* Duplicate special word indices in new vocab
*/
newVocab->unkIndex() = vocab.unkIndex();
newVocab->ssIndex() = vocab.ssIndex();
newVocab->seIndex() = vocab.seIndex();
newVocab->pauseIndex() = vocab.pauseIndex();
newVocab->addNonEvent(vocab.ssIndex());
newVocab->addNonEvent(vocab.pauseIndex());
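/*
 * Note (added): non-events are tokens that may occur in contexts
 * but are never predicted as words by the model.
 */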
/*
* Create new ngram model (inherit debug level from class ngram)
*/
Ngram *ngram = new Ngram(*newVocab, newOrder);
assert(ngram != 0);
ngram->debugme(debuglevel());
/*
* For all ngrams, compute probabilities
*/
for (i = 0; i < newOrder; i++) {
LogP *contextProb;
NgramCountsIter<LogP> contextIter(ngramProbs, wordNgram, i);
/*
* This enumerates all contexts, i.e., i-grams.
*/
while ((contextProb = contextIter.next())) {
/*
* The probability of <s> is -Inf in the model, but it should be
* P(</s>) for purposes of normalization when computing the
* conditional probs below (consistent with LM::contextProb()).
*/
if (i == 1 && wordNgram[0] == vocab.ssIndex()) {
VocabIndex emptyContext = Vocab_None;
*contextProb = wordProb(vocab.seIndex(), &emptyContext);
}
VocabIndex word[2]; /* the word following the context */
NgramCountsIter<LogP> followIter(ngramProbs, wordNgram, word, 1);
LogP *ngramProb;
/*
* reverse context words in preparation for ngram prob insertion
*/
Vocab::reverse(wordNgram);
if (i + 1 >= expandExact) {
/*
* Exact conditional probability for expanded ngram:
* Run the forward algorithm (by way of wordProb).
*/
while ((ngramProb = followIter.next())) {
LogP lprob = wordProb(word[0], wordNgram);
if (lprob > LogP_One) {
if (LogPtoProb(lprob) - 1.0 > Prob_Epsilon) {
cerr << "bad conditional prob for \""
<< (vocab.use(), wordNgram) << "\": "
<< LogPtoProb(lprob) << " > 1\n";
}
lprob = LogP_One;
}
if (debug(DEBUG_ESTIMATES)) {
dout() << "CONTEXT " << (vocab.use(), wordNgram)
<< " WORD " << vocab.getWord(word[0])
<< " EXACT LPROB " << lprob
<< endl;
}
*ngram->insertProb(word[0], wordNgram) = lprob;
}
} else {
/*
* Compute sum of all ngram probs
*/
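/*
 * Note (added): AddLogP(x, y) computes log10(10^x + 10^y) in a
 * numerically stable way, so probSum accumulates the total
 * probability mass of all follow words in the log domain.
 */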
LogP probSum = LogP_Zero;
while ((ngramProb = followIter.next())) {
probSum = (LogP)AddLogP(probSum, *ngramProb);
}
/*
 * Use the sum of the ngram probs as the context prob
 * - for the empty context, which has no joint prob of its own
 * - when the sum exceeds the stored context prob, which would
 *   otherwise produce conditional probs greater than one
 */
if (i == 0 || probSum > *contextProb) {
if (i > 0 &&
LogPtoProb(probSum) - LogPtoProb(*contextProb)
> Prob_Epsilon &&
debug(DEBUG_ESTIMATE_WARNINGS))
{
cerr << "warning: prob for context \""
<< (vocab.use(), wordNgram)
<< "\" lower than total ngram prob for words "
<< "(" << *contextProb << " < " << probSum << ")"
<< endl;
}
*contextProb = probSum;
}
/*
* Enumerate all words that can follow this context
*/
followIter.init();
while ((ngramProb = followIter.next())) {
LogP lprob = *ngramProb - *contextProb;
if (debug(DEBUG_ESTIMATES)) {
dout() << "CONTEXT " << (vocab.use(), wordNgram)
<< " WORD " << vocab.getWord(word[0])
<< " NUMER " << *ngramProb
<< " DENOM " << *contextProb
<< " LPROB " << lprob
<< endl;
}
*ngram->insertProb(word[0], wordNgram) = lprob;
}
}
Vocab::reverse(wordNgram);
}
}
/*
* Complete new model estimation by filling in backoff weights
*/
ngram->recomputeBOWs();
return ngram;
}
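/*
 * Usage sketch (added; illustrative, not part of the original file):
 * how a caller might turn a class ngram into a word-level model,
 * roughly what the ngram(1) tool does for -expand-classes.  The file
 * names and the surrounding function are assumptions; the ClassNgram,
 * File, and expand() interfaces are the ones used above.
 */
#ifdef EXPAND_EXAMPLE
static Ngram *
expandExample(Vocab &vocab, SubVocab &classVocab)
{
    ClassNgram clm(vocab, classVocab, 3);

    File lmFile("class.3bo.lm", "r");		/* assumed file name */
    clm.read(lmFile);				/* class-based 3-gram */

    File classesFile("class.defs", "r");	/* assumed file name */
    clm.readClasses(classesFile);		/* class expansion rules */

    /*
     * newOrder = 3: build a word trigram;
     * expandExact = 2: exact (forward-algorithm) probs for ngrams of
     * length >= 2, approximate normalized probs for unigrams.
     */
    return clm.expand(3, 2);
}
#endif /* EXPAND_EXAMPLE */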
/*
* Enumerate all class expansions in a mixed word/class token string
*/
ClassNgramExpandIter::ClassNgramExpandIter(ClassNgram &ngram,
const VocabIndex *classes, VocabIndex *buffer)
: ngram(ngram), classes(classes), buffer(buffer),
expandIter(0), subIter(0), done(false)
{
/*
* Find the first class token in classes string
*/
for (firstClassPos = 0;
classes[firstClassPos] != Vocab_None;
firstClassPos++)
{
if (ngram.classVocab.getWord(classes[firstClassPos]) != 0) {
break;
}
}
/*
* If there is a class token, set up the iterator over its expansions
*/
if (classes[firstClassPos] != Vocab_None) {
expandIter = new Map2Iter2<VocabIndex,ClassExpansion,Prob>
(ngram.classDefs, classes[firstClassPos]);
assert(expandIter != 0);
}
/*
* Copy the words preceding the first class into the buffer
*/
for (unsigned i = 0; i < (unsigned)firstClassPos; i ++) {
buffer[i] = classes[i];
}
buffer[firstClassPos] = Vocab_None;
}
ClassNgramExpandIter::~ClassNgramExpandIter()
{
delete expandIter;
delete subIter;
}
/*
* Return the next class-expanded word string.
* Also return the aggregate probability of all expansions in the current
* string (prob), the expanded length of the first input token (firstLen),
* and the expanded length of the last input token (lastLen).
*/
VocabIndex *
ClassNgramExpandIter::next(LogP &prob, unsigned &firstLen, unsigned &lastLen)
{
if (done) {
return 0;
} else if (expandIter == 0) {
/*
* If the class iterator is not active, we have an all-words string
* and just return it.
*/
done = true;
prob = LogP_One;
firstLen = lastLen = (classes[0] == Vocab_None ? 0 : 1);
return buffer;
} else {
while (1) {
if (subIter == 0) {
/*
* The sub-iteration is done, advance to the next
* expansion of the first class.
*/
ClassExpansion expansion;
Prob *expandProb = expandIter->next(expansion);
if (expandProb == 0) {
done = true;
return 0;
} else {
/*
* Remember across invocations
*/
prob1 = ProbToLogP(*expandProb);
/*
* Append expansion to buffer, and record its length
*/
for (firstClassLen = 0;
expansion[firstClassLen] != Vocab_None;
firstClassLen ++)
{
buffer[firstClassPos + firstClassLen] =
expansion[firstClassLen];
}
/*
* Create recursive iterator to expand the
* remaining string
*/
subIter = new ClassNgramExpandIter(ngram,
&classes[firstClassPos + 1],
&buffer[firstClassPos + firstClassLen]);
assert(subIter != 0);
}
}
LogP subProb;
unsigned subFirstLen, subLastLen;
if (subIter->next(subProb, subFirstLen, subLastLen) == 0) {
/*
* expansion of rest string exhausted
* -- continue expanding first class
*/
delete subIter;
subIter = 0;
} else {
prob = prob1 + subProb;
firstLen = (firstClassPos == 0 ? firstClassLen : 1);
lastLen = (classes[firstClassPos + 1] == Vocab_None ?
firstClassLen : subLastLen);
return buffer;
}
}
}
}
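/*
 * Usage sketch (added; illustrative, not part of the original file):
 * enumerating all word expansions of a mixed word/class string with
 * the iterator above.  The function name is an assumption;
 * maxWordsPerLine is SRILM's usual line-buffer constant, and the
 * buffer is assumed large enough to hold any expansion.
 */
#ifdef EXPAND_ITER_EXAMPLE
static void
dumpExpansions(ClassNgram &clm, const VocabIndex *mixed)
{
    VocabIndex expanded[maxWordsPerLine + 1];
    LogP prob;
    unsigned firstLen, lastLen;

    ClassNgramExpandIter iter(clm, mixed, expanded);
    while (iter.next(prob, firstLen, lastLen)) {
	cout << (clm.vocab.use(), expanded)
	     << " logprob " << prob << endl;
    }
}
#endif /* EXPAND_ITER_EXAMPLE */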