classngram.cc
<< " PROB = " << wordProb
<< endl;
}
/*
* For efficiency reasons we don't update the trellis
* when at the final word. In that case we just record
* the total probability.
*/
if (prefix > 0) {
trellis.update(prevState, nextState, wordProb);
} else {
logSum = (LogP)AddLogP(logSum, prevProb + wordProb);
}
/*
* Restore newContext
*/
newContext[usedLength] = truncatedContextWord;
/*
* Now extend context by all class expansions that can start
* with the current word
*/
Map2Iter2<VocabIndex,ClassExpansion,Prob>
expandIter(classDefsByWord, currWord);
Prob *expansionProb;
ClassExpansion classAndExpansion;
while (expansionProb = expandIter.next(classAndExpansion)) {
VocabIndex clasz = classAndExpansion[0];
/*
* Prepend new class to context
*/
newContext[0] = clasz;
/*
* Transition prob out of previous context to new class
*/
LogP classProb =
Ngram::wordProb(clasz, prevState.classContext);
/*
* Truncate context to what is actually used by LM
*/
unsigned usedLength;
Ngram::contextID(Vocab_None, newContext, usedLength);
VocabIndex truncatedContextWord = newContext[usedLength];
newContext[usedLength] = Vocab_None;
/*
* Discard the class itself and the first word,
* which is already consumed by the current position.
*/
nextState.classExpansion = classAndExpansion + 2;
if (debug(DEBUG_TRANSITIONS)) {
cerr << "POSITION = " << pos
<< " FROM: " << (vocab.use(), prevState)
<< " TO: " << (vocab.use(), nextState)
<< " WORD = " << vocab.getWord(currWord)
<< " PROB = " << classProb
<< " EXPANDPROB = " << *expansionProb
<< endl;
}
if (classProb != LogP_Zero && *expansionProb != 0.0) {
havePosProb = true;
}
/*
* For efficiency reasons we don't update the trellis
* when at the final word. In that case we just record
* the total probability.
*/
if (prefix > 0) {
trellis.update(prevState, nextState,
classProb + ProbToLogP(*expansionProb));
} else {
logSum = (LogP)AddLogP(logSum, prevProb +
classProb + ProbToLogP(*expansionProb));
}
/*
* Restore newContext
*/
newContext[usedLength] = truncatedContextWord;
}
}
}
/*
* Set noevent state probability to the previous total prefix
* probability if the current word had probability zero in all
* states, and we are not yet at the end of the prefix.
* This allows us to compute conditional probs based on
* truncated contexts, and to compute the total sentence probability
* leaving out the OOVs, as required by sentenceProb().
* We include the words in the state so that context cues (e.g., <s>)
* can still be used down the line.
*/
if (prefix > 0 && !havePosProb) {
ClassNgramState newState;
newState.classContext = &context[prefix - 1];
newState.classExpansion = 0;
trellis.init(pos);
trellis.setProb(newState, trellis.sumLogP(pos - 1));
if (currWord == vocab.unkIndex()) {
stats.numOOVs ++;
} else {
stats.zeroProbs ++;
}
}
trellis.step();
prevPos = pos;
}
if (prevPos > 0) {
contextProb = trellis.sumLogP(prevPos - 1);
} else {
contextProb = LogP_One;
}
return logSum;
}
/*
* The conditional word probability is computed as
* p(w1 ... wk) / p(w1 ... w(k-1))
*/
LogP
ClassNgram::wordProb(VocabIndex word, const VocabIndex *context)
{
if (simpleNgram) {
return Ngram::wordProb(word, context);
} else {
LogP cProb;
TextStats stats;
LogP pProb = prefixProb(word, context, cProb, stats);
return pProb - cProb;
}
}
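/*
 * Same computation as wordProb(), but for use when the context is
 * unchanged from the previous call: the null context passed to
 * prefixProb() presumably lets it reuse the previously computed prefix
 * state (cf. the savedLength cache) instead of redoing the full
 * trellis pass.
 */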
LogP
ClassNgram::wordProbRecompute(VocabIndex word, const VocabIndex *context)
{
if (simpleNgram) {
return Ngram::wordProbRecompute(word, context);
} else {
LogP cProb;
TextStats stats;
LogP pProb = prefixProb(word, 0, cProb, stats);
return pProb - cProb;
}
}
/*
* Sentence probabilities from indices
* This version computes the result directly using prefixProb to
* avoid recomputing prefix probs for each prefix.
*/
LogP
ClassNgram::sentenceProb(const VocabIndex *sentence, TextStats &stats)
{
/*
* The debugging machinery is not duplicated here, so just fall back
* on the general code for that.
*/
if (simpleNgram || debug(DEBUG_PRINT_WORD_PROBS)) {
return Ngram::sentenceProb(sentence, stats);
} else {
unsigned int len = vocab.length(sentence);
LogP totalProb;
makeArray(VocabIndex, reversed, len + 2 + 1);
/*
* Contexts are represented most-recent-word-first.
* Also, we have to prepend the sentence-begin token,
* and append the sentence-end token.
*/
len = prepareSentence(sentence, reversed, len);
/*
* Invalidate cache (for efficiency only)
*/
savedLength = 0;
LogP contextProb;
totalProb = prefixProb(reversed[0], reversed + 1, contextProb, stats);
/*
* OOVs and zeroProbs are updated by prefixProb()
*/
stats.numSentences ++;
stats.prob += totalProb;
stats.numWords += len;
return totalProb;
}
}
Boolean
ClassNgram::read(File &file, Boolean limitVocab)
{
/*
* First read the ngram data in standard format
*/
if (!Ngram::read(file, limitVocab)) {
return false;
}
/*
* Now read class definitions
*/
return readClasses(file);
}
void
ClassNgram::write(File &file)
{
/*
* First write out the Ngram parameters in the usual format
*/
Ngram::write(file);
fprintf(file, "\n");
/*
* Now write the class definitions
*/
writeClasses(file);
fprintf(file, "\n");
}
void
ClassNgram::clearClasses()
{
/*
* Remove all class definitions
*/
classDefs.clear();
classDefsByWord.clear();
}
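/*
 * Read class definitions, one per line, of the form
 *
 *	CLASS [prob] word1 word2 ...
 *
 * The first field names the class; an optional numeric second field
 * gives the expansion probability (default 1.0); the remaining fields
 * are the words the class expands to.
 */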
Boolean
ClassNgram::readClasses(File &file)
{
char *line;
Boolean classesCleared = false;
while (line = file.getline()) {
VocabString words[maxWordsPerLine];
/*
* clear old class definitions only when encountering first new
* class definition
*/
if (!classesCleared) {
clearClasses();
classesCleared = true;
}
unsigned howmany = Vocab::parseWords(line, words, maxWordsPerLine);
if (howmany == maxWordsPerLine) {
file.position() << "class definition has too many fields\n";
return false;
}
/*
* First word contains class name
*/
VocabIndex clasz = classVocab.addWord(words[0]);
double prob = 1.0;
VocabString *expansionWords;
/*
* If second word is numeral, assume it's the class expansion prob
*/
if (howmany > 1 && sscanf(words[1], "%lf", &prob)) {
expansionWords = &words[2];
} else {
expansionWords = &words[1];
}
/*
* Add expansion words to vocabulary and store.
* The first position in the string is reserved for the class itself
* (for use in classDefsByWord).
*/
VocabIndex classAndExpansion[maxWordsPerLine + 1];
classAndExpansion[0] = clasz;
if (vocab.addWords(expansionWords, &classAndExpansion[1],
maxWordsPerLine) == 0)
{
file.position() << "class expansion contains no words\n";
return false;
}
*classDefs.insert(clasz, &classAndExpansion[1]) = prob;
/*
* Index the class and its expansion by the first word.
*/
*classDefsByWord.insert(classAndExpansion[1], classAndExpansion) = prob;
}
return true;
}
void
ClassNgram::writeClasses(File &file)
{
VocabIndex clasz;
Map2Iter<VocabIndex, ClassExpansion, Prob>
classIter(classDefs, vocab.compareIndex());
while (classIter.next(clasz)) {
Map2Iter2<VocabIndex, ClassExpansion, Prob>
iter(classDefs, clasz, vocab.compareIndices());
ClassExpansion expansion;
Prob *prob;
while (prob = iter.next(expansion)) {
fprintf(file, "%s %lf", vocab.getWord(clasz), *prob);
for (unsigned i = 0; expansion[i] != Vocab_None; i ++) {
fprintf(file, " %s", vocab.getWord(expansion[i]));
}
fprintf(file, "\n");
}
}
fprintf(file, "\n");
}
/*
* Compile class-ngram into word-ngram model by expanding classes
* Algorithm:
* 1 - Compute joint probabilities for expanded word-ngrams
* 2 - Compute conditional word-ngram probabilities from joint probs.
* 3 - Compute backoff weights
* The second argument gives the length of expanded ngrams whose
* conditional probability should be computed using the forward algorithm.
* This is more expensive but gives better results for ngrams much longer
* than those contained in the original model.
*/
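/*
 * For illustration: given a class defined by the two expansions
 *	COLOR 0.6 red
 *	COLOR 0.4 blue
 * the class ngram "the COLOR" would yield the word ngrams "the red"
 * and "the blue", with joint probabilities equal to the class-ngram
 * joint probability scaled by 0.6 and 0.4, respectively.
 */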
Ngram *
ClassNgram::expand(unsigned newOrder, unsigned expandExact)
{
NgramCounts<LogP> ngramProbs(vocab, maxWordsPerLine);
// accumulators for joint ngram probs
unsigned maxNgramLength = 0; // to determine final ngram order
VocabIndex wordNgram[maxWordsPerLine];
wordNgram[0] = Vocab_None;
makeArray(VocabIndex, context, order + 2);
/*
* Turn off the DP for the computation of joint context probabilities
*/
simpleNgram = true;
unsigned i;
for (i = 0 ; i < order; i ++) {
BOnode *node;
NgramBOsIter iter(*this, context, i);
while (node = iter.next()) {
LogP jointContextProb = contextProb(context);
/*
* Flip context to give regular ngram order and allow appending
* final word.
*/
Vocab::reverse(context);
/*
* Enumerate all follow ngrams
*/
NgramProbsIter piter(*node);
VocabIndex clasz;
LogP *ngramProb;
while (ngramProb = piter.next(clasz)) {
context[i] = clasz;
context[i+1] = Vocab_None;
/*
* Expand the full ngram.
*/