ngramlm.cc
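/*
 * This excerpt covers the tail of the ARPA-format reader (parsing the
 * \N-grams: sections through the closing \end\ marker), followed by
 * writeWithOrder(), numNgrams(), and the Good-Turing-based estimation
 * routines of the Ngram class.
 */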
file.position() << "unexpected input\n";
return false;
}
default: /* reading n-grams, where n == state */
if (backslash) {
if (numOOVs > 0) {
if (debug(DEBUG_READ_STATS)) {
dout() << "discarded " << numOOVs
<< " OOV " << state << "-grams\n";
}
numOOVs = 0;
}
}
if (backslash && sscanf(line, "\\%d-grams", &state) == 1) {
if (state < 1 || state > (int)maxOrder) {
file.position() << "invalid ngram order " << state << "\n";
return false;
}
if (debug(DEBUG_READ_STATS)) {
dout() << (state <= (int)order ? "reading " : "skipping ")
<< numNgrams[state] << " "
<< state << "-grams\n";
}
/*
* start reading more n-grams
*/
continue;
} else if (backslash && strncmp(line, "\\end\\", 5) == 0) {
/*
* Check that the total number of ngrams read matches
* that found in the header
*/
for (int i = 0; i <= (int)maxOrder && i <= (int)order; i++) {
if (numNgrams[i] != numRead[i]) {
file.position() << "warning: " << numRead[i] << " "
<< i << "-grams read, expected "
<< numNgrams[i] << "\n";
}
}
return true;
} else if (state > (int)order) {
/*
* Save time and memory by skipping ngrams outside
* the order range of this model
*/
continue;
} else {
VocabString words[1 + maxNgramOrder + 1 + 1];
/* result of parsing an n-gram line
* the first and last elements are actually
* numerical parameters, but so what? */
VocabIndex wids[maxNgramOrder + 1];
/* ngram translated to word indices */
LogP prob, bow; /* probability and back-off-weight */
/*
* Parse a line of the form
* <prob> <w1> <w2> ... <bow>
*/
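/*
 * For example (made-up values), a bigram entry might read
 *     -1.2041	in the	-0.3010
 * with the trailing back-off weight present only when the ngram
 * also serves as a context for higher-order ngrams.
 */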
unsigned howmany = Vocab::parseWords(line, words, state + 3);
if ((int)howmany < state + 1 || (int)howmany > state + 2) {
file.position() << "ngram line has " << howmany
<< " fields (" << state + 2
<< " expected)\n";
return false;
}
/*
* Parse prob
*/
if (!parseLogP(words[0], prob)) {
file.position() << "bad prob \"" << words[0] << "\"\n";
return false;
} else if (prob > LogP_One || prob != prob) {
file.position() << "warning: questionable prob \""
<< words[0] << "\"\n";
} else if (prob == LogP_PseudoZero) {
/*
* convert pseudo-zeros back into real zeros
*/
prob = LogP_Zero;
}
/*
* Parse bow, if any
*/
if ((int)howmany == state + 2) {
/*
* Parsing floats strings is the most time-consuming
* part of reading in backoff models. We therefore
* try to avoid parsing bows where they are useless,
* i.e., for contexts that are longer than what this
* model uses. We also do a quick sanity check to
* warn about non-zero bows in that position.
*/
if (state == (int)maxOrder) {
if (words[state + 1][0] != '0') {
file.position() << "ignoring non-zero bow \""
<< words[state + 1]
<< "\" for maximal ngram\n";
}
} else if (state == (int)order) {
/*
* save time and memory by skipping bows that will
* never be used as a result of higher-order ngram
* skipping
*/
;
} else if (!parseLogP(words[state + 1], bow)) {
file.position() << "bad bow \"" << words[state + 1]
<< "\"\n";
return false;
} else if (bow == LogP_Inf || bow != bow) {
file.position() << "warning: questionable bow \""
<< words[state + 1] << "\"\n";
} else if (bow == LogP_PseudoZero) {
/*
* convert pseudo-zeros back into real zeros
*/
bow = LogP_Zero;
}
}
numRead[state] ++;
/*
* Terminate the words array after the last word,
* then translate it to word indices. We also
* reverse the ngram since that's how we'll need it
* to index the trie.
*/
words[state + 1] = 0;
if (limitVocab) {
/*
* Skip N-gram if it contains OOV word.
* Do NOT add new words to the vocabulary.
*/
vocab.getIndices(&words[1], wids, maxNgramOrder);
Boolean haveOOV = false;
for (unsigned j = 0; j < (unsigned)state; j ++) {
if (wids[j] == Vocab_None) {
haveOOV = true;
break;
}
}
if (haveOOV) {
numOOVs ++;
/* skip rest of processing and go to next ngram */
continue;
}
} else {
vocab.addWords(&words[1], wids, maxNgramOrder);
}
Vocab::reverse(wids);
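/*
 * Illustration: for a 3-gram "w1 w2 w3", wids now holds
 * [w3, w2, w1] (Vocab_None-terminated), so wids[0] is the
 * predicted word and &wids[1] is the reversed context used
 * to look up the trie node below.
 */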
/*
* Store bow, if any
*/
if ((int)howmany == state + 2 && state < (int)order) {
*insertBOW(wids) = bow;
}
/*
* Save the last word (which is now the first, due to reversal)
* then use the first n-1 to index into
* the context trie, storing the prob.
*/
BOnode *bonode = contexts.find(&wids[1]);
if (!bonode) {
file.position() << "warning: no bow for prefix of ngram \""
<< &words[1] << "\"\n";
} else {
if (!warnedAboutUnk &&
wids[0] == vocab.unkIndex() &&
prob != LogP_Zero &&
!vocab.unkIsWord())
{
file.position() << "warning: non-zero probability for "
<< vocab.getWord(vocab.unkIndex())
<< " in closed-vocabulary LM\n";
warnedAboutUnk = true;
}
/* efficient for: *insertProb(wids[0], &wids[1]) = prob */
*bonode->probs.insert(wids[0]) = prob;
}
/*
* Hey, we're done with this ngram!
*/
}
}
}
/*
* we reached a premature EOF
*/
file.position() << "reached EOF before \\end\\\n";
return false;
}
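/*
 * Write the model in ARPA backoff format, limited to ngrams of the
 * given order: a \data\ header listing ngram counts, one \N-grams:
 * section per order (prob, words, optional bow per line), and a
 * final \end\ marker.
 */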
void
Ngram::writeWithOrder(File &file, unsigned order)
{
unsigned i;
unsigned howmanyNgrams[maxNgramOrder + 1];
VocabIndex context[maxNgramOrder + 2];
VocabString scontext[maxNgramOrder + 1];
if (order > maxNgramOrder) {
order = maxNgramOrder;
}
fprintf(file, "\n\\data\\\n");
for (i = 1; i <= order; i++ ) {
howmanyNgrams[i] = numNgrams(i);
fprintf(file, "ngram %d=%d\n", i, howmanyNgrams[i]);
}
for (i = 1; i <= order; i++ ) {
fprintf(file, "\n\\%d-grams:\n", i);
if (debug(DEBUG_WRITE_STATS)) {
dout() << "writing " << howmanyNgrams[i] << " "
<< i << "-grams\n";
}
NgramBOsIter iter(*this, context + 1, i - 1, vocab.compareIndex());
BOnode *node;
while ((node = iter.next()) != 0) {
vocab.getWords(context + 1, scontext, maxNgramOrder + 1);
Vocab::reverse(scontext);
NgramProbsIter piter(*node, vocab.compareIndex());
VocabIndex pword;
LogP *prob;
while ((prob = piter.next(pword)) != 0) {
if (file.error()) {
return;
}
fprintf(file, "%.*lg\t", LogP_Precision,
(double)(*prob == LogP_Zero ?
LogP_PseudoZero : *prob));
Vocab::write(file, scontext);
fprintf(file, "%s%s", (i > 1 ? " " : ""), vocab.getWord(pword));
if (i < order) {
context[0] = pword;
LogP *bow = findBOW(context);
if (bow) {
fprintf(file, "\t%.*lg", LogP_Precision,
(double)(*bow == LogP_Zero ?
LogP_PseudoZero : *bow));
}
/* Added by zgchen 2006-7-27: write an explicit zero bow when no back-off weight is stored */
else {
    fprintf(file, "\t%.*lg", LogP_Precision, (double)0);
}
}
fprintf(file, "\n");
}
}
}
fprintf(file, "\n\\end\\\n");
}
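/*
 * Count the ngrams of a given order stored in the model, by enumerating
 * all (order-1)-word contexts and summing the probability entries under
 * each one.
 */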
unsigned int
Ngram::numNgrams(unsigned int order)
{
if (order < 1) {
return 0;
} else {
unsigned int howmany = 0;
makeArray(VocabIndex, context, order + 1);
NgramBOsIter iter(*this, context, order - 1);
BOnode *node;
while ((node = iter.next()) != 0) {
howmany += node->probs.numEntries();
}
return howmany;
}
}
/*
* Estimation
*/
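/*
 * Convenience entry point: construct one Good-Turing discount per ngram
 * order (using the supplied min/max counts, or the GT defaults) and
 * delegate to estimate(stats, discounts).
 */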
Boolean
Ngram::estimate(NgramStats &stats, unsigned *mincounts, unsigned *maxcounts)
{
/*
* If no discount method was specified we do the default, standard
* thing. Good Turing discounting with the specified min and max counts
* for all orders.
*/
Discount **discounts = new Discount *[order];
assert(discounts != 0);
unsigned i;
for (i = 0; i < order; i++) {
    discounts[i] = 0;	/* so the cleanup below stays safe if estimation fails early */
}
Boolean error = false;
for (i = 1; !error && i <= order; i++) {
discounts[i-1] =
new GoodTuring(mincounts ? mincounts[i-1] : GT_defaultMinCount,
maxcounts ? maxcounts[i-1] : GT_defaultMaxCount);
/*
* Transfer the LMStats's debug level to the newly
* created discount objects
*/
discounts[i-1]->debugme(stats.debuglevel());
if (!discounts[i-1]->estimate(stats, i)) {
cerr << "failed to estimate GT discount for order " << i + 1
<< endl;
error = true;
} else if (debug(DEBUG_PRINT_GTPARAMS)) {
dout() << "Good Turing parameters for " << i << "-grams:\n";
File errfile(stderr);
discounts[i-1]->write(errfile);
}
}
if (!error) {
error = !estimate(stats, discounts);
}
for (i = 1; i <= order; i++) {
delete discounts[i-1];
}
delete [] discounts;
return !error;
}
/*
* Count number of vocabulary items that get probability mass
*/
unsigned
Ngram::vocabSize()
{
unsigned numWords = 0;
VocabIter viter(vocab);
VocabIndex word;
while (viter.next(word)) {
if (!vocab.isNonEvent(word) && !vocab.isMetaTag(word)) {
numWords ++;
}
}
return numWords;
}
/*
* Generic version of estimate(NgramStats, Discount)
* and estimate(NgramCounts<FloatCount>, Discount)
*
* XXX: This function is essentially duplicated in other places.
* Propagate changes to VarNgram::estimate().
*/
template <class CountType>
Boolean
Ngram::estimate2(NgramCounts<CountType> &stats, Discount **discounts)
{
/*
* For all ngrams, compute probabilities and apply the discount
* coefficients.
*/
makeArray(VocabIndex, context, order);
unsigned vocabSize = Ngram::vocabSize();
/*
* Remove all old contexts
*/
clear();
/*
* Ensure <s> unigram exists (being a non-event, it is not inserted
* in distributeProb(), yet is assumed by much other software).
*/
if (vocab.ssIndex() != Vocab_None) {
context[0] = Vocab_None;
*insertProb(vocab.ssIndex(), context) = LogP_Zero;
}
for (unsigned i = 1; i <= order; i++) {
unsigned noneventContexts = 0;
unsigned noneventNgrams = 0;
unsigned discountedNgrams = 0;
/*
* check if discounting is disabled for this round
*/
Boolean noDiscount =
(discounts == 0) ||
(discounts[i-1] == 0) ||
discounts[i-1]->nodiscount();
Boolean interpolate =
(discounts != 0) &&
(discounts[i-1] != 0) &&
discounts[i-1]->interpolate;
/*
* modify counts as required by the discounting method
*/
if (!noDiscount && discounts && discounts[i-1]) {
discounts[i-1]->prepareCounts(stats, i, order);
}
/*
* This enumerates all contexts, i.e., i-1 grams.
*/
CountType *contextCount;
NgramCountsIter<CountType> contextIter(stats, context, i-1);
while ((contextCount = contextIter.next()) != 0) {
/*
* Skip contexts ending in </s>. This typically only occurs
* with the doubling of </s> to generate trigrams from
* bigrams ending in </s>.
* If <unk> is not a real word, also skip contexts that
* contain it.
*/
if ((i > 1 && context[i-2] == vocab.seIndex()) ||
    (vocab.isNonEvent(vocab.unkIndex()) &&
     vocab.contains(context, vocab.unkIndex())))
{
noneventContexts ++;
continue;
}
VocabIndex word[2]; /* the following word */
NgramCountsIter<CountType> followIter(stats, context, word, 1);
CountType *ngramCount;
/*
* Total up the counts for the denominator
* (the lower-order counts may not be consistent with
* the higher-order ones, so we can't just use *contextCount)
* Only if the trustTotal flag is set do we override this
* with the count from the context ngram.
*/