/*
 * ngram.cc --
 *	main program for N-gram language modeling (SRILM toolkit)
 *	(web-viewer page chrome removed; original filename preserved)
 */
/*
 * Tail of the conditional expression that picks the concrete Ngram
 * subclass; the assignment target is above this chunk (presumably
 * "Ngram *ngramLM =" -- TODO(review): confirm against the full file):
 *   - HiddenNgram     when a hidden-event vocabulary was given,
 *   - (Simple)ClassNgram when a class vocabulary was given,
 *   - plain Ngram     otherwise.
 */
(hiddenVocabFile != 0) ? new HiddenNgram(*vocab, *hiddenEvents, order, hiddenNot) :
(classVocab != 0) ?
(simpleClasses ?
// simpleClasses presumably selects a restricted/faster class-LM variant
// -- confirm semantics in SimpleClassNgram's own documentation
new SimpleClassNgram(*vocab, *classVocab, order) :
new ClassNgram(*vocab, *classVocab, order)) :
new Ngram(*vocab, order);
assert(ngramLM != 0);
ngramLM->debugme(debug);	// propagate the -debug level to the LM
if (skipOOVs) {
// have the LM skip OOV words instead of scoring them
ngramLM->skipOOVs() = true;
}
// -null: use a trivial LM (NullLM) instead of reading a model file.
if (null) {
useLM = new NullLM(*vocab);
assert(useLM != 0);
} else if (lmFile) {
if (hmm) {
/*
 * Read an HMM of Ngrams from lmFile.
 * NOTE(review): the local HMMofNgrams* "hmm" shadows the "hmm"
 * option flag tested just above -- legal, but easy to misread.
 */
File file(lmFile, "r");
HMMofNgrams *hmm = new HMMofNgrams(*vocab, order);
hmm->debugme(debug);
if (!hmm->read(file, limitVocab)) {
cerr << "format error in lm file\n";
exit(1);
}
useLM = hmm;
} else if (adaptMix) {
/*
 * Read an adaptive mixture of Ngrams from lmFile
 */
File file(lmFile, "r");
AdaptiveMix *lm = new AdaptiveMix(*vocab, adaptDecay,
bayesScale, adaptIters);
lm->debugme(debug);
if (!lm->read(file, limitVocab)) {
cerr << "format error in lm file\n";
exit(1);
}
useLM = lm;
} else {
/*
 * Read just a single backoff N-gram LM into the ngramLM object
 * constructed earlier; read failures are fatal.
 */
File file(lmFile, "r");
if (!ngramLM->read(file, limitVocab)) {
cerr << "format error in lm file\n";
exit(1);
}
/*
 * Perform static interpolation (ngram merging) of the base LM with up
 * to 9 additional component LMs.  mixLambda1, the weight of the first
 * mixture component, is whatever probability mass is left after all
 * explicitly given lambdas.  The last argument of each makeMixLM()
 * call is the cumulative weight of all components merged so far
 * (it acts as the prior of the partial mixture); note the final
 * component passes 1.0, i.e., the full mass.
 * The (Ngram *) casts assume makeMixLM() returns a merged Ngram in
 * this static-interpolation mode -- TODO(review): confirm against
 * makeMixLM()'s definition (not visible in this chunk).
 */
if (mixFile && !loglinearMix && bayesLength < 0) {
/*
* perform static interpolation (ngram merging)
*/
double mixLambda1 = 1.0 - mixLambda - mixLambda2 - mixLambda3 -
mixLambda4 - mixLambda5 - mixLambda6 -
mixLambda7 - mixLambda8 - mixLambda9;
ngramLM = (Ngram *)makeMixLM(mixFile, *vocab, classVocab,
order, ngramLM, mixLambda1,
mixLambda + mixLambda1);
if (mixFile2) {
ngramLM = (Ngram *)makeMixLM(mixFile2, *vocab, classVocab,
order, ngramLM, mixLambda2,
mixLambda + mixLambda1 + mixLambda2);
}
if (mixFile3) {
ngramLM = (Ngram *)makeMixLM(mixFile3, *vocab, classVocab,
order, ngramLM, mixLambda3,
mixLambda + mixLambda1 + mixLambda2 +
mixLambda3);
}
if (mixFile4) {
ngramLM = (Ngram *)makeMixLM(mixFile4, *vocab, classVocab,
order, ngramLM, mixLambda4,
mixLambda + mixLambda1 + mixLambda2 +
mixLambda3 + mixLambda4);
}
if (mixFile5) {
ngramLM = (Ngram *)makeMixLM(mixFile5, *vocab, classVocab,
order, ngramLM, mixLambda5,
mixLambda + mixLambda1 + mixLambda2 +
mixLambda3 + mixLambda4 + mixLambda5);
}
if (mixFile6) {
ngramLM = (Ngram *)makeMixLM(mixFile6, *vocab, classVocab,
order, ngramLM, mixLambda6,
mixLambda + mixLambda1 + mixLambda2 +
mixLambda3 + mixLambda4 + mixLambda5 +
mixLambda6);
}
if (mixFile7) {
ngramLM = (Ngram *)makeMixLM(mixFile7, *vocab, classVocab,
order, ngramLM, mixLambda7,
mixLambda + mixLambda1 + mixLambda2 +
mixLambda3 + mixLambda4 + mixLambda5 +
mixLambda6 + mixLambda7);
}
if (mixFile8) {
ngramLM = (Ngram *)makeMixLM(mixFile8, *vocab, classVocab,
order, ngramLM, mixLambda8,
mixLambda + mixLambda1 + mixLambda2 +
mixLambda3 + mixLambda4 + mixLambda5 +
mixLambda6 + mixLambda7 + mixLambda8);
}
if (mixFile9) {
// last component: cumulative prior is the full mass, 1.0
ngramLM = (Ngram *)makeMixLM(mixFile9, *vocab, classVocab,
order, ngramLM, mixLambda9, 1.0);
}
}
/*
* Renormalize before the optional steps below, in case input
* model needs it, and because class expansion and pruning already
* include normalization.
*/
if (renormalize) {
ngramLM->recomputeBOWs();
}
/*
* Read class definitions from command line AFTER the LM, so
* they can override embedded class definitions.
* NOTE(review): the (ClassNgram *) cast is unchecked; it is only
* safe when a class vocabulary was given so ngramLM really is a
* ClassNgram -- presumably guaranteed by option validation done
* before this chunk; confirm.
*/
if (classesFile != 0) {
File file(classesFile, "r");
((ClassNgram *)ngramLM)->readClasses(file);
}
if (expandClasses >= 0) {
/*
* Replace class ngram with equivalent word ngram
* expandClasses == 0 generates all ngrams
* expandClasses > 0 generates only ngrams up to given length
* (same unchecked ClassNgram downcast caveat as above)
*/
Ngram *newLM =
((ClassNgram *)ngramLM)->expand(expandClasses, expandExact);
newLM->debugme(debug);
// the class LM is no longer needed once expanded
delete ngramLM;
ngramLM = newLM;
}
// optional entropy-based pruning of low-impact probabilities
if (prune != 0.0) {
ngramLM->pruneProbs(prune, minprune);
}
if (pruneLowProbs) {
ngramLM->pruneLowProbs(minprune);
}
// the (possibly merged/expanded/pruned) N-gram is the LM to use below
useLM = ngramLM;
}
} else {
cerr << "need at least an -lm file specified\n";
exit(1);
}
/*
 * Create a Bayes (posterior-weighted) or log-linear mixture LM,
 * combining the base LM with up to 9 additional component LMs given
 * by -mix-lm .. -mix-lm9.
 *
 * mixLambda1, the weight of the first mixture component, is the
 * probability mass left over after all explicitly specified lambdas.
 * The last argument of each makeMixLM() call is the cumulative weight
 * of all components folded in so far (the prior of the partial
 * mixture); the final component passes 1.0, the full mass.  This
 * mirrors the static-interpolation chain earlier in main().
 *
 * FIX: the original code passed mixFile6 for the -mix-lm7, -mix-lm8
 * and -mix-lm9 components (copy/paste error), so those three models
 * were silently read from the -mix-lm6 file.  Each component now
 * reads its own file (mixFile7, mixFile8, mixFile9), matching the
 * static-interpolation chain above.
 */
if (mixFile && (loglinearMix || bayesLength >= 0)) {
    double mixLambda1 = 1.0 - mixLambda - mixLambda2 - mixLambda3 -
			      mixLambda4 - mixLambda5 - mixLambda6 -
			      mixLambda7 - mixLambda8 - mixLambda9;

    useLM = makeMixLM(mixFile, *vocab, classVocab, order, useLM,
		      mixLambda1,
		      mixLambda + mixLambda1);
    if (mixFile2) {
	useLM = makeMixLM(mixFile2, *vocab, classVocab, order, useLM,
			  mixLambda2,
			  mixLambda + mixLambda1 + mixLambda2);
    }
    if (mixFile3) {
	useLM = makeMixLM(mixFile3, *vocab, classVocab, order, useLM,
			  mixLambda3,
			  mixLambda + mixLambda1 + mixLambda2 +
			  mixLambda3);
    }
    if (mixFile4) {
	useLM = makeMixLM(mixFile4, *vocab, classVocab, order, useLM,
			  mixLambda4,
			  mixLambda + mixLambda1 + mixLambda2 +
			  mixLambda3 + mixLambda4);
    }
    if (mixFile5) {
	useLM = makeMixLM(mixFile5, *vocab, classVocab, order, useLM,
			  mixLambda5,
			  mixLambda + mixLambda1 + mixLambda2 +
			  mixLambda3 + mixLambda4 + mixLambda5);
    }
    if (mixFile6) {
	useLM = makeMixLM(mixFile6, *vocab, classVocab, order, useLM,
			  mixLambda6,
			  mixLambda + mixLambda1 + mixLambda2 +
			  mixLambda3 + mixLambda4 + mixLambda5 +
			  mixLambda6);
    }
    if (mixFile7) {
	/* was mixFile6 in the original -- wrong file */
	useLM = makeMixLM(mixFile7, *vocab, classVocab, order, useLM,
			  mixLambda7,
			  mixLambda + mixLambda1 + mixLambda2 +
			  mixLambda3 + mixLambda4 + mixLambda5 +
			  mixLambda6 + mixLambda7);
    }
    if (mixFile8) {
	/* was mixFile6 in the original -- wrong file */
	useLM = makeMixLM(mixFile8, *vocab, classVocab, order, useLM,
			  mixLambda8,
			  mixLambda + mixLambda1 + mixLambda2 +
			  mixLambda3 + mixLambda4 + mixLambda5 +
			  mixLambda6 + mixLambda7 + mixLambda8);
    }
    if (mixFile9) {
	/* was mixFile6 in the original -- wrong file; prior is full mass */
	useLM = makeMixLM(mixFile9, *vocab, classVocab, order, useLM,
			  mixLambda9, 1.0);
    }
}
// -cache: interpolate the LM with a unigram cache of recent words.
if (cache > 0) {
/*
* Create a mixture model with the cache lm as the second component,
* weighted by cacheLambda (the base LM gets 1 - cacheLambda... note
* the argument passed is 1.0 - cacheLambda; confirm BayesMix's
* parameter meaning against its declaration).
*/
CacheLM *cacheLM = new CacheLM(*vocab, cache);
assert(cacheLM != 0);
BayesMix *mixLM = new BayesMix(*vocab, *useLM, *cacheLM,
0, 1.0 - cacheLambda, 0.0);
assert(mixLM != 0);
useLM = mixLM;
useLM->debugme(debug);
}
// -dynamic: same pattern with a dynamically updated LM component.
if (dynamic) {
/*
* Create a mixture model with the dynamic lm as the second component
*/
DynamicLM *dynamicLM = new DynamicLM(*vocab);
assert(dynamicLM != 0);
BayesMix *mixLM = new BayesMix(*vocab, *useLM, *dynamicLM,
0, 1.0 - dynamicLambda, 0.0);
assert(mixLM != 0);
useLM = mixLM;
useLM->debugme(debug);
}
if (adaptMarginals != 0) {
/*
* Adapt base LM to adaptive marginals given by unigram LM.
* NOTE(review): adaptMargLM->read(file) and baseMargLM->read(file)
* return values are not checked here, unlike the main -lm read;
* a malformed marginals file is silently accepted.
*/
Ngram *adaptMargLM = new Ngram(*vocab, 1);
assert(adaptMargLM != 0);
{
// inner scope so the File is closed as soon as the read finishes
File file(adaptMarginals, "r");
adaptMargLM->debugme(debug);
adaptMargLM->read(file);
}
LM *baseMargLM;
if (baseMarginals == 0) {
// default: use the LM itself as the source of base marginals
baseMargLM = useLM;
} else {
baseMargLM = new Ngram(*vocab, 1);
assert(baseMargLM != 0);
File file(baseMarginals, "r");
baseMargLM->debugme(debug);
baseMargLM->read(file);
}
AdaptiveMarginals *adaptLM =
new AdaptiveMarginals(*vocab, *useLM, *baseMargLM,
*adaptMargLM, adaptMarginalsBeta);
if (adaptMarginalsRatios) {
adaptLM->computeRatios = true;
}
// NOTE(review): this assert comes after adaptLM was already
// dereferenced above; it is effectively too late to catch null.
assert(adaptLM != 0);
useLM = adaptLM;
useLM->debugme(debug);
}
/*
* Reverse words in scoring
*/
if (reverseSents) {
useLM->reverseWords = true;
}
/*
* Skip noise tags in scoring
*/
if (noiseVocabFile) {
File file(noiseVocabFile, "r");
useLM->noiseVocab.read(file);
}
if (noiseTag) { /* backward compatibility */
useLM->noiseVocab.addWord(noiseTag);
}
// -memuse: report the LM's memory statistics.
if (memuse) {
// NOTE(review): this local MemStats shadows the "memuse" option
// flag tested just above -- legal but confusing naming.
MemStats memuse;
useLM->memStats(memuse);
memuse.print();
}
/*
* Apply multiword wrapper if requested.
* NOTE(review): the (MultiwordVocab *) cast is unchecked; it is only
* safe if vocab was constructed as a MultiwordVocab when -multiwords
* was given -- presumably done earlier in main(); confirm.
*/
if (multiwords) {
useLM = new MultiwordLM(*(MultiwordVocab *)vocab, *useLM);
assert(useLM != 0);
useLM->debugme(debug);
}
/*
* Rescore N-gram probs in LM file
*/
if (rescoreNgramFile) {
// create new vocab to avoid including class and multiwords
// from the rescoring LM
SubVocab *ngramVocab = new SubVocab(*vocab);
assert(ngramVocab != 0);
// read N-gram to be rescored
Ngram *rescoreLM = new Ngram(*ngramVocab, order);
assert(rescoreLM != 0);
rescoreLM->debugme(debug);
File file(rescoreNgramFile, "r");
// NOTE(review): read() result unchecked here, unlike the -lm read
rescoreLM->read(file);
rescoreLM->rescoreProbs(*useLM);
// free memory for LMs used in rescoring; guard avoids double-delete
// when useLM and ngramLM are the same object
if (ngramLM != useLM) {
delete ngramLM;
ngramLM = 0;
}
delete useLM;
// use rescored LM below
useLM = rescoreLM;
}
/*
* Compute perplexity on a text file, if requested
*/
if (pplFile) {
File file(pplFile, "r");
TextStats stats;
/*
* Send perplexity info to stdout (debug output is redirected to
* cout for the duration of the computation, then back to cerr)
*/
useLM->dout(cout);
useLM->pplFile(file, stats, escape);
useLM->dout(cerr);
cout << "file " << pplFile << ": " << stats;
}
/*
* Compute perplexity on a count file, if requested
*/
if (countFile) {
TextStats stats;
File file(countFile, "r");
/*
* Send perplexity info to stdout (debug redirected as for -ppl);
* countOrder of 0 falls back to the LM order
*/
useLM->dout(cout);
useLM->pplCountsFile(file, countOrder ? countOrder : order,
stats, escape, countEntropy);
useLM->dout(cerr);
cout << "file " << countFile << ": " << stats;
}
/*
* Rescore N-best list, if requested
*/
if (nbestFile) {
rescoreNbest(*useLM, nbestFile, NULL);
}
/*
* Rescore multiple N-best lists: the control file contains one
* N-best filename per line.
*/
if (nbestFiles) {
File file(nbestFiles, "r");
char *line;
// intentional assignment-in-condition: iterate until EOF
while (line = file.getline()) {
char *fname = strtok(line, wordSeparators);
if (!fname) continue;
RefString sentid = idFromFilename(fname);
if (writeNbestDir) {
// build "<dir>/<sentid><gzip-suffix>" for the rescored output
makeArray(char, scoreFile,
strlen(writeNbestDir) + 1
+ strlen(sentid) + strlen(GZIP_SUFFIX) + 1);
sprintf(scoreFile, "%s/%s%s", writeNbestDir, sentid,
GZIP_SUFFIX);
rescoreNbest(*useLM, fname, scoreFile);
} else {
rescoreNbest(*useLM, fname, NULL);
}
}
}
/*
* Rescore stream of N-best hyps, if requested
*/
if (rescoreFile) {
File file(rescoreFile, "r");
LM *oldLM;
if (decipherLM) {
// recognizer LM whose scores are to be subtracted out
oldLM =
new DecipherNgram(*vocab, decipherOrder, !decipherNoBackoff);
assert(oldLM != 0);
oldLM->debugme(debug);
// NOTE(review): this inner File deliberately shadows the outer
// "file"; it closes at the end of this if-block, before the
// outer rescore stream is consumed below.
File file(decipherLM, "r");
if (!oldLM->read(file, limitVocab)) {
cerr << "format error in Decipher LM\n";
exit(1);
}
} else {
/*
* Create dummy LM for the sake of rescoreFile()
*/
oldLM = new NullLM(*vocab);
assert(oldLM != 0);
}
useLM->rescoreFile(file, rescoreLMW, rescoreWTW,
*oldLM, decipherLMW, decipherWTW, escape);
#ifdef DEBUG
// only bother freeing in DEBUG builds; the process exits below anyway
delete oldLM;
#endif
}
// -gen: sample the requested number of random sentences from the LM.
if (generateSents) {
File outFile(stdout);
unsigned i;
for (i = 0; i < generateSents; i++) {
VocabString *sent = useLM->generateSentence(maxWordsPerLine,
(VocabString *)0);
Vocab::write(outFile, sent);
putchar('\n');
}
}
// -write-lm: dump the final LM to file.
if (writeLM) {
File file(writeLM, "w");
useLM->write(file);
}
// -write-vocab: dump the vocabulary to file.
if (writeVocab) {
File file(writeVocab, "w");
vocab->write(file);
}
#ifdef DEBUG
/*
 * Explicit teardown is compiled only in DEBUG builds (e.g. for leak
 * checking); release builds just exit(0) and let the OS reclaim
 * memory.  The guards avoid deleting the same object twice.
 */
if (&ngramLM->vocab != vocab) {
delete &ngramLM->vocab;
}
if (ngramLM != useLM) {
delete ngramLM;
}
delete useLM;
delete stopWords;
delete hiddenEvents;
delete classVocab;
delete vocab;
return 0;
#endif /* DEBUG */
exit(0);
}