// lattice-tool.cc  (fragment — extracted from a web code viewer; original header not shown)
lm->debugme(debug);
if (!lm->read(file, limitVocab)) {
cerr << "format error in mix-lm file " << filename << endl;
exit(1);
}
/*
* Each class LM needs to read the class definitions
*/
if (classesFile != 0) {
File file(classesFile, "r");
((ClassNgram *)lm)->readClasses(file);
}
if (oldLM) {
/*
* Compute mixture lambda (make sure 0/0 = 0)
*/
Prob lambda = (lambda1 == 0.0) ? 0.0 : lambda1/lambda2;
LM *newLM = new BayesMix(vocab, *lm, *oldLM, 0, lambda);
assert(newLM != 0);
newLM->debugme(debug);
return newLM;
} else {
return lm;
}
}
/*
 * lattice-tool driver: parses options, builds the vocabulary and LM
 * (plain, factored, class-based, and/or Bayes-mixture, optionally wrapped
 * for multiwords), then processes one lattice or a list of lattices.
 * Exits non-zero on usage or format errors.
 */
int
main (int argc, char *argv[])
{
    setlocale(LC_CTYPE, "");
    Opt_Parse(argc, argv, options, Opt_Number(options), 0);

    if (version) {
	printVersion(RcsId);
	exit(0);
    }

    if (!inLattice && !inLatticeList) {
	cerr << "need to specify at least one input file!\n";
	exit(2);	// BUGFIX: was "return 0", signalling success on a usage error
    }

    /*
     * Option-compatibility checks: factored LMs are incompatible with
     * class definitions and multiword processing; several LM wrappers
     * cannot be combined with the old compact trigram expansion.
     */
    if (factored &&
	(classesFile ||
	 splitMultiwords || splitMultiwordsAfterLM || useMultiwordLM))
    {
	cerr << "factored LMs cannot be used with class definitions or multiwords\n";
	exit(2);
    }

    if ((factored || classesFile || mixFile || useMultiwordLM) &&
	oldExpansion && compactExpansion)
    {
	cerr << "cannot use factored LM, class-ngram LM, mixture LM, or multiword LM wrapper for old compact trigram expansion\n";
	exit(2);
    }

    /* note: all three of -hidden-vocab, -dictionary, -dictionary-align
     * must be given for this conflict to trigger */
    if (hiddenVocabFile && dictFile && dictAlign) {
	cerr << "cannot use both -hidden-vocab and -dictionary-align, choose one\n";
	exit(2);
    }

    /*
     * Use multiword vocab in case we need it for -multiwords processing
     */
    Vocab *vocab;
    if (factored) {
	vocab = new ProductVocab;
    } else if (splitMultiwords || splitMultiwordsAfterLM || useMultiwordLM) {
	vocab = new MultiwordVocab(multiChar);
    } else {
	vocab = new Vocab;
    }
    assert(vocab != 0);

    vocab->unkIsWord() = useUnk ? true : false;
    vocab->toLower() = toLower ? true : false;
    if (factored) {
	((ProductVocab *)vocab)->nullIsWord() = keepNullFactors ? true : false;
    }

    /*
     * Change unknown word string if requested
     */
    if (mapUnknown) {
	vocab->remove(vocab->unkIndex());
	vocab->unkIndex() = vocab->addWord(mapUnknown);
    }

    /*
     * Read predefined vocabulary
     * (required by -limit-vocab and useful with -unk)
     */
    if (vocabFile) {
	File file(vocabFile, "r");
	vocab->read(file);
    }

    if (noneventFile) {
	/*
	 * If pause is treated as a regular word also don't consider it a
	 * non-event for LM purposes.
	 */
	if (keepPause) {
	    vocab->removeNonEvent(vocab->pauseIndex());
	}

	/*
	 * create temporary sub-vocabulary for non-event words
	 */
	SubVocab nonEvents(*vocab);
	File file(noneventFile, "r");
	nonEvents.read(file);
	vocab->addNonEvents(nonEvents);
    }

    /*
     * Optionally read a subvocabulary that is to be kept separate from
     * regular words during alignment
     */
    SubVocab hiddenVocab(*vocab);
    if (hiddenVocabFile) {
	File file(hiddenVocabFile, "r");
	hiddenVocab.read(file);
    }

    Ngram *ngram;

    /*
     * create base N-gram model (either factored, class- or word-based)
     */
    SubVocab *classVocab = 0;
    if (factored) {
	ngram = new ProductNgram(*(ProductVocab *)vocab, order);
    } else if (classesFile) {
	classVocab = new SubVocab(*vocab);
	assert(classVocab != 0);

	if (simpleClasses) {
	    ngram = new SimpleClassNgram(*vocab, *classVocab, order);
	} else {
	    ngram = new ClassNgram(*vocab, *classVocab, order);
	    if (order > 2 && !oldExpansion) {
		cerr << "warning: general class LM does not allow efficient lattice expansion; consider using -simple-classes\n";
	    }
	}
    } else {
	ngram = new Ngram(*vocab, order);
    }
    assert(ngram != 0);
    ngram->debugme(debug);

    if (lmFile) {
	File file1(lmFile, "r");
	if (!ngram->read(file1, limitVocab)) {
	    cerr << "format error in lm file\n";
	    exit(1);
	}
    }

    /* class-based LMs additionally need the class expansion definitions */
    if (classesFile) {
	File file(classesFile, "r");
	((ClassNgram *)ngram)->readClasses(file);
    }

    SubVocab noiseVocab(*vocab);
    // -pau- is ignored in WER computation by default
    if (!keepPause) {
	noiseVocab.addWord(vocab->pauseIndex());
    }
    // read additional "noise" words to be ignored in WER computation
    if (noiseVocabFile) {
	File file(noiseVocabFile, "r");
	noiseVocab.read(file);
    }

    SubVocab ignoreVocab(*vocab);
    // read additional words to ignore
    if (ignoreVocabFile) {
	File file(ignoreVocabFile, "r");
	ignoreVocab.read(file);
    } else if (!keepPause) {
	// -pau- is ignored by default
	ignoreVocab.addWord(vocab->pauseIndex());
    }

    /*
     * Prepare dictionary for pronunciation scoring
     */
    Vocab dictVocab;
    VocabMultiMap dictionary(*vocab, dictVocab, intlogs);

    if (dictFile) {
	File file(dictFile, "r");
	if (!dictionary.read(file)) {
	    cerr << "format error in dictionary file\n";
	    exit(1);
	}
    }

    /*
     * Build the LM used for lattice expansion
     */
    LM *useLM = ngram;

    if (mixFile) {
	/*
	 * create a Bayes mixture LM; the first component's weight is
	 * whatever probability mass the explicit lambdas leave over
	 */
	double mixLambda1 = 1.0 - mixLambda - mixLambda2 - mixLambda3 -
			    mixLambda4 - mixLambda5 - mixLambda6 -
			    mixLambda7 - mixLambda8 - mixLambda9;

	useLM = makeMixLM(mixFile, *vocab, classVocab, order, useLM,
			  mixLambda1,
			  mixLambda + mixLambda1);

	if (mixFile2) {
	    useLM = makeMixLM(mixFile2, *vocab, classVocab, order, useLM,
			      mixLambda2,
			      mixLambda + mixLambda1 + mixLambda2);
	}
	if (mixFile3) {
	    useLM = makeMixLM(mixFile3, *vocab, classVocab, order, useLM,
			      mixLambda3,
			      mixLambda + mixLambda1 + mixLambda2 +
			      mixLambda3);
	}
	if (mixFile4) {
	    useLM = makeMixLM(mixFile4, *vocab, classVocab, order, useLM,
			      mixLambda4,
			      mixLambda + mixLambda1 + mixLambda2 +
			      mixLambda3 + mixLambda4);
	}
	if (mixFile5) {
	    useLM = makeMixLM(mixFile5, *vocab, classVocab, order, useLM,
			      mixLambda5,
			      mixLambda + mixLambda1 + mixLambda2 +
			      mixLambda3 + mixLambda4 + mixLambda5);
	}
	if (mixFile6) {
	    useLM = makeMixLM(mixFile6, *vocab, classVocab, order, useLM,
			      mixLambda6,
			      mixLambda + mixLambda1 + mixLambda2 +
			      mixLambda3 + mixLambda4 + mixLambda5 +
			      mixLambda6);
	}
	if (mixFile7) {
	    // BUGFIX: was reading mixFile6 again instead of mixFile7
	    useLM = makeMixLM(mixFile7, *vocab, classVocab, order, useLM,
			      mixLambda7,
			      mixLambda + mixLambda1 + mixLambda2 +
			      mixLambda3 + mixLambda4 + mixLambda5 +
			      mixLambda6 + mixLambda7);
	}
	if (mixFile8) {
	    // BUGFIX: was reading mixFile6 again instead of mixFile8
	    useLM = makeMixLM(mixFile8, *vocab, classVocab, order, useLM,
			      mixLambda8,
			      mixLambda + mixLambda1 + mixLambda2 +
			      mixLambda3 + mixLambda4 + mixLambda5 +
			      mixLambda6 + mixLambda7 + mixLambda8);
	}
	if (mixFile9) {
	    // BUGFIX: was reading mixFile6 again instead of mixFile9
	    useLM = makeMixLM(mixFile9, *vocab, classVocab, order, useLM,
			      mixLambda9, 1.0);
	}
    }

    /*
     * create multiword wrapper around LM so far, if requested
     */
    if (useMultiwordLM) {
	useLM = new MultiwordLM((MultiwordVocab &)*vocab, *useLM);
	assert(useLM != 0);
    }

    /*
     * Read the second lattice operand, required by lattice -operation
     */
    Lattice *lattice2 = 0;
    if (inLattice2) {
	lattice2 = new Lattice(*vocab);
	lattice2->debugme(debug);

	File file(inLattice2, "r");
	if (!(readHTK ? lattice2->readHTK(file, &htkparms, useHTKnulls)
		      : lattice2->readPFSGs(file)))
	{
	    cerr << "error reading second lattice operand\n";
	    exit(1);
	}
    } else {
	if (operation) {
	    cerr << "lattice operation needs second lattice\n";
	    exit(2);
	}
    }

    /*
     * create Ngram count trie
     */
    NgramCounts<FloatCount> ngramCounts(*vocab, order);

    RefList reflist(*vocab, refList);
    if (refList || refFile) {
	File file1(refList ? refList : refFile, "r");
	reflist.read(file1, true);	// add ref words to vocabulary
    }
    if (writeRefs) {
	File file1(writeRefs, "w");
	reflist.write(file1);
    }

    if (inLattice) {
	VocabIndex *refVocabIndex = 0;
	if (refList) {
	    refVocabIndex = reflist.findRef(idFromFilename(inLattice));
	} else if (refFile) {
	    refVocabIndex = reflist.findRefByNumber(0);
	}

	/*
	 * BUGFIX: allocate room for dir + '/' + 1023 chars + NUL
	 * (the old size of strlen(dir)+1024 was one byte short)
	 */
	unsigned fileNameSize =
	    outLatticeDir ? strlen(outLatticeDir) + 1026
			  : strlen(LATTICE_NONAME) + 1;
	makeArray(char, fileName, fileNameSize);

	if (!outLattice && outLatticeDir) {
	    char *sentid = strrchr(inLattice, '/');
	    if (sentid != NULL) {
		sentid += 1;
	    } else {
		sentid = inLattice;
	    }
	    snprintf(fileName, fileNameSize, "%s/%.1023s",
		     outLatticeDir, sentid);
	} else {
	    // make sure we have some name
	    strcpy(fileName, LATTICE_NONAME);
	}

	processLattice(inLattice, outLattice ? outLattice : fileName, lattice2,
		       ngramCounts, *useLM, *vocab, hiddenVocab, dictionary,
		       ignoreVocab, noiseVocab, refVocabIndex);
    }

    if (inLatticeList) {
	if (outLatticeDir) {
	    if (MKDIR(outLatticeDir) < 0) {
		if (errno == EEXIST) {
		    if (!overwrite) {
			cerr << "Dir " << outLatticeDir
			     << " already exists, please give another one\n";
			exit(2);
		    }
		} else {
		    perror(outLatticeDir);
		    exit(1);
		}
	    }
	}

	File listOfFiles(inLatticeList, "r");

	// BUGFIX: sized for dir + '/' + 1023 chars + NUL (was one byte short)
	unsigned fileNameSize =
	    outLatticeDir ? strlen(outLatticeDir) + 1026 : 1;
	makeArray(char, fileName, fileNameSize);

	int flag;
	char buffer[1024];
	unsigned latticeCount = 0;

	/*
	 * BUGFIX: field width must be 1023 (was 1024) so fscanf's
	 * terminating NUL fits inside buffer[1024]
	 */
	while ((flag = fscanf(listOfFiles, " %1023s", buffer)) == 1) {
	    cerr << "processing file " << buffer << "\n";

	    // derive the sentence id from the basename of the lattice file
	    char *sentid = strrchr(buffer, '/');
	    if (sentid != NULL) {
		sentid += 1;
	    } else {
		sentid = buffer;
	    }

	    if (outLatticeDir) {
		snprintf(fileName, fileNameSize, "%s/%s",
			 outLatticeDir, sentid);
		cerr << " to be dumped to " << fileName << "\n";
	    } else {
		fileName[0] = '\0';
	    }

	    VocabIndex *refVocabIndex = 0;
	    if (refList) {
		refVocabIndex = reflist.findRef(idFromFilename(buffer));
	    } else if (refFile) {
		// refs in a plain file are matched by position in the list
		refVocabIndex = reflist.findRefByNumber(latticeCount);
	    }

	    processLattice(buffer, fileName, lattice2, ngramCounts,
			   *useLM, *vocab,
			   hiddenVocab, dictionary,
			   ignoreVocab, noiseVocab, refVocabIndex);
	    latticeCount ++;
	}
    }

    if (writeNgrams) {
	/*
	 * prune counts if specified
	 */
	if (minCount > 0.0) {
	    ngramCounts.pruneCounts(minCount);
	}

	File file(writeNgrams, "w");

	if (debug >= DebugPrintFunctionality) {
	    cerr << "writing ngram counts to " << writeNgrams << endl;
	}
	ngramCounts.write(file, 0, true);
    }

    delete vocab;
    delete classVocab;

    exit(0);
}
/*
 * (Trailing code-viewer UI chrome removed — keyboard-shortcut help
 * text from the web page this file was extracted from, not source code.)
 */