// lattice-tool.cc  (fragment — extracted from a web code viewer; original header not shown)
lm->debugme(debug);
if (!lm->read(file, limitVocab)) {
cerr << "format error in mix-lm file " << filename << endl;
exit(1);
}
/*
* Each class LM needs to read the class definitions
*/
if (classesFile != 0) {
File file(classesFile, "r");
((ClassNgram *)lm)->readClasses(file);
}
if (oldLM) {
/*
* Compute mixture lambda (make sure 0/0 = 0)
*/
Prob lambda = (lambda1 == 0.0) ? 0.0 : lambda1/lambda2;
LM *newLM = new BayesMix(vocab, *lm, *oldLM, 0, lambda);
assert(newLM != 0);
newLM->debugme(debug);
return newLM;
} else {
return lm;
}
}
/*
 * lattice-tool driver: parses options, builds the vocabulary and LM
 * (plain, factored, class-based, and/or Bayes-mixture, optionally wrapped
 * for multiwords), then processes one lattice or a list of lattices.
 * Exits non-zero on usage or format errors.
 */
int
main (int argc, char *argv[])
{
    setlocale(LC_CTYPE, "");
    Opt_Parse(argc, argv, options, Opt_Number(options), 0);

    if (version) {
	printVersion(RcsId);
	exit(0);
    }

    if (!inLattice && !inLatticeList) {
	cerr << "need to specify at least one input file!\n";
	exit(2);	// BUGFIX: was "return 0", signalling success on a usage error
    }

    /*
     * Option-compatibility checks: factored LMs are incompatible with
     * class definitions and multiword processing; several LM wrappers
     * cannot be combined with the old compact trigram expansion.
     */
    if (factored &&
	(classesFile ||
	 splitMultiwords || splitMultiwordsAfterLM || useMultiwordLM))
    {
	cerr << "factored LMs cannot be used with class definitions or multiwords\n";
	exit(2);
    }

    if ((factored || classesFile || mixFile || useMultiwordLM) &&
	oldExpansion && compactExpansion)
    {
	cerr << "cannot use factored LM, class-ngram LM, mixture LM, or multiword LM wrapper for old compact trigram expansion\n";
	exit(2);
    }

    /* note: all three of -hidden-vocab, -dictionary, -dictionary-align
     * must be given for this conflict to trigger */
    if (hiddenVocabFile && dictFile && dictAlign) {
	cerr << "cannot use both -hidden-vocab and -dictionary-align, choose one\n";
	exit(2);
    }

    /*
     * Use multiword vocab in case we need it for -multiwords processing
     */
    Vocab *vocab;
    if (factored) {
	vocab = new ProductVocab;
    } else if (splitMultiwords || splitMultiwordsAfterLM || useMultiwordLM) {
	vocab = new MultiwordVocab(multiChar);
    } else {
	vocab = new Vocab;
    }
    assert(vocab != 0);

    vocab->unkIsWord() = useUnk ? true : false;
    vocab->toLower() = toLower ? true : false;
    if (factored) {
	((ProductVocab *)vocab)->nullIsWord() = keepNullFactors ? true : false;
    }

    /*
     * Change unknown word string if requested
     */
    if (mapUnknown) {
	vocab->remove(vocab->unkIndex());
	vocab->unkIndex() = vocab->addWord(mapUnknown);
    }

    /*
     * Read predefined vocabulary
     * (required by -limit-vocab and useful with -unk)
     */
    if (vocabFile) {
	File file(vocabFile, "r");
	vocab->read(file);
    }

    if (noneventFile) {
	/*
	 * If pause is treated as a regular word also don't consider it a
	 * non-event for LM purposes.
	 */
	if (keepPause) {
	    vocab->removeNonEvent(vocab->pauseIndex());
	}

	/*
	 * create temporary sub-vocabulary for non-event words
	 */
	SubVocab nonEvents(*vocab);
	File file(noneventFile, "r");
	nonEvents.read(file);
	vocab->addNonEvents(nonEvents);
    }

    /*
     * Optionally read a subvocabulary that is to be kept separate from
     * regular words during alignment
     */
    SubVocab hiddenVocab(*vocab);
    if (hiddenVocabFile) {
	File file(hiddenVocabFile, "r");
	hiddenVocab.read(file);
    }

    Ngram *ngram;

    /*
     * create base N-gram model (either factored, class- or word-based)
     */
    SubVocab *classVocab = 0;
    if (factored) {
	ngram = new ProductNgram(*(ProductVocab *)vocab, order);
    } else if (classesFile) {
	classVocab = new SubVocab(*vocab);
	assert(classVocab != 0);

	if (simpleClasses) {
	    ngram = new SimpleClassNgram(*vocab, *classVocab, order);
	} else {
	    ngram = new ClassNgram(*vocab, *classVocab, order);
	    if (order > 2 && !oldExpansion) {
		cerr << "warning: general class LM does not allow efficient lattice expansion; consider using -simple-classes\n";
	    }
	}
    } else {
	ngram = new Ngram(*vocab, order);
    }
    assert(ngram != 0);
    ngram->debugme(debug);

    if (lmFile) {
	File file1(lmFile, "r");
	if (!ngram->read(file1, limitVocab)) {
	    cerr << "format error in lm file\n";
	    exit(1);
	}
    }

    /* class-based LMs additionally need the class expansion definitions */
    if (classesFile) {
	File file(classesFile, "r");
	((ClassNgram *)ngram)->readClasses(file);
    }

    SubVocab noiseVocab(*vocab);
    // -pau- is ignored in WER computation by default
    if (!keepPause) {
	noiseVocab.addWord(vocab->pauseIndex());
    }
    // read additional "noise" words to be ignored in WER computation
    if (noiseVocabFile) {
	File file(noiseVocabFile, "r");
	noiseVocab.read(file);
    }

    SubVocab ignoreVocab(*vocab);
    // read additional words to ignore
    if (ignoreVocabFile) {
	File file(ignoreVocabFile, "r");
	ignoreVocab.read(file);
    } else if (!keepPause) {
	// -pau- is ignored by default
	ignoreVocab.addWord(vocab->pauseIndex());
    }

    /*
     * Prepare dictionary for pronunciation scoring
     */
    Vocab dictVocab;
    VocabMultiMap dictionary(*vocab, dictVocab, intlogs);

    if (dictFile) {
	File file(dictFile, "r");
	if (!dictionary.read(file)) {
	    cerr << "format error in dictionary file\n";
	    exit(1);
	}
    }

    /*
     * Build the LM used for lattice expansion
     */
    LM *useLM = ngram;

    if (mixFile) {
	/*
	 * create a Bayes mixture LM; the first component's weight is
	 * whatever probability mass the explicit lambdas leave over
	 */
	double mixLambda1 = 1.0 - mixLambda - mixLambda2 - mixLambda3 -
			    mixLambda4 - mixLambda5 - mixLambda6 -
			    mixLambda7 - mixLambda8 - mixLambda9;

	useLM = makeMixLM(mixFile, *vocab, classVocab, order, useLM,
			  mixLambda1,
			  mixLambda + mixLambda1);

	if (mixFile2) {
	    useLM = makeMixLM(mixFile2, *vocab, classVocab, order, useLM,
			      mixLambda2,
			      mixLambda + mixLambda1 + mixLambda2);
	}
	if (mixFile3) {
	    useLM = makeMixLM(mixFile3, *vocab, classVocab, order, useLM,
			      mixLambda3,
			      mixLambda + mixLambda1 + mixLambda2 +
			      mixLambda3);
	}
	if (mixFile4) {
	    useLM = makeMixLM(mixFile4, *vocab, classVocab, order, useLM,
			      mixLambda4,
			      mixLambda + mixLambda1 + mixLambda2 +
			      mixLambda3 + mixLambda4);
	}
	if (mixFile5) {
	    useLM = makeMixLM(mixFile5, *vocab, classVocab, order, useLM,
			      mixLambda5,
			      mixLambda + mixLambda1 + mixLambda2 +
			      mixLambda3 + mixLambda4 + mixLambda5);
	}
	if (mixFile6) {
	    useLM = makeMixLM(mixFile6, *vocab, classVocab, order, useLM,
			      mixLambda6,
			      mixLambda + mixLambda1 + mixLambda2 +
			      mixLambda3 + mixLambda4 + mixLambda5 +
			      mixLambda6);
	}
	if (mixFile7) {
	    // BUGFIX: was reading mixFile6 again instead of mixFile7
	    useLM = makeMixLM(mixFile7, *vocab, classVocab, order, useLM,
			      mixLambda7,
			      mixLambda + mixLambda1 + mixLambda2 +
			      mixLambda3 + mixLambda4 + mixLambda5 +
			      mixLambda6 + mixLambda7);
	}
	if (mixFile8) {
	    // BUGFIX: was reading mixFile6 again instead of mixFile8
	    useLM = makeMixLM(mixFile8, *vocab, classVocab, order, useLM,
			      mixLambda8,
			      mixLambda + mixLambda1 + mixLambda2 +
			      mixLambda3 + mixLambda4 + mixLambda5 +
			      mixLambda6 + mixLambda7 + mixLambda8);
	}
	if (mixFile9) {
	    // BUGFIX: was reading mixFile6 again instead of mixFile9
	    useLM = makeMixLM(mixFile9, *vocab, classVocab, order, useLM,
			      mixLambda9, 1.0);
	}
    }

    /*
     * create multiword wrapper around LM so far, if requested
     */
    if (useMultiwordLM) {
	useLM = new MultiwordLM((MultiwordVocab &)*vocab, *useLM);
	assert(useLM != 0);
    }

    /*
     * Read the second lattice operand, required by lattice -operation
     */
    Lattice *lattice2 = 0;
    if (inLattice2) {
	lattice2 = new Lattice(*vocab);
	lattice2->debugme(debug);

	File file(inLattice2, "r");
	if (!(readHTK ? lattice2->readHTK(file, &htkparms, useHTKnulls)
		      : lattice2->readPFSGs(file)))
	{
	    cerr << "error reading second lattice operand\n";
	    exit(1);
	}
    } else {
	if (operation) {
	    cerr << "lattice operation needs second lattice\n";
	    exit(2);
	}
    }

    /*
     * create Ngram count trie
     */
    NgramCounts<FloatCount> ngramCounts(*vocab, order);

    RefList reflist(*vocab, refList);
    if (refList || refFile) {
	File file1(refList ? refList : refFile, "r");
	reflist.read(file1, true);	// add ref words to vocabulary
    }
    if (writeRefs) {
	File file1(writeRefs, "w");
	reflist.write(file1);
    }

    if (inLattice) {
	VocabIndex *refVocabIndex = 0;
	if (refList) {
	    refVocabIndex = reflist.findRef(idFromFilename(inLattice));
	} else if (refFile) {
	    refVocabIndex = reflist.findRefByNumber(0);
	}

	/*
	 * BUGFIX: allocate room for dir + '/' + 1023 chars + NUL
	 * (the old size of strlen(dir)+1024 was one byte short)
	 */
	unsigned fileNameSize =
	    outLatticeDir ? strlen(outLatticeDir) + 1026
			  : strlen(LATTICE_NONAME) + 1;
	makeArray(char, fileName, fileNameSize);

	if (!outLattice && outLatticeDir) {
	    char *sentid = strrchr(inLattice, '/');
	    if (sentid != NULL) {
		sentid += 1;
	    } else {
		sentid = inLattice;
	    }
	    snprintf(fileName, fileNameSize, "%s/%.1023s",
		     outLatticeDir, sentid);
	} else {
	    // make sure we have some name
	    strcpy(fileName, LATTICE_NONAME);
	}

	processLattice(inLattice, outLattice ? outLattice : fileName, lattice2,
		       ngramCounts, *useLM, *vocab, hiddenVocab, dictionary,
		       ignoreVocab, noiseVocab, refVocabIndex);
    }

    if (inLatticeList) {
	if (outLatticeDir) {
	    if (MKDIR(outLatticeDir) < 0) {
		if (errno == EEXIST) {
		    if (!overwrite) {
			cerr << "Dir " << outLatticeDir
			     << " already exists, please give another one\n";
			exit(2);
		    }
		} else {
		    perror(outLatticeDir);
		    exit(1);
		}
	    }
	}

	File listOfFiles(inLatticeList, "r");

	// BUGFIX: sized for dir + '/' + 1023 chars + NUL (was one byte short)
	unsigned fileNameSize =
	    outLatticeDir ? strlen(outLatticeDir) + 1026 : 1;
	makeArray(char, fileName, fileNameSize);

	int flag;
	char buffer[1024];
	unsigned latticeCount = 0;

	/*
	 * BUGFIX: field width must be 1023 (was 1024) so fscanf's
	 * terminating NUL fits inside buffer[1024]
	 */
	while ((flag = fscanf(listOfFiles, " %1023s", buffer)) == 1) {
	    cerr << "processing file " << buffer << "\n";

	    // derive the sentence id from the basename of the lattice file
	    char *sentid = strrchr(buffer, '/');
	    if (sentid != NULL) {
		sentid += 1;
	    } else {
		sentid = buffer;
	    }

	    if (outLatticeDir) {
		snprintf(fileName, fileNameSize, "%s/%s",
			 outLatticeDir, sentid);
		cerr << " to be dumped to " << fileName << "\n";
	    } else {
		fileName[0] = '\0';
	    }

	    VocabIndex *refVocabIndex = 0;
	    if (refList) {
		refVocabIndex = reflist.findRef(idFromFilename(buffer));
	    } else if (refFile) {
		// refs in a plain file are matched by position in the list
		refVocabIndex = reflist.findRefByNumber(latticeCount);
	    }

	    processLattice(buffer, fileName, lattice2, ngramCounts,
			   *useLM, *vocab,
			   hiddenVocab, dictionary,
			   ignoreVocab, noiseVocab, refVocabIndex);
	    latticeCount ++;
	}
    }

    if (writeNgrams) {
	/*
	 * prune counts if specified
	 */
	if (minCount > 0.0) {
	    ngramCounts.pruneCounts(minCount);
	}

	File file(writeNgrams, "w");

	if (debug >= DebugPrintFunctionality) {
	    cerr << "writing ngram counts to " << writeNgrams << endl;
	}
	ngramCounts.write(file, 0, true);
    }

    delete vocab;
    delete classVocab;

    exit(0);
}
/*
 * (Trailing code-viewer UI chrome removed — keyboard-shortcut help
 * text from the web page this file was extracted from, not source code.)
 */