/*
* ngram --
* Create and manipulate ngram (and related) models
*/
#ifndef lint
static char Copyright[] = "Copyright (c) 1995-2006 SRI International. All Rights Reserved.";
static char RcsId[] = "@(#)$Id: ngram.cc,v 1.85 2006/01/09 18:08:21 stolcke Exp $";
#endif
#include <iostream>
using namespace std;
#include <stdio.h>
#include <stdlib.h>
#include <locale.h>
#include <assert.h>
#ifndef _MSC_VER
#include <unistd.h>
#endif
#include <string.h>
#include <time.h>
extern "C" {
void srand48(long); /* might be missing from math.h or stdlib.h */
}
#include "option.h"
#include "version.h"
#include "zio.h"
#include "File.h"
#include "Vocab.h"
#include "SubVocab.h"
#include "MultiwordVocab.h"
#include "MultiwordLM.h"
#include "NBest.h"
#include "TaggedVocab.h"
#include "Ngram.h"
#include "TaggedNgram.h"
#include "StopNgram.h"
#include "ClassNgram.h"
#include "SimpleClassNgram.h"
#include "DFNgram.h"
#include "SkipNgram.h"
#include "HiddenNgram.h"
#include "HiddenSNgram.h"
#include "NullLM.h"
#include "BayesMix.h"
#include "LoglinearMix.h"
#include "AdaptiveMix.h"
#include "AdaptiveMarginals.h"
#include "CacheLM.h"
#include "DynamicLM.h"
#include "DecipherNgram.h"
#include "HMMofNgrams.h"
#include "RefList.h"
#include "ProductNgram.h"
#include "Array.cc"
static int version = 0;
static unsigned order = defaultNgramOrder;
static unsigned debug = 0;
static char *pplFile = 0;
static char *escape = 0;
static char *countFile = 0;
static int countEntropy = 0;
static unsigned countOrder = 0;
static char *vocabFile = 0;
static char *noneventFile = 0;
static int limitVocab = 0;
static char *lmFile = 0;
static char *mixFile = 0;
static char *mixFile2 = 0;
static char *mixFile3 = 0;
static char *mixFile4 = 0;
static char *mixFile5 = 0;
static char *mixFile6 = 0;
static char *mixFile7 = 0;
static char *mixFile8 = 0;
static char *mixFile9 = 0;
static int bayesLength = -1; /* marks unset option */
static double bayesScale = 1.0;
static double mixLambda = 0.5;
static double mixLambda2 = 0.0;
static double mixLambda3 = 0.0;
static double mixLambda4 = 0.0;
static double mixLambda5 = 0.0;
static double mixLambda6 = 0.0;
static double mixLambda7 = 0.0;
static double mixLambda8 = 0.0;
static double mixLambda9 = 0.0;
static int loglinearMix = 0;
static int reverseSents = 0;
static char *writeLM = 0;
static char *writeVocab = 0;
static int memuse = 0;
static int renormalize = 0;
static double prune = 0.0;
static int pruneLowProbs = 0;
static int minprune = 2;
static int skipOOVs = 0;
static unsigned generateSents = 0;
static int seed = 0; /* default dynamically generated in main() */
static int df = 0;
static int skipNgram = 0;
static int hiddenS = 0;
static char *hiddenVocabFile = 0;
static int hiddenNot = 0;
static char *classesFile = 0;
static int simpleClasses = 0;
static int expandClasses = -1;
static unsigned expandExact = 0;
static int tagged = 0;
static int factored = 0;
static int toLower = 0;
static int multiwords = 0;
static int splitMultiwords = 0;
static int keepunk = 0;
static int keepnull = 1;
static char *mapUnknown = 0;
static int null = 0;
static unsigned cache = 0;
static double cacheLambda = 0.05;
static int dynamic = 0;
static double dynamicLambda = 0.05;
static char *noiseTag = 0;
static char *noiseVocabFile = 0;
static char *stopWordFile = 0;
static int decipherHack = 0;
static int hmm = 0;
static int adaptMix = 0;
static double adaptDecay = 1.0;
static unsigned adaptIters = 2;
static char *adaptMarginals = 0;
static double adaptMarginalsBeta = 0.5;
static int adaptMarginalsRatios = 0;
static char *baseMarginals = 0;
static char *rescoreNgramFile = 0;
/*
* N-Best related variables
*/
static char *nbestFile = 0;
static char *nbestFiles = 0;
static char *writeNbestDir = 0;
static int writeDecipherNbest = 0;
static int noReorder = 0;
static unsigned maxNbest = 0;
static char *rescoreFile = 0;
static char *decipherLM = 0;
static unsigned decipherOrder = 2;
static int decipherNoBackoff = 0;
static double decipherLMW = 8.0;
static double decipherWTW = 0.0;
static double rescoreLMW = 8.0;
static double rescoreWTW = 0.0;
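/*
 * Command-line option table, processed by Opt_Parse() in main()
 */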
static Option options[] = {
{ OPT_TRUE, "version", &version, "print version information" },
{ OPT_UINT, "order", &order, "max ngram order" },
{ OPT_UINT, "debug", &debug, "debugging level for lm" },
{ OPT_TRUE, "skipoovs", &skipOOVs, "skip n-gram contexts containing OOVs" },
{ OPT_TRUE, "df", &df, "use disfluency ngram model" },
{ OPT_TRUE, "tagged", &tagged, "use a tagged LM" },
{ OPT_TRUE, "factored", &factored, "use a factored LM" },
{ OPT_TRUE, "skip", &skipNgram, "use skip ngram model" },
{ OPT_TRUE, "hiddens", &hiddenS, "use hidden sentence ngram model" },
{ OPT_STRING, "hidden-vocab", &hiddenVocabFile, "hidden ngram vocabulary" },
{ OPT_TRUE, "hidden-not", &hiddenNot, "process overt hidden events" },
{ OPT_STRING, "classes", &classesFile, "class definitions" },
{ OPT_TRUE, "simple-classes", &simpleClasses, "use unique class model" },
{ OPT_INT, "expand-classes", &expandClasses, "expand class-model into word-model" },
{ OPT_UINT, "expand-exact", &expandExact, "compute expanded ngrams longer than this exactly" },
{ OPT_STRING, "stop-words", &stopWordFile, "stop-word vocabulary for stop-Ngram LM" },
{ OPT_TRUE, "decipher", &decipherHack, "use bigram model exactly as recognizer" },
{ OPT_TRUE, "unk", &keepunk, "vocabulary contains unknown word tag" },
{ OPT_FALSE, "nonull", &keepnull, "remove <NULL> in LM" },
{ OPT_STRING, "map-unk", &mapUnknown, "word to map unknown words to" },
{ OPT_TRUE, "tolower", &toLower, "map vocabulary to lowercase" },
{ OPT_TRUE, "multiwords", &multiwords, "split multiwords for LM evaluation" },
{ OPT_STRING, "ppl", &pplFile, "text file to compute perplexity from" },
{ OPT_STRING, "escape", &escape, "escape prefix to pass data through -ppl" },
{ OPT_STRING, "counts", &countFile, "count file to compute perplexity from" },
{ OPT_TRUE, "counts-entropy", &countEntropy, "compute entropy (not perplexity) from counts" },
{ OPT_UINT, "count-order", &countOrder, "max count order used by -counts" },
{ OPT_UINT, "gen", &generateSents, "number of random sentences to generate" },
{ OPT_INT, "seed", &seed, "seed for randomization" },
{ OPT_STRING, "vocab", &vocabFile, "vocab file" },
{ OPT_STRING, "nonevents", &noneventFile, "non-event vocabulary" },
{ OPT_TRUE, "limit-vocab", &limitVocab, "limit LM reading to specified vocabulary" },
{ OPT_STRING, "lm", &lmFile, "file in ARPA LM format" },
{ OPT_UINT, "bayes", &bayesLength, "context length for Bayes mixture LM" },
{ OPT_FLOAT, "bayes-scale", &bayesScale, "log likelihood scale for -bayes" },
{ OPT_STRING, "mix-lm", &mixFile, "LM to mix in" },
{ OPT_FLOAT, "lambda", &mixLambda, "mixture weight for -lm" },
{ OPT_STRING, "mix-lm2", &mixFile2, "second LM to mix in" },
{ OPT_FLOAT, "mix-lambda2", &mixLambda2, "mixture weight for -mix-lm2" },
{ OPT_STRING, "mix-lm3", &mixFile3, "third LM to mix in" },
{ OPT_FLOAT, "mix-lambda3", &mixLambda3, "mixture weight for -mix-lm3" },
{ OPT_STRING, "mix-lm4", &mixFile4, "fourth LM to mix in" },
{ OPT_FLOAT, "mix-lambda4", &mixLambda4, "mixture weight for -mix-lm4" },
{ OPT_STRING, "mix-lm5", &mixFile5, "fifth LM to mix in" },
{ OPT_FLOAT, "mix-lambda5", &mixLambda5, "mixture weight for -mix-lm5" },
{ OPT_STRING, "mix-lm6", &mixFile6, "sixth LM to mix in" },
{ OPT_FLOAT, "mix-lambda6", &mixLambda6, "mixture weight for -mix-lm6" },
{ OPT_STRING, "mix-lm7", &mixFile7, "seventh LM to mix in" },
{ OPT_FLOAT, "mix-lambda7", &mixLambda7, "mixture weight for -mix-lm7" },
{ OPT_STRING, "mix-lm8", &mixFile8, "eighth LM to mix in" },
{ OPT_FLOAT, "mix-lambda8", &mixLambda8, "mixture weight for -mix-lm8" },
{ OPT_STRING, "mix-lm9", &mixFile9, "ninth LM to mix in" },
{ OPT_FLOAT, "mix-lambda9", &mixLambda9, "mixture weight for -mix-lm9" },
{ OPT_TRUE, "loglinear-mix", &loglinearMix, "use log-linear mixture LM" },
{ OPT_TRUE, "null", &null, "use a null language model" },
{ OPT_UINT, "cache", &cache, "history length for cache language model" },
{ OPT_FLOAT, "cache-lambda", &cacheLambda, "interpolation weight for -cache" },
{ OPT_TRUE, "dynamic", &dynamic, "interpolate with a dynamic lm" },
{ OPT_TRUE, "hmm", &hmm, "use HMM of n-grams model" },
{ OPT_TRUE, "adapt-mix", &adaptMix, "use adaptive mixture of n-grams model" },
{ OPT_FLOAT, "adapt-decay", &adaptDecay, "history likelihood decay factor" },
{ OPT_UINT, "adapt-iters", &adaptIters, "EM iterations for adaptive mix" },
{ OPT_STRING, "adapt-marginals", &adaptMarginals, "unigram marginals to adapt base LM to" },
{ OPT_STRING, "base-marginals", &baseMarginals, "unigram marginals of base LM to" },
{ OPT_FLOAT, "adapt-marginals-beta", &adaptMarginalsBeta, "marginals adaptation weight" },
{ OPT_TRUE, "adapt-marginals-ratios", &adaptMarginalsRatios, "compute ratios between marginals-adapted and base probs" },
{ OPT_FLOAT, "dynamic-lambda", &dynamicLambda, "interpolation weight for -dynamic" },
{ OPT_TRUE, "reverse", &reverseSents, "reverse words" },
{ OPT_STRING, "rescore-ngram", &rescoreNgramFile, "recompute probs in N-gram LM" },
{ OPT_STRING, "write-lm", &writeLM, "re-write LM to file" },
{ OPT_STRING, "write-vocab", &writeVocab, "write LM vocab to file" },
{ OPT_TRUE, "renorm", &renormalize, "renormalize backoff weights" },
{ OPT_FLOAT, "prune", &prune, "prune redundant probs" },
{ OPT_UINT, "minprune", &minprune, "prune only ngrams at least this long" },
{ OPT_TRUE, "prune-lowprobs", &pruneLowProbs, "low probability N-grams" },
{ OPT_TRUE, "memuse", &memuse, "show memory usage" },
{ OPT_STRING, "nbest", &nbestFile, "nbest list file to rescore" },
{ OPT_STRING, "nbest-files", &nbestFiles, "list of N-best filenames" },
{ OPT_TRUE, "split-multiwords", &splitMultiwords, "split multiwords in N-best lists" },
{ OPT_STRING, "write-nbest-dir", &writeNbestDir, "output directory for N-best rescoring" },
{ OPT_TRUE, "decipher-nbest", &writeDecipherNbest, "output Decipher n-best format" },
{ OPT_UINT, "max-nbest", &maxNbest, "maximum number of hyps to consider" },
{ OPT_TRUE, "no-reorder", &noReorder, "don't reorder N-best hyps after rescoring" },
{ OPT_STRING, "rescore", &rescoreFile, "hyp stream input file to rescore" },
{ OPT_STRING, "decipher-lm", &decipherLM, "DECIPHER(TM) LM for nbest list generation" },
{ OPT_UINT, "decipher-order", &decipherOrder, "ngram order for -decipher-lm" },
{ OPT_TRUE, "decipher-nobackoff", &decipherNoBackoff, "disable backoff hack in recognizer LM" },
{ OPT_FLOAT, "decipher-lmw", &decipherLMW, "DECIPHER(TM) LM weight" },
{ OPT_FLOAT, "decipher-wtw", &decipherWTW, "DECIPHER(TM) word transition weight" },
{ OPT_FLOAT, "rescore-lmw", &rescoreLMW, "rescoring LM weight" },
{ OPT_FLOAT, "rescore-wtw", &rescoreWTW, "rescoring word transition weight" },
{ OPT_STRING, "noise", &noiseTag, "noise tag to skip" },
{ OPT_STRING, "noise-vocab", &noiseVocabFile, "noise vocabulary to skip" },
};
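/*
 * Illustrative invocations using the options above (file names are
 * hypothetical):
 *
 *	ngram -order 3 -lm train.lm -ppl test.txt
 *		compute test-set perplexity under an ARPA-format trigram
 *
 *	ngram -lm big.lm -prune 1e-8 -write-lm pruned.lm
 *		prune redundant probabilities and rewrite the model
 *
 *	ngram -lm a.lm -mix-lm b.lm -lambda 0.7 -write-lm mixed.lm
 *		statically interpolate two LMs with weights 0.7 and 0.3
 */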
/*
* Rescore N-best list
*/
void
rescoreNbest(LM &lm, const char *inFilename, const char *outFilename)
{
NBestList nbList(lm.vocab, maxNbest, splitMultiwords);
File inlist(inFilename, "r");
if (!nbList.read(inlist)) {
cerr << "format error in nbest file\n";
exit(1);
}
if (nbList.numHyps() == 0) {
cerr << "warning: " << inFilename << " is empty, not rescored\n";
return;
}
if (decipherLM) {
/*
* decipherNoBackoff prevents the Decipher LM from simulating
* backoff paths when they score higher than direct probabilities.
*/
DecipherNgram oldLM(lm.vocab, decipherOrder, !decipherNoBackoff);
oldLM.debugme(debug);
File file(decipherLM, "r");
if (!oldLM.read(file, limitVocab)) {
cerr << "format error in Decipher LM\n";
exit(1);
}
nbList.decipherFix(oldLM, decipherLMW, decipherWTW);
}
nbList.rescoreHyps(lm, rescoreLMW, rescoreWTW);
if (!noReorder) {
nbList.sortHyps();
}
if (outFilename) {
File sout(outFilename, "w");
nbList.write(sout, writeDecipherNbest);
} else {
File sout(stdout);
nbList.write(sout, writeDecipherNbest);
}
}
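/*
 * makeMixLM --
 *	Read one mixture component LM from filename and combine it with
 *	the previously accumulated LM (oldLM), by static interpolation,
 *	log-linear mixing, or a dynamic Bayes mixture.  lambda1 is the
 *	new component's weight and lambda2 the weight mass it is
 *	normalized against, so the pairwise weight is lambda1/lambda2.
 *	Presumably called once for each of -mix-lm through -mix-lm9.
 */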
LM *
makeMixLM(const char *filename, Vocab &vocab, SubVocab *classVocab,
unsigned order, LM *oldLM, double lambda1, double lambda2)
{
File file(filename, "r");
/*
* create a factored LM if -factored was specified,
* a class N-gram if -classes was specified,
* and otherwise a regular N-gram
*/
Ngram *lm = factored ?
new ProductNgram((ProductVocab &)vocab, order) :
(classVocab != 0) ?
(simpleClasses ?
new SimpleClassNgram(vocab, *classVocab, order) :
new ClassNgram(vocab, *classVocab, order)) :
new Ngram(vocab, order);
assert(lm != 0);
lm->debugme(debug);
lm->skipOOVs() = skipOOVs;
if (!lm->read(file, limitVocab)) {
cerr << "format error in mix-lm file " << filename << endl;
exit(1);
}
/*
* Each class LM needs to read the class definitions
*/
if (classesFile != 0) {
File file(classesFile, "r");
((ClassNgram *)lm)->readClasses(file);
}
/*
* Compute mixture lambda (make sure 0/0 = 0)
*/
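/*
* (Example: lambda1 = 0.2 and lambda2 = 0.5 yield a pairwise
* interpolation weight of 0.4 for the newly read component.)
*/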
Prob lambda = (lambda1 == 0.0) ? 0.0 : lambda1/lambda2;
if (oldLM == 0) {
return lm;
} else if (loglinearMix) {
/*
* log-linear mixture
*/
LM *newLM = new LoglinearMix(vocab, *lm, *oldLM, lambda);
assert(newLM != 0);
newLM->debugme(debug);
return newLM;
} else if (bayesLength < 0) {
/*
* static mixture
*/
((Ngram *)oldLM)->mixProbs(*lm, 1.0 - lambda);
delete lm;
return oldLM;
} else {
/*
* dynamic Bayesian mixture
*/
LM *newLM = new BayesMix(vocab, *lm, *oldLM,
bayesLength, lambda, bayesScale);
assert(newLM != 0);
newLM->debugme(debug);
return newLM;
}
}
int
main(int argc, char **argv)
{
setlocale(LC_CTYPE, "");
setlocale(LC_COLLATE, "");
/* set default seed for randomization */
#ifndef _MSC_VER
seed = time(NULL) + getpid();
#else
seed = time(NULL);
#endif
Opt_Parse(argc, argv, options, Opt_Number(options), 0);
if (version) {
printVersion(RcsId);
exit(0);
}
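/*
* The special-purpose model types below are mutually exclusive;
* summing their option flags counts how many were selected.
*/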
if (hmm + adaptMix + decipherHack + tagged +
skipNgram + hiddenS + df + factored + (hiddenVocabFile != 0) +
(classesFile != 0 || expandClasses >= 0) + (stopWordFile != 0) > 1)
{
cerr << "HMM, AdaptiveMix, Decipher, tagged, factored, DF, hidden N-gram, hidden-S, class N-gram, skip N-gram and stop-word N-gram models are mutually exclusive\n";
exit(2);
}
/*
* Set random seed
*/
srand48((long)seed);
/*
* Construct language model
*/
Vocab *vocab;
Ngram *ngramLM;
LM *useLM;
if (factored + tagged + multiwords > 1) {
cerr << "factored, tagged, and multiword vocabularies are mutually exclusive\n";
exit(2);
}
vocab = tagged ? new TaggedVocab :
multiwords ? new MultiwordVocab :
factored ? new ProductVocab :
new Vocab;
assert(vocab != 0);
vocab->unkIsWord() = keepunk ? true : false;
vocab->toLower() = toLower ? true : false;
if (factored) {
((ProductVocab *)vocab)->nullIsWord() = keepnull ? true : false;
}
/*
* Change unknown word string if requested
*/
if (mapUnknown) {
vocab->remove(vocab->unkIndex());
vocab->unkIndex() = vocab->addWord(mapUnknown);
}
if (vocabFile) {
File file(vocabFile, "r");
vocab->read(file);
}
if (noneventFile) {
/*
* create temporary sub-vocabulary for non-event words
*/
SubVocab nonEvents(*vocab);
File file(noneventFile, "r");
nonEvents.read(file);
vocab->addNonEvents(nonEvents);
}
SubVocab *stopWords = 0;
if (stopWordFile != 0) {
stopWords = new SubVocab(*vocab);
assert(stopWords);
File file(stopWordFile, "r");
stopWords->read(file);
}
SubVocab *hiddenEvents = 0;
if (hiddenVocabFile != 0) {
hiddenEvents = new SubVocab(*vocab);
assert(hiddenEvents);
File file(hiddenVocabFile, "r");
hiddenEvents->read(file);
}
SubVocab *classVocab = 0;
if (classesFile != 0 || expandClasses >= 0) {
classVocab = new SubVocab(*vocab);
assert(classVocab);
/*
* limitVocab on class N-grams only works if the classes are
* in the vocabulary at read time. We ensure this by reading
* the class names (the first column of the class definitions)
* into the vocabulary.
*/
if (limitVocab) {
File file(classesFile, "r");
classVocab->read(file);
}
}
ngramLM =
decipherHack ? new DecipherNgram(*vocab, order, !decipherNoBackoff) :
df ? new DFNgram(*vocab, order) :
skipNgram ? new SkipNgram(*vocab, order) :
hiddenS ? new HiddenSNgram(*vocab, order) :
tagged ? new TaggedNgram(*(TaggedVocab *)vocab, order) :
factored ? new ProductNgram(*(ProductVocab *)vocab, order) :
(stopWordFile != 0) ? new StopNgram(*vocab, *stopWords, order):
(hiddenVocabFile != 0) ? new HiddenNgram(*vocab, *hiddenEvents, order, hiddenNot) :
(classesFile != 0 || expandClasses >= 0) ?
(simpleClasses ?
new SimpleClassNgram(*vocab, *classVocab, order) :
new ClassNgram(*vocab, *classVocab, order)) :
new Ngram(*vocab, order);
assert(ngramLM != 0);