/*
* ngram --
* Create and manipulate ngram (and related) models
*/
#ifndef lint
static char Copyright[] = "Copyright (c) 1995-2006 SRI International. All Rights Reserved.";
static char RcsId[] = "@(#)$Id: ngram.cc,v 1.85 2006/01/09 18:08:21 stolcke Exp $";
#endif
#include <iostream>
using namespace std;
#include <stdio.h>
#include <stdlib.h>
#include <locale.h>
#include <assert.h>
#ifndef _MSC_VER
#include <unistd.h>
#endif
#include <string.h>
#include <time.h>
extern "C" {
void srand48(long); /* might be missing from math.h or stdlib.h */
}
#include "option.h"
#include "version.h"
#include "zio.h"
#include "File.h"
#include "Vocab.h"
#include "SubVocab.h"
#include "MultiwordVocab.h"
#include "MultiwordLM.h"
#include "NBest.h"
#include "TaggedVocab.h"
#include "Ngram.h"
#include "TaggedNgram.h"
#include "StopNgram.h"
#include "ClassNgram.h"
#include "SimpleClassNgram.h"
#include "DFNgram.h"
#include "SkipNgram.h"
#include "HiddenNgram.h"
#include "HiddenSNgram.h"
#include "NullLM.h"
#include "BayesMix.h"
#include "LoglinearMix.h"
#include "AdaptiveMix.h"
#include "AdaptiveMarginals.h"
#include "CacheLM.h"
#include "DynamicLM.h"
#include "DecipherNgram.h"
#include "HMMofNgrams.h"
#include "RefList.h"
#include "ProductNgram.h"
#include "Array.cc"
static int version = 0;
static unsigned order = defaultNgramOrder;
static unsigned debug = 0;
static char *pplFile = 0;
static char *escape = 0;
static char *countFile = 0;
static int countEntropy = 0;
static unsigned countOrder = 0;
static char *vocabFile = 0;
static char *noneventFile = 0;
static int limitVocab = 0;
static char *lmFile = 0;
static char *mixFile = 0;
static char *mixFile2 = 0;
static char *mixFile3 = 0;
static char *mixFile4 = 0;
static char *mixFile5 = 0;
static char *mixFile6 = 0;
static char *mixFile7 = 0;
static char *mixFile8 = 0;
static char *mixFile9 = 0;
static int bayesLength = -1; /* marks unset option */
static double bayesScale = 1.0;
static double mixLambda = 0.5;
static double mixLambda2 = 0.0;
static double mixLambda3 = 0.0;
static double mixLambda4 = 0.0;
static double mixLambda5 = 0.0;
static double mixLambda6 = 0.0;
static double mixLambda7 = 0.0;
static double mixLambda8 = 0.0;
static double mixLambda9 = 0.0;
static int loglinearMix = 0;
static int reverseSents = 0;
static char *writeLM = 0;
static char *writeVocab = 0;
static int memuse = 0;
static int renormalize = 0;
static double prune = 0.0;
static int pruneLowProbs = 0;
static int minprune = 2;
static int skipOOVs = 0;
static unsigned generateSents = 0;
static int seed = 0; /* default dynamically generated in main() */
static int df = 0;
static int skipNgram = 0;
static int hiddenS = 0;
static char *hiddenVocabFile = 0;
static int hiddenNot = 0;
static char *classesFile = 0;
static int simpleClasses = 0;
static int expandClasses = -1;
static unsigned expandExact = 0;
static int tagged = 0;
static int factored = 0;
static int toLower = 0;
static int multiwords = 0;
static int splitMultiwords = 0;
static int keepunk = 0;
static int keepnull = 1;
static char *mapUnknown = 0;
static int null = 0;
static unsigned cache = 0;
static double cacheLambda = 0.05;
static int dynamic = 0;
static double dynamicLambda = 0.05;
static char *noiseTag = 0;
static char *noiseVocabFile = 0;
static char *stopWordFile = 0;
static int decipherHack = 0;
static int hmm = 0;
static int adaptMix = 0;
static double adaptDecay = 1.0;
static unsigned adaptIters = 2;
static char *adaptMarginals = 0;
static double adaptMarginalsBeta = 0.5;
static int adaptMarginalsRatios = 0;
static char *baseMarginals = 0;
static char *rescoreNgramFile = 0;
/*
* N-Best related variables
*/
static char *nbestFile = 0;
static char *nbestFiles = 0;
static char *writeNbestDir = 0;
static int writeDecipherNbest = 0;
static int noReorder = 0;
static unsigned maxNbest = 0;
static char *rescoreFile = 0;
static char *decipherLM = 0;
static unsigned decipherOrder = 2;
static int decipherNoBackoff = 0;
static double decipherLMW = 8.0;
static double decipherWTW = 0.0;
static double rescoreLMW = 8.0;
static double rescoreWTW = 0.0;
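/*
 * Command-line option table, processed by Opt_Parse() in main()
 */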
static Option options[] = {
{ OPT_TRUE, "version", &version, "print version information" },
{ OPT_UINT, "order", &order, "max ngram order" },
{ OPT_UINT, "debug", &debug, "debugging level for lm" },
{ OPT_TRUE, "skipoovs", &skipOOVs, "skip n-gram contexts containing OOVs" },
{ OPT_TRUE, "df", &df, "use disfluency ngram model" },
{ OPT_TRUE, "tagged", &tagged, "use a tagged LM" },
{ OPT_TRUE, "factored", &factored, "use a factored LM" },
{ OPT_TRUE, "skip", &skipNgram, "use skip ngram model" },
{ OPT_TRUE, "hiddens", &hiddenS, "use hidden sentence ngram model" },
{ OPT_STRING, "hidden-vocab", &hiddenVocabFile, "hidden ngram vocabulary" },
{ OPT_TRUE, "hidden-not", &hiddenNot, "process overt hidden events" },
{ OPT_STRING, "classes", &classesFile, "class definitions" },
{ OPT_TRUE, "simple-classes", &simpleClasses, "use unique class model" },
{ OPT_INT, "expand-classes", &expandClasses, "expand class-model into word-model" },
{ OPT_UINT, "expand-exact", &expandExact, "compute expanded ngrams longer than this exactly" },
{ OPT_STRING, "stop-words", &stopWordFile, "stop-word vocabulary for stop-Ngram LM" },
{ OPT_TRUE, "decipher", &decipherHack, "use bigram model exactly as recognizer" },
{ OPT_TRUE, "unk", &keepunk, "vocabulary contains unknown word tag" },
{ OPT_FALSE, "nonull", &keepnull, "remove <NULL> in LM" },
{ OPT_STRING, "map-unk", &mapUnknown, "word to map unknown words to" },
{ OPT_TRUE, "tolower", &toLower, "map vocabulary to lowercase" },
{ OPT_TRUE, "multiwords", &multiwords, "split multiwords for LM evaluation" },
{ OPT_STRING, "ppl", &pplFile, "text file to compute perplexity from" },
{ OPT_STRING, "escape", &escape, "escape prefix to pass data through -ppl" },
{ OPT_STRING, "counts", &countFile, "count file to compute perplexity from" },
{ OPT_TRUE, "counts-entropy", &countEntropy, "compute entropy (not perplexity) from counts" },
{ OPT_UINT, "count-order", &countOrder, "max count order used by -counts" },
{ OPT_UINT, "gen", &generateSents, "number of random sentences to generate" },
{ OPT_INT, "seed", &seed, "seed for randomization" },
{ OPT_STRING, "vocab", &vocabFile, "vocab file" },
{ OPT_STRING, "nonevents", &noneventFile, "non-event vocabulary" },
{ OPT_TRUE, "limit-vocab", &limitVocab, "limit LM reading to specified vocabulary" },
{ OPT_STRING, "lm", &lmFile, "file in ARPA LM format" },
{ OPT_UINT, "bayes", &bayesLength, "context length for Bayes mixture LM" },
{ OPT_FLOAT, "bayes-scale", &bayesScale, "log likelihood scale for -bayes" },
{ OPT_STRING, "mix-lm", &mixFile, "LM to mix in" },
{ OPT_FLOAT, "lambda", &mixLambda, "mixture weight for -lm" },
{ OPT_STRING, "mix-lm2", &mixFile2, "second LM to mix in" },
{ OPT_FLOAT, "mix-lambda2", &mixLambda2, "mixture weight for -mix-lm2" },
{ OPT_STRING, "mix-lm3", &mixFile3, "third LM to mix in" },
{ OPT_FLOAT, "mix-lambda3", &mixLambda3, "mixture weight for -mix-lm3" },
{ OPT_STRING, "mix-lm4", &mixFile4, "fourth LM to mix in" },
{ OPT_FLOAT, "mix-lambda4", &mixLambda4, "mixture weight for -mix-lm4" },
{ OPT_STRING, "mix-lm5", &mixFile5, "fifth LM to mix in" },
{ OPT_FLOAT, "mix-lambda5", &mixLambda5, "mixture weight for -mix-lm5" },
{ OPT_STRING, "mix-lm6", &mixFile6, "sixth LM to mix in" },
{ OPT_FLOAT, "mix-lambda6", &mixLambda6, "mixture weight for -mix-lm6" },
{ OPT_STRING, "mix-lm7", &mixFile7, "seventh LM to mix in" },
{ OPT_FLOAT, "mix-lambda7", &mixLambda7, "mixture weight for -mix-lm7" },
{ OPT_STRING, "mix-lm8", &mixFile8, "eighth LM to mix in" },
{ OPT_FLOAT, "mix-lambda8", &mixLambda8, "mixture weight for -mix-lm8" },
{ OPT_STRING, "mix-lm9", &mixFile9, "ninth LM to mix in" },
{ OPT_FLOAT, "mix-lambda9", &mixLambda9, "mixture weight for -mix-lm9" },
{ OPT_TRUE, "loglinear-mix", &loglinearMix, "use log-linear mixture LM" },
{ OPT_TRUE, "null", &null, "use a null language model" },
{ OPT_UINT, "cache", &cache, "history length for cache language model" },
{ OPT_FLOAT, "cache-lambda", &cacheLambda, "interpolation weight for -cache" },
{ OPT_TRUE, "dynamic", &dynamic, "interpolate with a dynamic lm" },
{ OPT_TRUE, "hmm", &hmm, "use HMM of n-grams model" },
{ OPT_TRUE, "adapt-mix", &adaptMix, "use adaptive mixture of n-grams model" },
{ OPT_FLOAT, "adapt-decay", &adaptDecay, "history likelihood decay factor" },
{ OPT_UINT, "adapt-iters", &adaptIters, "EM iterations for adaptive mix" },
{ OPT_STRING, "adapt-marginals", &adaptMarginals, "unigram marginals to adapt base LM to" },
{ OPT_STRING, "base-marginals", &baseMarginals, "unigram marginals of base LM to" },
{ OPT_FLOAT, "adapt-marginals-beta", &adaptMarginalsBeta, "marginals adaptation weight" },
{ OPT_TRUE, "adapt-marginals-ratios", &adaptMarginalsRatios, "compute ratios between marginals-adapted and base probs" },
{ OPT_FLOAT, "dynamic-lambda", &dynamicLambda, "interpolation weight for -dynamic" },
{ OPT_TRUE, "reverse", &reverseSents, "reverse words" },
{ OPT_STRING, "rescore-ngram", &rescoreNgramFile, "recompute probs in N-gram LM" },
{ OPT_STRING, "write-lm", &writeLM, "re-write LM to file" },
{ OPT_STRING, "write-vocab", &writeVocab, "write LM vocab to file" },
{ OPT_TRUE, "renorm", &renormalize, "renormalize backoff weights" },
{ OPT_FLOAT, "prune", &prune, "prune redundant probs" },
{ OPT_UINT, "minprune", &minprune, "prune only ngrams at least this long" },
{ OPT_TRUE, "prune-lowprobs", &pruneLowProbs, "low probability N-grams" },
{ OPT_TRUE, "memuse", &memuse, "show memory usage" },
{ OPT_STRING, "nbest", &nbestFile, "nbest list file to rescore" },
{ OPT_STRING, "nbest-files", &nbestFiles, "list of N-best filenames" },
{ OPT_TRUE, "split-multiwords", &splitMultiwords, "split multiwords in N-best lists" },
{ OPT_STRING, "write-nbest-dir", &writeNbestDir, "output directory for N-best rescoring" },
{ OPT_TRUE, "decipher-nbest", &writeDecipherNbest, "output Decipher n-best format" },
{ OPT_UINT, "max-nbest", &maxNbest, "maximum number of hyps to consider" },
{ OPT_TRUE, "no-reorder", &noReorder, "don't reorder N-best hyps after rescoring" },
{ OPT_STRING, "rescore", &rescoreFile, "hyp stream input file to rescore" },
{ OPT_STRING, "decipher-lm", &decipherLM, "DECIPHER(TM) LM for nbest list generation" },
{ OPT_UINT, "decipher-order", &decipherOrder, "ngram order for -decipher-lm" },
{ OPT_TRUE, "decipher-nobackoff", &decipherNoBackoff, "disable backoff hack in recognizer LM" },
{ OPT_FLOAT, "decipher-lmw", &decipherLMW, "DECIPHER(TM) LM weight" },
{ OPT_FLOAT, "decipher-wtw", &decipherWTW, "DECIPHER(TM) word transition weight" },
{ OPT_FLOAT, "rescore-lmw", &rescoreLMW, "rescoring LM weight" },
{ OPT_FLOAT, "rescore-wtw", &rescoreWTW, "rescoring word transition weight" },
{ OPT_STRING, "noise", &noiseTag, "noise tag to skip" },
{ OPT_STRING, "noise-vocab", &noiseVocabFile, "noise vocabulary to skip" },
};
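/*
 * Illustrative invocations using the options above (file names are
 * hypothetical):
 *
 *	ngram -order 3 -lm train.lm -ppl test.txt
 *		compute test-set perplexity under an ARPA-format trigram
 *
 *	ngram -lm big.lm -prune 1e-8 -write-lm pruned.lm
 *		prune redundant probabilities and rewrite the model
 *
 *	ngram -lm a.lm -mix-lm b.lm -lambda 0.7 -write-lm mixed.lm
 *		statically interpolate two LMs with weights 0.7 and 0.3
 */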
/*
* Rescore N-best list
*/
void
rescoreNbest(LM &lm, const char *inFilename, const char *outFilename)
{
NBestList nbList(lm.vocab, maxNbest, splitMultiwords);
File inlist(inFilename, "r");
if (!nbList.read(inlist)) {
cerr << "format error in nbest file\n";
exit(1);
}
if (nbList.numHyps() == 0) {
cerr << "warning: " << inFilename << " is empty, not rescored\n";
return;
}
if (decipherLM) {
/*
* decipherNoBackoff prevents the Decipher LM from simulating
* backoff paths when they score higher than direct probabilities.
*/
DecipherNgram oldLM(lm.vocab, decipherOrder, !decipherNoBackoff);
oldLM.debugme(debug);
File file(decipherLM, "r");
if (!oldLM.read(file, limitVocab)) {
cerr << "format error in Decipher LM\n";
exit(1);
}
nbList.decipherFix(oldLM, decipherLMW, decipherWTW);
}
nbList.rescoreHyps(lm, rescoreLMW, rescoreWTW);
if (!noReorder) {
nbList.sortHyps();
}
if (outFilename) {
File sout(outFilename, "w");
nbList.write(sout, writeDecipherNbest);
} else {
File sout(stdout);
nbList.write(sout, writeDecipherNbest);
}
}
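/*
 * makeMixLM --
 *	Read one mixture component LM from filename and combine it with
 *	the previously accumulated LM (oldLM), by static interpolation,
 *	log-linear mixing, or a dynamic Bayes mixture.  lambda1 is the
 *	new component's weight and lambda2 the weight mass it is
 *	normalized against, so the pairwise weight is lambda1/lambda2.
 *	Presumably called once for each of -mix-lm through -mix-lm9.
 */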
LM *
makeMixLM(const char *filename, Vocab &vocab, SubVocab *classVocab,
unsigned order, LM *oldLM, double lambda1, double lambda2)
{
File file(filename, "r");
/*
* create a factored LM if -factored was specified,
* a class N-gram if -classes was specified,
* and otherwise a regular N-gram
*/
Ngram *lm = factored ?
new ProductNgram((ProductVocab &)vocab, order) :
(classVocab != 0) ?
(simpleClasses ?
new SimpleClassNgram(vocab, *classVocab, order) :
new ClassNgram(vocab, *classVocab, order)) :
new Ngram(vocab, order);
assert(lm != 0);
lm->debugme(debug);
lm->skipOOVs() = skipOOVs;
if (!lm->read(file, limitVocab)) {
cerr << "format error in mix-lm file " << filename << endl;
exit(1);
}
/*
* Each class LM needs to read the class definitions
*/
if (classesFile != 0) {
File file(classesFile, "r");
((ClassNgram *)lm)->readClasses(file);
}
/*
* Compute mixture lambda (make sure 0/0 = 0)
*/
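/*
* (Example: lambda1 = 0.2 and lambda2 = 0.5 yield a pairwise
* interpolation weight of 0.4 for the newly read component.)
*/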
Prob lambda = (lambda1 == 0.0) ? 0.0 : lambda1/lambda2;
if (oldLM == 0) {
return lm;
} else if (loglinearMix) {
/*
* log-linear mixture
*/
LM *newLM = new LoglinearMix(vocab, *lm, *oldLM, lambda);
assert(newLM != 0);
newLM->debugme(debug);
return newLM;
} else if (bayesLength < 0) {
/*
* static mixture
*/
((Ngram *)oldLM)->mixProbs(*lm, 1.0 - lambda);
delete lm;
return oldLM;
} else {
/*
* dynamic Bayesian mixture
*/
LM *newLM = new BayesMix(vocab, *lm, *oldLM,
bayesLength, lambda, bayesScale);
assert(newLM != 0);
newLM->debugme(debug);
return newLM;
}
}
int
main(int argc, char **argv)
{
setlocale(LC_CTYPE, "");
setlocale(LC_COLLATE, "");
/* set default seed for randomization */
#ifndef _MSC_VER
seed = time(NULL) + getpid();
#else
seed = time(NULL);
#endif
Opt_Parse(argc, argv, options, Opt_Number(options), 0);
if (version) {
printVersion(RcsId);
exit(0);
}
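/*
* The special-purpose model types below are mutually exclusive;
* summing their option flags counts how many were selected.
*/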
if (hmm + adaptMix + decipherHack + tagged +
skipNgram + hiddenS + df + factored + (hiddenVocabFile != 0) +
(classesFile != 0 || expandClasses >= 0) + (stopWordFile != 0) > 1)
{
cerr << "HMM, AdaptiveMix, Decipher, tagged, factored, DF, hidden N-gram, hidden-S, class N-gram, skip N-gram and stop-word N-gram models are mutually exclusive\n";
exit(2);
}
/*
* Set random seed
*/
srand48((long)seed);
/*
* Construct language model
*/
Vocab *vocab;
Ngram *ngramLM;
LM *useLM;
if (factored + tagged + multiwords > 1) {
cerr << "factored, tagged, and multiword vocabularies are mutually exclusive\n";
exit(2);
}
vocab = tagged ? new TaggedVocab :
multiwords ? new MultiwordVocab :
factored ? new ProductVocab :
new Vocab;
assert(vocab != 0);
vocab->unkIsWord() = keepunk ? true : false;
vocab->toLower() = toLower ? true : false;
if (factored) {
((ProductVocab *)vocab)->nullIsWord() = keepnull ? true : false;
}
/*
* Change unknown word string if requested
*/
if (mapUnknown) {
vocab->remove(vocab->unkIndex());
vocab->unkIndex() = vocab->addWord(mapUnknown);
}
if (vocabFile) {
File file(vocabFile, "r");
vocab->read(file);
}
if (noneventFile) {
/*
* create temporary sub-vocabulary for non-event words
*/
SubVocab nonEvents(*vocab);
File file(noneventFile, "r");
nonEvents.read(file);
vocab->addNonEvents(nonEvents);
}
SubVocab *stopWords = 0;
if (stopWordFile != 0) {
stopWords = new SubVocab(*vocab);
assert(stopWords);
File file(stopWordFile, "r");
stopWords->read(file);
}
SubVocab *hiddenEvents = 0;
if (hiddenVocabFile != 0) {
hiddenEvents = new SubVocab(*vocab);
assert(hiddenEvents);
File file(hiddenVocabFile, "r");
hiddenEvents->read(file);
}
SubVocab *classVocab = 0;
if (classesFile != 0 || expandClasses >= 0) {
classVocab = new SubVocab(*vocab);
assert(classVocab);
/*
* limitVocab on class N-grams only works if the classes are
* in the vocabulary at read time. We ensure this by reading
* the class names (the first column of the class definitions)
* into the vocabulary.
*/
if (limitVocab) {
File file(classesFile, "r");
classVocab->read(file);
}
}
ngramLM =
decipherHack ? new DecipherNgram(*vocab, order, !decipherNoBackoff) :
df ? new DFNgram(*vocab, order) :
skipNgram ? new SkipNgram(*vocab, order) :
hiddenS ? new HiddenSNgram(*vocab, order) :
tagged ? new TaggedNgram(*(TaggedVocab *)vocab, order) :
factored ? new ProductNgram(*(ProductVocab *)vocab, order) :
(stopWordFile != 0) ? new StopNgram(*vocab, *stopWords, order):
(hiddenVocabFile != 0) ? new HiddenNgram(*vocab, *hiddenEvents, order, hiddenNot) :
(classesFile != 0 || expandClasses >= 0) ?
(simpleClasses ?
new SimpleClassNgram(*vocab, *classVocab, order) :
new ClassNgram(*vocab, *classVocab, order)) :
new Ngram(*vocab, order);
assert(ngramLM != 0);