/*
 * ngram.cc --
 *	main program for N-gram language modeling (SRILM toolkit)
 *	(web-viewer page chrome removed; original filename preserved)
 */
/*
 * Tail of the conditional expression that picks the concrete Ngram
 * subclass; the assignment target is above this chunk (presumably
 * "Ngram *ngramLM =" -- TODO(review): confirm against the full file):
 *   - HiddenNgram     when a hidden-event vocabulary was given,
 *   - (Simple)ClassNgram when a class vocabulary was given,
 *   - plain Ngram     otherwise.
 */
(hiddenVocabFile != 0) ? new HiddenNgram(*vocab, *hiddenEvents, order, hiddenNot) :
(classVocab != 0) ?
(simpleClasses ?
// simpleClasses presumably selects a restricted/faster class-LM variant
// -- confirm semantics in SimpleClassNgram's own documentation
new SimpleClassNgram(*vocab, *classVocab, order) :
new ClassNgram(*vocab, *classVocab, order)) :
new Ngram(*vocab, order);
assert(ngramLM != 0);
ngramLM->debugme(debug);	// propagate the -debug level to the LM
if (skipOOVs) {
// have the LM skip OOV words instead of scoring them
ngramLM->skipOOVs() = true;
}
// -null: use a trivial LM (NullLM) instead of reading a model file.
if (null) {
useLM = new NullLM(*vocab);
assert(useLM != 0);
} else if (lmFile) {
if (hmm) {
/*
 * Read an HMM of Ngrams from lmFile.
 * NOTE(review): the local HMMofNgrams* "hmm" shadows the "hmm"
 * option flag tested just above -- legal, but easy to misread.
 */
File file(lmFile, "r");
HMMofNgrams *hmm = new HMMofNgrams(*vocab, order);
hmm->debugme(debug);
if (!hmm->read(file, limitVocab)) {
cerr << "format error in lm file\n";
exit(1);
}
useLM = hmm;
} else if (adaptMix) {
/*
 * Read an adaptive mixture of Ngrams from lmFile
 */
File file(lmFile, "r");
AdaptiveMix *lm = new AdaptiveMix(*vocab, adaptDecay,
bayesScale, adaptIters);
lm->debugme(debug);
if (!lm->read(file, limitVocab)) {
cerr << "format error in lm file\n";
exit(1);
}
useLM = lm;
} else {
/*
 * Read just a single backoff N-gram LM into the ngramLM object
 * constructed earlier; read failures are fatal.
 */
File file(lmFile, "r");
if (!ngramLM->read(file, limitVocab)) {
cerr << "format error in lm file\n";
exit(1);
}
/*
 * Perform static interpolation (ngram merging) of the base LM with up
 * to 9 additional component LMs.  mixLambda1, the weight of the first
 * mixture component, is whatever probability mass is left after all
 * explicitly given lambdas.  The last argument of each makeMixLM()
 * call is the cumulative weight of all components merged so far
 * (it acts as the prior of the partial mixture); note the final
 * component passes 1.0, i.e., the full mass.
 * The (Ngram *) casts assume makeMixLM() returns a merged Ngram in
 * this static-interpolation mode -- TODO(review): confirm against
 * makeMixLM()'s definition (not visible in this chunk).
 */
if (mixFile && !loglinearMix && bayesLength < 0) {
/*
* perform static interpolation (ngram merging)
*/
double mixLambda1 = 1.0 - mixLambda - mixLambda2 - mixLambda3 -
mixLambda4 - mixLambda5 - mixLambda6 -
mixLambda7 - mixLambda8 - mixLambda9;
ngramLM = (Ngram *)makeMixLM(mixFile, *vocab, classVocab,
order, ngramLM, mixLambda1,
mixLambda + mixLambda1);
if (mixFile2) {
ngramLM = (Ngram *)makeMixLM(mixFile2, *vocab, classVocab,
order, ngramLM, mixLambda2,
mixLambda + mixLambda1 + mixLambda2);
}
if (mixFile3) {
ngramLM = (Ngram *)makeMixLM(mixFile3, *vocab, classVocab,
order, ngramLM, mixLambda3,
mixLambda + mixLambda1 + mixLambda2 +
mixLambda3);
}
if (mixFile4) {
ngramLM = (Ngram *)makeMixLM(mixFile4, *vocab, classVocab,
order, ngramLM, mixLambda4,
mixLambda + mixLambda1 + mixLambda2 +
mixLambda3 + mixLambda4);
}
if (mixFile5) {
ngramLM = (Ngram *)makeMixLM(mixFile5, *vocab, classVocab,
order, ngramLM, mixLambda5,
mixLambda + mixLambda1 + mixLambda2 +
mixLambda3 + mixLambda4 + mixLambda5);
}
if (mixFile6) {
ngramLM = (Ngram *)makeMixLM(mixFile6, *vocab, classVocab,
order, ngramLM, mixLambda6,
mixLambda + mixLambda1 + mixLambda2 +
mixLambda3 + mixLambda4 + mixLambda5 +
mixLambda6);
}
if (mixFile7) {
ngramLM = (Ngram *)makeMixLM(mixFile7, *vocab, classVocab,
order, ngramLM, mixLambda7,
mixLambda + mixLambda1 + mixLambda2 +
mixLambda3 + mixLambda4 + mixLambda5 +
mixLambda6 + mixLambda7);
}
if (mixFile8) {
ngramLM = (Ngram *)makeMixLM(mixFile8, *vocab, classVocab,
order, ngramLM, mixLambda8,
mixLambda + mixLambda1 + mixLambda2 +
mixLambda3 + mixLambda4 + mixLambda5 +
mixLambda6 + mixLambda7 + mixLambda8);
}
if (mixFile9) {
// last component: cumulative prior is the full mass, 1.0
ngramLM = (Ngram *)makeMixLM(mixFile9, *vocab, classVocab,
order, ngramLM, mixLambda9, 1.0);
}
}
/*
* Renormalize before the optional steps below, in case input
* model needs it, and because class expansion and pruning already
* include normalization.
*/
if (renormalize) {
ngramLM->recomputeBOWs();
}
/*
* Read class definitions from command line AFTER the LM, so
* they can override embedded class definitions.
* NOTE(review): the (ClassNgram *) cast is unchecked; it is only
* safe when a class vocabulary was given so ngramLM really is a
* ClassNgram -- presumably guaranteed by option validation done
* before this chunk; confirm.
*/
if (classesFile != 0) {
File file(classesFile, "r");
((ClassNgram *)ngramLM)->readClasses(file);
}
if (expandClasses >= 0) {
/*
* Replace class ngram with equivalent word ngram
* expandClasses == 0 generates all ngrams
* expandClasses > 0 generates only ngrams up to given length
* (same unchecked ClassNgram downcast caveat as above)
*/
Ngram *newLM =
((ClassNgram *)ngramLM)->expand(expandClasses, expandExact);
newLM->debugme(debug);
// the class LM is no longer needed once expanded
delete ngramLM;
ngramLM = newLM;
}
// optional entropy-based pruning of low-impact probabilities
if (prune != 0.0) {
ngramLM->pruneProbs(prune, minprune);
}
if (pruneLowProbs) {
ngramLM->pruneLowProbs(minprune);
}
// the (possibly merged/expanded/pruned) N-gram is the LM to use below
useLM = ngramLM;
}
} else {
cerr << "need at least an -lm file specified\n";
exit(1);
}
/*
 * Create a Bayes (posterior-weighted) or log-linear mixture LM,
 * combining the base LM with up to 9 additional component LMs given
 * by -mix-lm .. -mix-lm9.
 *
 * mixLambda1, the weight of the first mixture component, is the
 * probability mass left over after all explicitly specified lambdas.
 * The last argument of each makeMixLM() call is the cumulative weight
 * of all components folded in so far (the prior of the partial
 * mixture); the final component passes 1.0, the full mass.  This
 * mirrors the static-interpolation chain earlier in main().
 *
 * FIX: the original code passed mixFile6 for the -mix-lm7, -mix-lm8
 * and -mix-lm9 components (copy/paste error), so those three models
 * were silently read from the -mix-lm6 file.  Each component now
 * reads its own file (mixFile7, mixFile8, mixFile9), matching the
 * static-interpolation chain above.
 */
if (mixFile && (loglinearMix || bayesLength >= 0)) {
    double mixLambda1 = 1.0 - mixLambda - mixLambda2 - mixLambda3 -
			      mixLambda4 - mixLambda5 - mixLambda6 -
			      mixLambda7 - mixLambda8 - mixLambda9;

    useLM = makeMixLM(mixFile, *vocab, classVocab, order, useLM,
		      mixLambda1,
		      mixLambda + mixLambda1);
    if (mixFile2) {
	useLM = makeMixLM(mixFile2, *vocab, classVocab, order, useLM,
			  mixLambda2,
			  mixLambda + mixLambda1 + mixLambda2);
    }
    if (mixFile3) {
	useLM = makeMixLM(mixFile3, *vocab, classVocab, order, useLM,
			  mixLambda3,
			  mixLambda + mixLambda1 + mixLambda2 +
			  mixLambda3);
    }
    if (mixFile4) {
	useLM = makeMixLM(mixFile4, *vocab, classVocab, order, useLM,
			  mixLambda4,
			  mixLambda + mixLambda1 + mixLambda2 +
			  mixLambda3 + mixLambda4);
    }
    if (mixFile5) {
	useLM = makeMixLM(mixFile5, *vocab, classVocab, order, useLM,
			  mixLambda5,
			  mixLambda + mixLambda1 + mixLambda2 +
			  mixLambda3 + mixLambda4 + mixLambda5);
    }
    if (mixFile6) {
	useLM = makeMixLM(mixFile6, *vocab, classVocab, order, useLM,
			  mixLambda6,
			  mixLambda + mixLambda1 + mixLambda2 +
			  mixLambda3 + mixLambda4 + mixLambda5 +
			  mixLambda6);
    }
    if (mixFile7) {
	/* was mixFile6 in the original -- wrong file */
	useLM = makeMixLM(mixFile7, *vocab, classVocab, order, useLM,
			  mixLambda7,
			  mixLambda + mixLambda1 + mixLambda2 +
			  mixLambda3 + mixLambda4 + mixLambda5 +
			  mixLambda6 + mixLambda7);
    }
    if (mixFile8) {
	/* was mixFile6 in the original -- wrong file */
	useLM = makeMixLM(mixFile8, *vocab, classVocab, order, useLM,
			  mixLambda8,
			  mixLambda + mixLambda1 + mixLambda2 +
			  mixLambda3 + mixLambda4 + mixLambda5 +
			  mixLambda6 + mixLambda7 + mixLambda8);
    }
    if (mixFile9) {
	/* was mixFile6 in the original -- wrong file; prior is full mass */
	useLM = makeMixLM(mixFile9, *vocab, classVocab, order, useLM,
			  mixLambda9, 1.0);
    }
}
// -cache: interpolate the LM with a unigram cache of recent words.
if (cache > 0) {
/*
* Create a mixture model with the cache lm as the second component,
* weighted by cacheLambda (the base LM gets 1 - cacheLambda... note
* the argument passed is 1.0 - cacheLambda; confirm BayesMix's
* parameter meaning against its declaration).
*/
CacheLM *cacheLM = new CacheLM(*vocab, cache);
assert(cacheLM != 0);
BayesMix *mixLM = new BayesMix(*vocab, *useLM, *cacheLM,
0, 1.0 - cacheLambda, 0.0);
assert(mixLM != 0);
useLM = mixLM;
useLM->debugme(debug);
}
// -dynamic: same pattern with a dynamically updated LM component.
if (dynamic) {
/*
* Create a mixture model with the dynamic lm as the second component
*/
DynamicLM *dynamicLM = new DynamicLM(*vocab);
assert(dynamicLM != 0);
BayesMix *mixLM = new BayesMix(*vocab, *useLM, *dynamicLM,
0, 1.0 - dynamicLambda, 0.0);
assert(mixLM != 0);
useLM = mixLM;
useLM->debugme(debug);
}
if (adaptMarginals != 0) {
/*
* Adapt base LM to adaptive marginals given by unigram LM.
* NOTE(review): adaptMargLM->read(file) and baseMargLM->read(file)
* return values are not checked here, unlike the main -lm read;
* a malformed marginals file is silently accepted.
*/
Ngram *adaptMargLM = new Ngram(*vocab, 1);
assert(adaptMargLM != 0);
{
// inner scope so the File is closed as soon as the read finishes
File file(adaptMarginals, "r");
adaptMargLM->debugme(debug);
adaptMargLM->read(file);
}
LM *baseMargLM;
if (baseMarginals == 0) {
// default: use the LM itself as the source of base marginals
baseMargLM = useLM;
} else {
baseMargLM = new Ngram(*vocab, 1);
assert(baseMargLM != 0);
File file(baseMarginals, "r");
baseMargLM->debugme(debug);
baseMargLM->read(file);
}
AdaptiveMarginals *adaptLM =
new AdaptiveMarginals(*vocab, *useLM, *baseMargLM,
*adaptMargLM, adaptMarginalsBeta);
if (adaptMarginalsRatios) {
adaptLM->computeRatios = true;
}
// NOTE(review): this assert comes after adaptLM was already
// dereferenced above; it is effectively too late to catch null.
assert(adaptLM != 0);
useLM = adaptLM;
useLM->debugme(debug);
}
/*
* Reverse words in scoring
*/
if (reverseSents) {
useLM->reverseWords = true;
}
/*
* Skip noise tags in scoring
*/
if (noiseVocabFile) {
File file(noiseVocabFile, "r");
useLM->noiseVocab.read(file);
}
if (noiseTag) { /* backward compatibility */
useLM->noiseVocab.addWord(noiseTag);
}
// -memuse: report the LM's memory statistics.
if (memuse) {
// NOTE(review): this local MemStats shadows the "memuse" option
// flag tested just above -- legal but confusing naming.
MemStats memuse;
useLM->memStats(memuse);
memuse.print();
}
/*
* Apply multiword wrapper if requested.
* NOTE(review): the (MultiwordVocab *) cast is unchecked; it is only
* safe if vocab was constructed as a MultiwordVocab when -multiwords
* was given -- presumably done earlier in main(); confirm.
*/
if (multiwords) {
useLM = new MultiwordLM(*(MultiwordVocab *)vocab, *useLM);
assert(useLM != 0);
useLM->debugme(debug);
}
/*
* Rescore N-gram probs in LM file
*/
if (rescoreNgramFile) {
// create new vocab to avoid including class and multiwords
// from the rescoring LM
SubVocab *ngramVocab = new SubVocab(*vocab);
assert(ngramVocab != 0);
// read N-gram to be rescored
Ngram *rescoreLM = new Ngram(*ngramVocab, order);
assert(rescoreLM != 0);
rescoreLM->debugme(debug);
File file(rescoreNgramFile, "r");
// NOTE(review): read() result unchecked here, unlike the -lm read
rescoreLM->read(file);
rescoreLM->rescoreProbs(*useLM);
// free memory for LMs used in rescoring; guard avoids double-delete
// when useLM and ngramLM are the same object
if (ngramLM != useLM) {
delete ngramLM;
ngramLM = 0;
}
delete useLM;
// use rescored LM below
useLM = rescoreLM;
}
/*
* Compute perplexity on a text file, if requested
*/
if (pplFile) {
File file(pplFile, "r");
TextStats stats;
/*
* Send perplexity info to stdout (debug output is redirected to
* cout for the duration of the computation, then back to cerr)
*/
useLM->dout(cout);
useLM->pplFile(file, stats, escape);
useLM->dout(cerr);
cout << "file " << pplFile << ": " << stats;
}
/*
* Compute perplexity on a count file, if requested
*/
if (countFile) {
TextStats stats;
File file(countFile, "r");
/*
* Send perplexity info to stdout (debug redirected as for -ppl);
* countOrder of 0 falls back to the LM order
*/
useLM->dout(cout);
useLM->pplCountsFile(file, countOrder ? countOrder : order,
stats, escape, countEntropy);
useLM->dout(cerr);
cout << "file " << countFile << ": " << stats;
}
/*
* Rescore N-best list, if requested
*/
if (nbestFile) {
rescoreNbest(*useLM, nbestFile, NULL);
}
/*
* Rescore multiple N-best lists: the control file contains one
* N-best filename per line.
*/
if (nbestFiles) {
File file(nbestFiles, "r");
char *line;
// intentional assignment-in-condition: iterate until EOF
while (line = file.getline()) {
char *fname = strtok(line, wordSeparators);
if (!fname) continue;
RefString sentid = idFromFilename(fname);
if (writeNbestDir) {
// build "<dir>/<sentid><gzip-suffix>" for the rescored output
makeArray(char, scoreFile,
strlen(writeNbestDir) + 1
+ strlen(sentid) + strlen(GZIP_SUFFIX) + 1);
sprintf(scoreFile, "%s/%s%s", writeNbestDir, sentid,
GZIP_SUFFIX);
rescoreNbest(*useLM, fname, scoreFile);
} else {
rescoreNbest(*useLM, fname, NULL);
}
}
}
/*
* Rescore stream of N-best hyps, if requested
*/
if (rescoreFile) {
File file(rescoreFile, "r");
LM *oldLM;
if (decipherLM) {
// recognizer LM whose scores are to be subtracted out
oldLM =
new DecipherNgram(*vocab, decipherOrder, !decipherNoBackoff);
assert(oldLM != 0);
oldLM->debugme(debug);
// NOTE(review): this inner File deliberately shadows the outer
// "file"; it closes at the end of this if-block, before the
// outer rescore stream is consumed below.
File file(decipherLM, "r");
if (!oldLM->read(file, limitVocab)) {
cerr << "format error in Decipher LM\n";
exit(1);
}
} else {
/*
* Create dummy LM for the sake of rescoreFile()
*/
oldLM = new NullLM(*vocab);
assert(oldLM != 0);
}
useLM->rescoreFile(file, rescoreLMW, rescoreWTW,
*oldLM, decipherLMW, decipherWTW, escape);
#ifdef DEBUG
// only bother freeing in DEBUG builds; the process exits below anyway
delete oldLM;
#endif
}
// -gen: sample the requested number of random sentences from the LM.
if (generateSents) {
File outFile(stdout);
unsigned i;
for (i = 0; i < generateSents; i++) {
VocabString *sent = useLM->generateSentence(maxWordsPerLine,
(VocabString *)0);
Vocab::write(outFile, sent);
putchar('\n');
}
}
// -write-lm: dump the final LM to file.
if (writeLM) {
File file(writeLM, "w");
useLM->write(file);
}
// -write-vocab: dump the vocabulary to file.
if (writeVocab) {
File file(writeVocab, "w");
vocab->write(file);
}
#ifdef DEBUG
/*
 * Explicit teardown is compiled only in DEBUG builds (e.g. for leak
 * checking); release builds just exit(0) and let the OS reclaim
 * memory.  The guards avoid deleting the same object twice.
 */
if (&ngramLM->vocab != vocab) {
delete &ngramLM->vocab;
}
if (ngramLM != useLM) {
delete ngramLM;
}
delete useLM;
delete stopWords;
delete hiddenEvents;
delete classVocab;
delete vocab;
return 0;
#endif /* DEBUG */
exit(0);
}