📄 fngram.cc

📁 这是一款很好用的工具包
💻 CC
字号:
/* * fngram -- *	Create and manipulate fngram models * */#ifndef lintstatic char Copyright[] = "Copyright (c) 1995-2006 SRI International.  All Rights Reserved.";static char RcsId[] = "@(#)$Id: fngram.cc,v 1.72 2006/01/09 18:33:31 stolcke Exp $";#endif#include <iostream>using namespace std;#include <stdio.h>#include <stdlib.h>#include <locale.h>#include <assert.h>#ifndef _MSC_VER#include <unistd.h>#endif#include <time.h>#ifndef EXCLUDE_CONTRIBextern "C" {	void srand48(long);           /* might be missing from math.h or stdlib.h */}#include "option.h"#include "version.h"#include "File.h"#include "Vocab.h"#include "SubVocab.h"#include "NBest.h"#include "Ngram.h"#include "FNgram.h"#include "FNgramStats.cc"#include "FNgramSpecs.cc"#include "NullLM.h"#include "DecipherNgram.h"#include "hexdec.h"static int version = 0;static unsigned order = defaultNgramOrder;static unsigned debug = 0;static char *pplFile = 0;static char *escape = 0;static char *countFile = 0;static unsigned countOrder = 0;static char *vocabFile = 0;static char *nonEvent = 0;static char *noneventFile = 0;static int reverseSents = 0;static int writeLM  = 0;static char *writeVocab  = 0;static int memuse = 0;static int skipOOVs = 0;static int seed = 0;  /* default dynamically generated in main() */static int toLower = 0;static int keepunk = 0;static int keepnull = 1;static char *noiseTag = 0;static char *noiseVocabFile = 0;static int virtualBeginSentence = 1;static int virtualEndSentence = 1;static int noScoreSentenceBoundaryMarks = 0;/* * N-Best related variables */static char *factorFile = 0;static char *nbestFile = 0;static unsigned maxNbest = 0;static char *rescoreFile = 0;static double rescoreLMW = 8.0;static double rescoreWTW = 0.0;static int combineLMScores = 1;static Option options[] = {    { OPT_TRUE, "version", &version, "print version information" },    { OPT_STRING, "factor-file", &factorFile, "build a factored LM, use factors given in file" },    { OPT_UINT, "debug", &debug, "debugging level for lm" },    { OPT_TRUE, "skipoovs", &skipOOVs, "skip n-gram contexts containing OOVs" },    { OPT_TRUE, "unk", &keepunk, "vocabulary contains <unk>" },    { OPT_FALSE, "nonull", &keepnull, "remove <NULL> in LM" },    { OPT_TRUE, "tolower", &toLower, "map vocabulary to lowercase" },    { OPT_STRING, "ppl", &pplFile, "text file to compute perplexity from" },    { OPT_STRING, "escape", &escape, "escape prefix to pass data through -ppl" },    { OPT_INT, "seed", &seed, "seed for randomization" },    { OPT_STRING, "vocab", &vocabFile, "vocab file" },    { OPT_STRING, "non-event", &nonEvent, "non-event word" },    { OPT_STRING, "nonevents", &noneventFile, "non-event vocabulary file" },    { OPT_FALSE, "no-virtual-begin-sentence", &virtualBeginSentence, "Do *not* use a virtual start sentence context at the sentence begin"},    { OPT_FALSE, "no-virtual-end-sentence", &virtualEndSentence, "Do *not* use a virtual end sentence context at the sentence end"},    { OPT_TRUE, "no-score-sentence-marks", &noScoreSentenceBoundaryMarks, "Do *not* score the sentence boundary marks <s> </s>, score only words in-between"},    { OPT_TRUE, "write-lm", &writeLM, "re-write LM to file" },    { OPT_STRING, "write-vocab", &writeVocab, "write LM vocab to file" },    // not currently implemented    //    { OPT_TRUE, "memuse", &memuse, "show memory usage" },    { OPT_STRING, "rescore", &rescoreFile, "hyp stream input file to rescore" },    { OPT_FALSE, "separate-lm-scores", &combineLMScores, "print separate lm scores in n-best file" },    { OPT_FLOAT, "rescore-lmw", &rescoreLMW, "rescoring LM weight" },    { OPT_FLOAT, "rescore-wtw", &rescoreWTW, "rescoring word transition weight" },    { OPT_STRING, "noise", &noiseTag, "noise tag to skip" },    { OPT_STRING, "noise-vocab", &noiseVocabFile, "noise vocabulary to skip" },};intmain(int argc, char **argv){    setlocale(LC_CTYPE, "");    setlocale(LC_COLLATE, "");    /* set default seed for randomization */#ifndef _MSC_VER    seed = time(NULL) + getpid();#else	seed = time(NULL);#endif    // print 0x in front of hex numbers.    SHOWBASE(cout);    SHOWBASE(cerr);    Opt_Parse(argc, argv, options, Opt_Number(options), 0);    if (version) {	printVersion(RcsId);	exit(0);    }    /*     * Set random seed     */    srand48((long)seed);    /*     * Construct language model     */    if (factorFile == 0) {	fprintf(stderr,"Error: must specify factor file\n");	exit(-1);    }    FactoredVocab *vocab = new FactoredVocab;    assert(vocab != 0);    FNgramSpecs<FNgramCount>* fnSpecs = 0;     File f(factorFile,"r");    fnSpecs = new FNgramSpecs<FNgramCount>(f,*vocab,debug);    if (!fnSpecs) {	fprintf(stderr,"Error creating fnspecs object");	exit(-1);    }        vocab->unkIsWord() = keepunk ? true : false;    vocab->nullIsWord() = keepnull ? true : false;    vocab->toLower() = toLower ? true : false;    // for now, load in the stats object since we need the counts    // to make decisions. Ultimately, this will be entirely contained    // within the LM file.    FNgramStats *factoredStats = new FNgramStats(*vocab, *fnSpecs);    assert(factoredStats != 0);    factoredStats->debugme(debug);    if (vocabFile) {	File file(vocabFile, "r");	factoredStats->vocab.read(file);	factoredStats->openVocab = false;    }    if (noneventFile) {	/*	 * create temporary sub-vocabulary for non-event words	 */	SubVocab nonEvents(*vocab);	File file(noneventFile, "r");	nonEvents.read(file);	vocab->addNonEvents(nonEvents);    }    if (nonEvent) {	vocab->addNonEvent(nonEvent);    }    FNgram* fngramLM = new FNgram(*vocab,*fnSpecs);    assert(fngramLM != 0);    fngramLM->debugme(debug);    fngramLM->virtualBeginSentence = virtualBeginSentence ? true : false;    fngramLM->virtualEndSentence = virtualEndSentence ? true : false;    fngramLM->noScoreSentenceBoundaryMarks = noScoreSentenceBoundaryMarks ? true : false;    if (skipOOVs) {	fngramLM->skipOOVs = true;    }    /*     * Read just a single LM     */    // readin the counts, we need to do this for now.    // TODO: change so that counts are not needed for ppl/rescoring.    if (!factoredStats->read()) {	cerr << "error reading in counts in factor file\n";	exit(1);    }    // We need to do this here so that we get the    // same GBO strategy that we got when the LM was estimated.    // TODO: put the resulting counts used when the LM was trained    // into the lm file, making the LM file self contained.    factoredStats->estimateDiscounts();    factoredStats->computeCardinalityFunctions();    factoredStats->sumCounts();    if (!fngramLM->read()) {	cerr << "format error in lm file\n";	exit(1);    }    /*     * Reverse words in scoring     */    if (reverseSents) {	fngramLM->reverseWords = true;    }    /*     * Skip noise tags in scoring     */    if (noiseVocabFile) {	File file(noiseVocabFile, "r");	fngramLM->noiseVocab.read(file);    }    if (noiseTag) {				/* backward compatibility */	fngramLM->noiseVocab.addWord(noiseTag);    }    if (memuse && 0) {      // TODO: get all the memuse stuff working in all factored model code	MemStats memuse;	fngramLM->memStats(memuse);	memuse.print();    }    /*     * Compute perplexity on a text file, if requested     */    if (pplFile) {	File file(pplFile, "r");	TextStats stats;	/*	 * Send perplexity info to stdout 	 */	fngramLM->dout(cout);	fngramLM->pplFile(file, stats, escape);	fngramLM->pplPrint(cout, pplFile);	fngramLM->dout(cerr);    }    /*     * Compute perplexity on a count file, if requested     */    if (countFile && 0) { // TODO: not yet implemented	TextStats stats;	File file(countFile, "r");	/*	 * Send perplexity info to stdout 	 */	fngramLM->dout(cout);	fngramLM->pplCountsFile(file, countOrder ? countOrder : order,							    stats, escape);	fngramLM->dout(cerr);	cout << "file " << countFile << ": " << stats;    }    // TODO: add generate option.    /*     * Rescore stream of N-best hyps, if requested     */    if (rescoreFile) {	File file(rescoreFile, "r");	NullLM nullLM(fngramLM->vocab);	fngramLM->combineLMScores = combineLMScores;	fngramLM->rescoreFile(file, rescoreLMW, rescoreWTW, 			      nullLM, 0, 0,			      escape);    }    if (writeLM) {	fngramLM->write();    }    if (writeVocab) {	File file(writeVocab, "w");	vocab->write(file);    }#ifdef DEBUG    delete fngramLM;    delete vocab;    delete fnSpecs;    delete factoredStats;    return 0;#endif /* DEBUG */    exit(0);}#else /* EXCLUDE_CONTRIB_END */intmain(int argc, char **argv){    cerr << "Third-party FLM support not included.\n";    exit(1);}#endif /* INCLUDE_CONTRIB */
⌨️ 快捷键说明

复制代码 Ctrl + C
搜索代码 Ctrl + F
全屏模式 F11
切换主题 Ctrl + Shift + D
显示快捷键 ?
增大字号 Ctrl + =
减小字号 Ctrl + -