📄 ngram-count.cc
字号:
/*
* ngram-count --
* Create and manipulate word ngram counts
*
*/
#ifndef lint
static char Copyright[] = "Copyright (c) 1995-2004 SRI International. All Rights Reserved.";
static char RcsId[] = "@(#)$Id: ngram-count.cc,v 1.54 2006/01/05 20:21:27 stolcke Exp $";
#endif
#include <iostream>
using namespace std;
#include <stdlib.h>
#include <locale.h>
#include <assert.h>
#include "option.h"
#include "version.h"
#include "File.h"
#include "Vocab.h"
#include "SubVocab.h"
#include "Ngram.h"
#include "VarNgram.h"
#include "TaggedNgram.h"
#include "SkipNgram.h"
#include "StopNgram.h"
#include "NgramStats.cc"
#include "TaggedNgramStats.h"
#include "StopNgramStats.h"
#include "Discount.h"
#include "Array.cc"
/*
 * Highest ngram order that gets its own per-order command-line flags.
 * This is only relevant to the -gt<n> and -write<n> style flags.
 *
 * Every array of size maxorder+1 below is indexed by ngram order:
 * slot 0 is bound to the flag without an order suffix (e.g. -gtmin),
 * slot n to the flag carrying order n (e.g. -gt3min).
 */
const unsigned maxorder = 9;

/*
 * General settings
 */
static int version = 0;			/* -version */
static char *filetag = 0;		/* -tag */
static unsigned order = 3;		/* -order */
static unsigned debug = 0;		/* -debug */

/*
 * Count input and output
 */
static char *textFile = 0;		/* -text */
static char *readFile = 0;		/* -read */
static int readWithMincounts = 0;	/* -read-with-mincounts */
static unsigned writeOrder = 0;		/* -write-order; default is all
					 * ngram orders */
static char *writeFile[maxorder+1];	/* -write, -write<n> */

/*
 * Good-Turing discounting cutoffs (-gtmin/-gtmax, -gt<n>min/-gt<n>max)
 */
static unsigned gtmin[maxorder+1] =
    {1, 1, 1, 2, 2, 2, 2, 2, 2, 2};
static unsigned gtmax[maxorder+1] =
    {5, 1, 7, 7, 7, 7, 7, 7, 7, 7};

/*
 * Discounting constants (-cdiscount, -cdiscount<n>).
 * main() treats a value of -1 in slot 0 as "not specified".
 */
static double cdiscount[maxorder+1] =
    {-1, -1, -1, -1, -1, -1, -1, -1, -1, -1};

/*
 * Discounting method switches, all off by default
 */
static int ndiscount[maxorder+1] = {0};		/* -ndiscount<n> */
static int wbdiscount[maxorder+1] = {0};	/* -wbdiscount<n> */
static int kndiscount[maxorder+1] = {0};	/* -kndiscount<n> */
static int ukndiscount[maxorder+1] = {0};	/* -ukndiscount<n> */
static int knCountsModified = 0;	/* -kn-counts-modified */
static int knCountsModifyAtEnd = 0;	/* -kn-modify-counts-at-end */
static int interpolate[maxorder+1] = {0};	/* -interpolate<n> */

/*
 * Discount parameter files
 */
static char *gtFile[maxorder+1];	/* -gt, -gt<n> */
static char *knFile[maxorder+1];	/* -kn, -kn<n> */

/*
 * LM and vocabulary files
 */
static char *lmFile = 0;		/* -lm */
static char *initLMFile = 0;		/* -init-lm */
static char *vocabFile = 0;		/* -vocab */
static char *noneventFile = 0;		/* -nonevents */
static char *writeVocab = 0;		/* -write-vocab */

/*
 * Miscellaneous switches
 */
static int memuse = 0;			/* -memuse */
static int recompute = 0;		/* -recompute */
static int sortNgrams = 0;		/* -sort */
static int keepunk = 0;			/* -unk */
static char *mapUnknown = 0;		/* -map-unk */
static int tagged = 0;			/* -tagged */
static int toLower = 0;			/* -tolower */
static int trustTotals = 0;		/* -trust-totals */

/*
 * Pruning
 */
static double prune = 0.0;		/* -prune */
static unsigned minprune = 2;		/* -minprune */

/*
 * Alternative model types
 */
static int useFloatCounts = 0;		/* -float-counts */
static double varPrune = 0.0;		/* -varprune */
static int skipNgram = 0;		/* -skip */
static double skipInit = 0.5;		/* -skip-init */
static unsigned maxEMiters = 100;	/* -em-iters */
static double minEMdelta = 0.001;	/* -em-delta */
static char *stopWordFile = 0;		/* -stop-words */
static char *metaTag = 0;		/* -meta-tag */
/*
 * Command-line option table handed to Opt_Parse() in main().
 *
 * Each entry gives the option type, the flag name (without the leading
 * dash), the address of the variable that receives the value, and the
 * help text.  Flags ending in a digit 1..9 address the matching slot of
 * the per-order arrays; the flag without a digit addresses slot 0.
 *
 * Note: the Good-Turing entries must take the address of the gtmin[],
 * gtmax[] and gtFile[] arrays ("&gtmin[0]" etc.).  Take care that the
 * "&gt" prefix is not mangled into ">" by HTML entity decoding.
 */
static Option options[] = {
    { OPT_TRUE, "version", &version, "print version information" },
    { OPT_UINT, "order", &order, "max ngram order" },
    { OPT_FLOAT, "varprune", &varPrune, "pruning threshold for variable order ngrams" },
    { OPT_UINT, "debug", &debug, "debugging level for LM" },
    { OPT_TRUE, "recompute", &recompute, "recompute lower-order counts by summation" },
    { OPT_TRUE, "sort", &sortNgrams, "sort ngrams output" },
    { OPT_UINT, "write-order", &writeOrder, "output ngram counts order" },
    { OPT_STRING, "tag", &filetag, "file tag to use in messages" },
    { OPT_STRING, "text", &textFile, "text file to read" },
    { OPT_STRING, "read", &readFile, "counts file to read" },
    { OPT_TRUE, "read-with-mincounts", &readWithMincounts, "apply minimum counts when reading counts file" },

    /* count output files, overall and per ngram order */
    { OPT_STRING, "write", &writeFile[0], "counts file to write" },
    { OPT_STRING, "write1", &writeFile[1], "1gram counts file to write" },
    { OPT_STRING, "write2", &writeFile[2], "2gram counts file to write" },
    { OPT_STRING, "write3", &writeFile[3], "3gram counts file to write" },
    { OPT_STRING, "write4", &writeFile[4], "4gram counts file to write" },
    { OPT_STRING, "write5", &writeFile[5], "5gram counts file to write" },
    { OPT_STRING, "write6", &writeFile[6], "6gram counts file to write" },
    { OPT_STRING, "write7", &writeFile[7], "7gram counts file to write" },
    { OPT_STRING, "write8", &writeFile[8], "8gram counts file to write" },
    { OPT_STRING, "write9", &writeFile[9], "9gram counts file to write" },

    /* Good-Turing discounting cutoffs */
    { OPT_UINT, "gtmin", &gtmin[0], "lower GT discounting cutoff" },
    { OPT_UINT, "gtmax", &gtmax[0], "upper GT discounting cutoff" },
    { OPT_UINT, "gt1min", &gtmin[1], "lower 1gram discounting cutoff" },
    { OPT_UINT, "gt1max", &gtmax[1], "upper 1gram discounting cutoff" },
    { OPT_UINT, "gt2min", &gtmin[2], "lower 2gram discounting cutoff" },
    { OPT_UINT, "gt2max", &gtmax[2], "upper 2gram discounting cutoff" },
    { OPT_UINT, "gt3min", &gtmin[3], "lower 3gram discounting cutoff" },
    { OPT_UINT, "gt3max", &gtmax[3], "upper 3gram discounting cutoff" },
    { OPT_UINT, "gt4min", &gtmin[4], "lower 4gram discounting cutoff" },
    { OPT_UINT, "gt4max", &gtmax[4], "upper 4gram discounting cutoff" },
    { OPT_UINT, "gt5min", &gtmin[5], "lower 5gram discounting cutoff" },
    { OPT_UINT, "gt5max", &gtmax[5], "upper 5gram discounting cutoff" },
    { OPT_UINT, "gt6min", &gtmin[6], "lower 6gram discounting cutoff" },
    { OPT_UINT, "gt6max", &gtmax[6], "upper 6gram discounting cutoff" },
    { OPT_UINT, "gt7min", &gtmin[7], "lower 7gram discounting cutoff" },
    { OPT_UINT, "gt7max", &gtmax[7], "upper 7gram discounting cutoff" },
    { OPT_UINT, "gt8min", &gtmin[8], "lower 8gram discounting cutoff" },
    { OPT_UINT, "gt8max", &gtmax[8], "upper 8gram discounting cutoff" },
    { OPT_UINT, "gt9min", &gtmin[9], "lower 9gram discounting cutoff" },
    { OPT_UINT, "gt9max", &gtmax[9], "upper 9gram discounting cutoff" },

    /* Good-Turing discount parameter files */
    { OPT_STRING, "gt", &gtFile[0], "Good-Turing discount parameter file" },
    { OPT_STRING, "gt1", &gtFile[1], "Good-Turing 1gram discounts" },
    { OPT_STRING, "gt2", &gtFile[2], "Good-Turing 2gram discounts" },
    { OPT_STRING, "gt3", &gtFile[3], "Good-Turing 3gram discounts" },
    { OPT_STRING, "gt4", &gtFile[4], "Good-Turing 4gram discounts" },
    { OPT_STRING, "gt5", &gtFile[5], "Good-Turing 5gram discounts" },
    { OPT_STRING, "gt6", &gtFile[6], "Good-Turing 6gram discounts" },
    { OPT_STRING, "gt7", &gtFile[7], "Good-Turing 7gram discounts" },
    { OPT_STRING, "gt8", &gtFile[8], "Good-Turing 8gram discounts" },
    { OPT_STRING, "gt9", &gtFile[9], "Good-Turing 9gram discounts" },

    /* constant discounting */
    { OPT_FLOAT, "cdiscount", &cdiscount[0], "discounting constant" },
    { OPT_FLOAT, "cdiscount1", &cdiscount[1], "1gram discounting constant" },
    { OPT_FLOAT, "cdiscount2", &cdiscount[2], "2gram discounting constant" },
    { OPT_FLOAT, "cdiscount3", &cdiscount[3], "3gram discounting constant" },
    { OPT_FLOAT, "cdiscount4", &cdiscount[4], "4gram discounting constant" },
    { OPT_FLOAT, "cdiscount5", &cdiscount[5], "5gram discounting constant" },
    { OPT_FLOAT, "cdiscount6", &cdiscount[6], "6gram discounting constant" },
    { OPT_FLOAT, "cdiscount7", &cdiscount[7], "7gram discounting constant" },
    { OPT_FLOAT, "cdiscount8", &cdiscount[8], "8gram discounting constant" },
    { OPT_FLOAT, "cdiscount9", &cdiscount[9], "9gram discounting constant" },

    /* natural discounting */
    { OPT_TRUE, "ndiscount", &ndiscount[0], "use natural discounting" },
    { OPT_TRUE, "ndiscount1", &ndiscount[1], "1gram natural discounting" },
    { OPT_TRUE, "ndiscount2", &ndiscount[2], "2gram natural discounting" },
    { OPT_TRUE, "ndiscount3", &ndiscount[3], "3gram natural discounting" },
    { OPT_TRUE, "ndiscount4", &ndiscount[4], "4gram natural discounting" },
    { OPT_TRUE, "ndiscount5", &ndiscount[5], "5gram natural discounting" },
    { OPT_TRUE, "ndiscount6", &ndiscount[6], "6gram natural discounting" },
    { OPT_TRUE, "ndiscount7", &ndiscount[7], "7gram natural discounting" },
    { OPT_TRUE, "ndiscount8", &ndiscount[8], "8gram natural discounting" },
    { OPT_TRUE, "ndiscount9", &ndiscount[9], "9gram natural discounting" },

    /* Witten-Bell discounting */
    { OPT_TRUE, "wbdiscount", &wbdiscount[0], "use Witten-Bell discounting" },
    { OPT_TRUE, "wbdiscount1", &wbdiscount[1], "1gram Witten-Bell discounting" },
    { OPT_TRUE, "wbdiscount2", &wbdiscount[2], "2gram Witten-Bell discounting" },
    { OPT_TRUE, "wbdiscount3", &wbdiscount[3], "3gram Witten-Bell discounting" },
    { OPT_TRUE, "wbdiscount4", &wbdiscount[4], "4gram Witten-Bell discounting" },
    { OPT_TRUE, "wbdiscount5", &wbdiscount[5], "5gram Witten-Bell discounting" },
    { OPT_TRUE, "wbdiscount6", &wbdiscount[6], "6gram Witten-Bell discounting" },
    { OPT_TRUE, "wbdiscount7", &wbdiscount[7], "7gram Witten-Bell discounting" },
    { OPT_TRUE, "wbdiscount8", &wbdiscount[8], "8gram Witten-Bell discounting" },
    { OPT_TRUE, "wbdiscount9", &wbdiscount[9], "9gram Witten-Bell discounting" },

    /* modified Kneser-Ney discounting */
    { OPT_TRUE, "kndiscount", &kndiscount[0], "use modified Kneser-Ney discounting" },
    { OPT_TRUE, "kndiscount1", &kndiscount[1], "1gram modified Kneser-Ney discounting" },
    { OPT_TRUE, "kndiscount2", &kndiscount[2], "2gram modified Kneser-Ney discounting" },
    { OPT_TRUE, "kndiscount3", &kndiscount[3], "3gram modified Kneser-Ney discounting" },
    { OPT_TRUE, "kndiscount4", &kndiscount[4], "4gram modified Kneser-Ney discounting" },
    { OPT_TRUE, "kndiscount5", &kndiscount[5], "5gram modified Kneser-Ney discounting" },
    { OPT_TRUE, "kndiscount6", &kndiscount[6], "6gram modified Kneser-Ney discounting" },
    { OPT_TRUE, "kndiscount7", &kndiscount[7], "7gram modified Kneser-Ney discounting" },
    { OPT_TRUE, "kndiscount8", &kndiscount[8], "8gram modified Kneser-Ney discounting" },
    { OPT_TRUE, "kndiscount9", &kndiscount[9], "9gram modified Kneser-Ney discounting" },

    /* original (unmodified) Kneser-Ney discounting */
    { OPT_TRUE, "ukndiscount", &ukndiscount[0], "use original Kneser-Ney discounting" },
    { OPT_TRUE, "ukndiscount1", &ukndiscount[1], "1gram original Kneser-Ney discounting" },
    { OPT_TRUE, "ukndiscount2", &ukndiscount[2], "2gram original Kneser-Ney discounting" },
    { OPT_TRUE, "ukndiscount3", &ukndiscount[3], "3gram original Kneser-Ney discounting" },
    { OPT_TRUE, "ukndiscount4", &ukndiscount[4], "4gram original Kneser-Ney discounting" },
    { OPT_TRUE, "ukndiscount5", &ukndiscount[5], "5gram original Kneser-Ney discounting" },
    { OPT_TRUE, "ukndiscount6", &ukndiscount[6], "6gram original Kneser-Ney discounting" },
    { OPT_TRUE, "ukndiscount7", &ukndiscount[7], "7gram original Kneser-Ney discounting" },
    { OPT_TRUE, "ukndiscount8", &ukndiscount[8], "8gram original Kneser-Ney discounting" },
    { OPT_TRUE, "ukndiscount9", &ukndiscount[9], "9gram original Kneser-Ney discounting" },

    /* Kneser-Ney discount parameter files */
    { OPT_STRING, "kn", &knFile[0], "Kneser-Ney discount parameter file" },
    { OPT_STRING, "kn1", &knFile[1], "Kneser-Ney 1gram discounts" },
    { OPT_STRING, "kn2", &knFile[2], "Kneser-Ney 2gram discounts" },
    { OPT_STRING, "kn3", &knFile[3], "Kneser-Ney 3gram discounts" },
    { OPT_STRING, "kn4", &knFile[4], "Kneser-Ney 4gram discounts" },
    { OPT_STRING, "kn5", &knFile[5], "Kneser-Ney 5gram discounts" },
    { OPT_STRING, "kn6", &knFile[6], "Kneser-Ney 6gram discounts" },
    { OPT_STRING, "kn7", &knFile[7], "Kneser-Ney 7gram discounts" },
    { OPT_STRING, "kn8", &knFile[8], "Kneser-Ney 8gram discounts" },
    { OPT_STRING, "kn9", &knFile[9], "Kneser-Ney 9gram discounts" },
    { OPT_TRUE, "kn-counts-modified", &knCountsModified, "input counts already modified for KN smoothing" },
    { OPT_TRUE, "kn-modify-counts-at-end", &knCountsModifyAtEnd, "modify counts after discount estimation rather than before" },

    /* interpolated estimates */
    { OPT_TRUE, "interpolate", &interpolate[0], "use interpolated estimates" },
    { OPT_TRUE, "interpolate1", &interpolate[1], "use interpolated 1gram estimates" },
    { OPT_TRUE, "interpolate2", &interpolate[2], "use interpolated 2gram estimates" },
    { OPT_TRUE, "interpolate3", &interpolate[3], "use interpolated 3gram estimates" },
    { OPT_TRUE, "interpolate4", &interpolate[4], "use interpolated 4gram estimates" },
    { OPT_TRUE, "interpolate5", &interpolate[5], "use interpolated 5gram estimates" },
    { OPT_TRUE, "interpolate6", &interpolate[6], "use interpolated 6gram estimates" },
    { OPT_TRUE, "interpolate7", &interpolate[7], "use interpolated 7gram estimates" },
    { OPT_TRUE, "interpolate8", &interpolate[8], "use interpolated 8gram estimates" },
    { OPT_TRUE, "interpolate9", &interpolate[9], "use interpolated 9gram estimates" },

    /* LM estimation and model type */
    { OPT_STRING, "lm", &lmFile, "LM to estimate" },
    { OPT_STRING, "init-lm", &initLMFile, "initial LM for EM estimation" },
    { OPT_TRUE, "unk", &keepunk, "keep <unk> in LM" },
    { OPT_STRING, "map-unk", &mapUnknown, "word to map unknown words to" },
    { OPT_STRING, "meta-tag", &metaTag, "meta tag used to input count-of-count information" },
    { OPT_TRUE, "float-counts", &useFloatCounts, "use fractional counts" },
    { OPT_TRUE, "tagged", &tagged, "build a tagged LM" },
    { OPT_TRUE, "skip", &skipNgram, "build a skip N-gram LM" },
    { OPT_FLOAT, "skip-init", &skipInit, "default initial skip probability" },
    { OPT_UINT, "em-iters", &maxEMiters, "max number of EM iterations" },
    { OPT_FLOAT, "em-delta", &minEMdelta, "min log likelihood delta for EM" },
    { OPT_STRING, "stop-words", &stopWordFile, "stop-word vocabulary for stop-Ngram LM" },
    { OPT_TRUE, "tolower", &toLower, "map vocabulary to lowercase" },
    { OPT_TRUE, "trust-totals", &trustTotals, "trust lower-order counts for estimation" },
    { OPT_FLOAT, "prune", &prune, "prune redundant probs" },
    { OPT_UINT, "minprune", &minprune, "prune only ngrams at least this long" },

    /* vocabulary handling and diagnostics */
    { OPT_STRING, "vocab", &vocabFile, "vocab file" },
    { OPT_STRING, "nonevents", &noneventFile, "non-event vocabulary" },
    { OPT_STRING, "write-vocab", &writeVocab, "write vocab to file" },
    { OPT_TRUE, "memuse", &memuse, "show memory usage" },
    { OPT_DOC, 0, 0, "the default action is to write counts to stdout" }
};
int
main(int argc, char **argv)
{
setlocale(LC_CTYPE, "");
setlocale(LC_COLLATE, "");
Boolean written = false;
Opt_Parse(argc, argv, options, Opt_Number(options), 0);
if (version) {
printVersion(RcsId);
exit(0);
}
if (useFloatCounts + tagged + skipNgram +
(stopWordFile != 0) + (varPrune != 0.0) > 1)
{
cerr << "fractional counts, variable, tagged, stop-word Ngram and skip N-gram models are mutually exclusive\n";
exit(2);
}
/*
* Detect inconsistent discounting options
*/
if (ndiscount[0] +
wbdiscount[0] +
(cdiscount[0] != -1.0) +
ukndiscount[0] +
(knFile[0] != 0 || kndiscount[0]) +
(gtFile[0] != 0) > 1)
{
cerr << "conflicting default discounting options\n";
exit(2);
}
Vocab *vocab = tagged ? new TaggedVocab : new Vocab;
assert(vocab);
vocab->unkIsWord() = keepunk ? true : false;
vocab->toLower() = toLower ? true : false;
/*
* Change unknown word string if requested
*/
if (mapUnknown) {
vocab->remove(vocab->unkIndex());
vocab->unkIndex() = vocab->addWord(mapUnknown);
}
/*
* Meta tag is used to input count-of-count information
*/
if (metaTag) {
vocab->metaTag() = metaTag;
}
SubVocab *stopWords = 0;
if (stopWordFile != 0) {
stopWords = new SubVocab(*vocab);
assert(stopWords);
}
/*
* The skip-ngram model requires count order one higher than
* the normal model.
*/
NgramStats *intStats =
(stopWords != 0) ? new StopNgramStats(*vocab, *stopWords, order) :
tagged ? new TaggedNgramStats(*(TaggedVocab *)vocab, order) :
useFloatCounts ? 0 :
new NgramStats(*vocab, skipNgram ? order + 1 : order);
NgramCounts<FloatCount> *floatStats =
!useFloatCounts ? 0 :
new NgramCounts<FloatCount>(*vocab, order);
#define USE_STATS(what) (useFloatCounts ? floatStats->what : intStats->what)
if (useFloatCounts) {
assert(floatStats != 0);
} else {
assert(intStats != 0);
}
USE_STATS(debugme(debug));
if (vocabFile) {
File file(vocabFile, "r");
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -