⭐ 欢迎来到虫虫下载站! | 📦 资源下载 📁 资源专辑 ℹ️ 关于我们
⭐ 虫虫下载站

📄 ngram-count.cc

📁 这是一款很好用的工具包
💻 CC
📖 第 1 页 / 共 2 页
字号:
/*
 * ngram-count --
 *	Create and manipulate word ngram counts
 *
 */

#ifndef lint
static char Copyright[] = "Copyright (c) 1995-2004 SRI International.  All Rights Reserved.";
static char RcsId[] = "@(#)$Id: ngram-count.cc,v 1.54 2006/01/05 20:21:27 stolcke Exp $";
#endif

#include <iostream>
using namespace std;
#include <stdlib.h>
#include <locale.h>
#include <assert.h>

#include "option.h"
#include "version.h"
#include "File.h"
#include "Vocab.h"
#include "SubVocab.h"
#include "Ngram.h"
#include "VarNgram.h"
#include "TaggedNgram.h"
#include "SkipNgram.h"
#include "StopNgram.h"
#include "NgramStats.cc"
#include "TaggedNgramStats.h"
#include "StopNgramStats.h"
#include "Discount.h"
#include "Array.cc"

/*
 * Command-line settings.
 *
 * Every variable in this section is bound to a flag in the options[] table
 * further down, which main() hands to Opt_Parse(); the initializers here are
 * therefore the defaults used when a flag is not given.
 *
 * Arrays of size maxorder+1 are indexed by ngram order: element 0 is bound
 * to the unnumbered form of a flag (e.g. "gtmin", "write") and elements
 * 1..maxorder to the numbered forms (e.g. "gt1min", "write1").
 */
const unsigned maxorder = 9;		/* this is only relevant to
					 * the -gt<n> and -write<n> flags */

/* General settings and input selection */
static int version = 0;
static char *filetag = 0;
static unsigned order = 3;
static unsigned debug = 0;
static char *textFile = 0;
static char *readFile = 0;
static int readWithMincounts = 0;

/* Count output files, one slot per ngram order */
static unsigned writeOrder = 0;		/* default is all ngram orders */
static char *writeFile[maxorder+1];

/* Lower and upper Good-Turing discounting cutoffs, per ngram order */
static unsigned gtmin[maxorder+1] = {1, 1, 1, 2, 2, 2, 2, 2, 2, 2};
static unsigned gtmax[maxorder+1] = {5, 1, 7, 7, 7, 7, 7, 7, 7, 7};

/*
 * Per-order selection of the discounting method.
 * cdiscount holds the discounting constant; main() treats a value other
 * than -1 in cdiscount[0] as "constant discounting was requested" when it
 * checks for conflicting default discounting options.
 * The int arrays are boolean flags (natural, Witten-Bell, modified
 * Kneser-Ney, original Kneser-Ney, interpolated estimates).
 */
static double cdiscount[maxorder+1] = {-1, -1, -1, -1, -1, -1, -1, -1, -1, -1};
static int ndiscount[maxorder+1] = {0, 0, 0, 0, 0, 0, 0, 0, 0, 0};
static int wbdiscount[maxorder+1] = {0, 0, 0, 0, 0, 0, 0, 0, 0, 0};
static int kndiscount[maxorder+1] = {0, 0, 0, 0, 0, 0, 0, 0, 0, 0};
static int ukndiscount[maxorder+1] = {0, 0, 0, 0, 0, 0, 0, 0, 0, 0};
static int knCountsModified = 0;
static int knCountsModifyAtEnd = 0;
static int interpolate[maxorder+1] = {0, 0, 0, 0, 0, 0, 0, 0, 0, 0};

/* Discount parameter files (Good-Turing, Kneser-Ney) and LM files */
static char *gtFile[maxorder+1];
static char *knFile[maxorder+1];
static char *lmFile = 0;
static char *initLMFile = 0;

/* Vocabulary handling, model type and pruning */
static char *vocabFile = 0;
static char *noneventFile = 0;
static char *writeVocab = 0;
static int memuse = 0;
static int recompute = 0;
static int sortNgrams = 0;
static int keepunk = 0;
static char *mapUnknown = 0;
static int tagged = 0;
static int toLower = 0;
static int trustTotals = 0;
static double prune = 0.0;
static unsigned minprune = 2;
static int useFloatCounts = 0;

/* Pruning threshold for variable order ngrams; non-zero selects that model */
static double varPrune = 0.0;

/* Skip N-gram model and its EM estimation limits */
static int skipNgram = 0;
static double skipInit = 0.5;
static unsigned maxEMiters = 100;
static double minEMdelta = 0.001;

/* Stop-word vocabulary file and the meta tag for count-of-count input */
static char *stopWordFile = 0;
static char *metaTag = 0;

/*
 * Command-line option table, handed to Opt_Parse() in main().
 *
 * Each entry lists, in order: the option type (OPT_TRUE, OPT_UINT,
 * OPT_FLOAT, OPT_STRING or OPT_DOC), the flag name, the address of the
 * variable bound to the flag, and a one-line description.
 * NOTE(review): the Option layout and the exact OPT_* semantics come from
 * option.h, which is not visible here -- confirm there.
 *
 * Flags that carry an ngram order <n> in their name address element n of a
 * per-order array; the flag without a number addresses element 0.
 */
static Option options[] = {
    /* General settings and input selection */
    { OPT_TRUE, "version", &version, "print version information" },
    { OPT_UINT, "order", &order, "max ngram order" },
    { OPT_FLOAT, "varprune", &varPrune, "pruning threshold for variable order ngrams" },
    { OPT_UINT, "debug", &debug, "debugging level for LM" },
    { OPT_TRUE, "recompute", &recompute, "recompute lower-order counts by summation" },
    { OPT_TRUE, "sort", &sortNgrams, "sort ngrams output" },
    { OPT_UINT, "write-order", &writeOrder, "output ngram counts order" },
    { OPT_STRING, "tag", &filetag, "file tag to use in messages" },
    { OPT_STRING, "text", &textFile, "text file to read" },
    { OPT_STRING, "read", &readFile, "counts file to read" },
    { OPT_TRUE, "read-with-mincounts", &readWithMincounts, "apply minimum counts when reading counts file" },

    /* Count output files, overall and per ngram order */
    { OPT_STRING, "write", &writeFile[0], "counts file to write" },
    { OPT_STRING, "write1", &writeFile[1], "1gram counts file to write" },
    { OPT_STRING, "write2", &writeFile[2], "2gram counts file to write" },
    { OPT_STRING, "write3", &writeFile[3], "3gram counts file to write" },
    { OPT_STRING, "write4", &writeFile[4], "4gram counts file to write" },
    { OPT_STRING, "write5", &writeFile[5], "5gram counts file to write" },
    { OPT_STRING, "write6", &writeFile[6], "6gram counts file to write" },
    { OPT_STRING, "write7", &writeFile[7], "7gram counts file to write" },
    { OPT_STRING, "write8", &writeFile[8], "8gram counts file to write" },
    { OPT_STRING, "write9", &writeFile[9], "9gram counts file to write" },

    /* Good-Turing discounting cutoffs (min/max pairs per ngram order) */
    { OPT_UINT, "gtmin", &gtmin[0], "lower GT discounting cutoff" },
    { OPT_UINT, "gtmax", &gtmax[0], "upper GT discounting cutoff" },
    { OPT_UINT, "gt1min", &gtmin[1], "lower 1gram discounting cutoff" },
    { OPT_UINT, "gt1max", &gtmax[1], "upper 1gram discounting cutoff" },
    { OPT_UINT, "gt2min", &gtmin[2], "lower 2gram discounting cutoff" },
    { OPT_UINT, "gt2max", &gtmax[2], "upper 2gram discounting cutoff" },
    { OPT_UINT, "gt3min", &gtmin[3], "lower 3gram discounting cutoff" },
    { OPT_UINT, "gt3max", &gtmax[3], "upper 3gram discounting cutoff" },
    { OPT_UINT, "gt4min", &gtmin[4], "lower 4gram discounting cutoff" },
    { OPT_UINT, "gt4max", &gtmax[4], "upper 4gram discounting cutoff" },
    { OPT_UINT, "gt5min", &gtmin[5], "lower 5gram discounting cutoff" },
    { OPT_UINT, "gt5max", &gtmax[5], "upper 5gram discounting cutoff" },
    { OPT_UINT, "gt6min", &gtmin[6], "lower 6gram discounting cutoff" },
    { OPT_UINT, "gt6max", &gtmax[6], "upper 6gram discounting cutoff" },
    { OPT_UINT, "gt7min", &gtmin[7], "lower 7gram discounting cutoff" },
    { OPT_UINT, "gt7max", &gtmax[7], "upper 7gram discounting cutoff" },
    { OPT_UINT, "gt8min", &gtmin[8], "lower 8gram discounting cutoff" },
    { OPT_UINT, "gt8max", &gtmax[8], "upper 8gram discounting cutoff" },
    { OPT_UINT, "gt9min", &gtmin[9], "lower 9gram discounting cutoff" },
    { OPT_UINT, "gt9max", &gtmax[9], "upper 9gram discounting cutoff" },

    /* Good-Turing discount parameter files */
    { OPT_STRING, "gt", &gtFile[0], "Good-Turing discount parameter file" },
    { OPT_STRING, "gt1", &gtFile[1], "Good-Turing 1gram discounts" },
    { OPT_STRING, "gt2", &gtFile[2], "Good-Turing 2gram discounts" },
    { OPT_STRING, "gt3", &gtFile[3], "Good-Turing 3gram discounts" },
    { OPT_STRING, "gt4", &gtFile[4], "Good-Turing 4gram discounts" },
    { OPT_STRING, "gt5", &gtFile[5], "Good-Turing 5gram discounts" },
    { OPT_STRING, "gt6", &gtFile[6], "Good-Turing 6gram discounts" },
    { OPT_STRING, "gt7", &gtFile[7], "Good-Turing 7gram discounts" },
    { OPT_STRING, "gt8", &gtFile[8], "Good-Turing 8gram discounts" },
    { OPT_STRING, "gt9", &gtFile[9], "Good-Turing 9gram discounts" },

    /* Constant discounting */
    { OPT_FLOAT, "cdiscount", &cdiscount[0], "discounting constant" },
    { OPT_FLOAT, "cdiscount1", &cdiscount[1], "1gram discounting constant" },
    { OPT_FLOAT, "cdiscount2", &cdiscount[2], "2gram discounting constant" },
    { OPT_FLOAT, "cdiscount3", &cdiscount[3], "3gram discounting constant" },
    { OPT_FLOAT, "cdiscount4", &cdiscount[4], "4gram discounting constant" },
    { OPT_FLOAT, "cdiscount5", &cdiscount[5], "5gram discounting constant" },
    { OPT_FLOAT, "cdiscount6", &cdiscount[6], "6gram discounting constant" },
    { OPT_FLOAT, "cdiscount7", &cdiscount[7], "7gram discounting constant" },
    { OPT_FLOAT, "cdiscount8", &cdiscount[8], "8gram discounting constant" },
    { OPT_FLOAT, "cdiscount9", &cdiscount[9], "9gram discounting constant" },

    /* Natural discounting */
    { OPT_TRUE, "ndiscount", &ndiscount[0], "use natural discounting" },
    { OPT_TRUE, "ndiscount1", &ndiscount[1], "1gram natural discounting" },
    { OPT_TRUE, "ndiscount2", &ndiscount[2], "2gram natural discounting" },
    { OPT_TRUE, "ndiscount3", &ndiscount[3], "3gram natural discounting" },
    { OPT_TRUE, "ndiscount4", &ndiscount[4], "4gram natural discounting" },
    { OPT_TRUE, "ndiscount5", &ndiscount[5], "5gram natural discounting" },
    { OPT_TRUE, "ndiscount6", &ndiscount[6], "6gram natural discounting" },
    { OPT_TRUE, "ndiscount7", &ndiscount[7], "7gram natural discounting" },
    { OPT_TRUE, "ndiscount8", &ndiscount[8], "8gram natural discounting" },
    { OPT_TRUE, "ndiscount9", &ndiscount[9], "9gram natural discounting" },

    /* Witten-Bell discounting */
    { OPT_TRUE, "wbdiscount", &wbdiscount[0], "use Witten-Bell discounting" },
    { OPT_TRUE, "wbdiscount1", &wbdiscount[1], "1gram Witten-Bell discounting"},
    { OPT_TRUE, "wbdiscount2", &wbdiscount[2], "2gram Witten-Bell discounting"},
    { OPT_TRUE, "wbdiscount3", &wbdiscount[3], "3gram Witten-Bell discounting"},
    { OPT_TRUE, "wbdiscount4", &wbdiscount[4], "4gram Witten-Bell discounting"},
    { OPT_TRUE, "wbdiscount5", &wbdiscount[5], "5gram Witten-Bell discounting"},
    { OPT_TRUE, "wbdiscount6", &wbdiscount[6], "6gram Witten-Bell discounting"},
    { OPT_TRUE, "wbdiscount7", &wbdiscount[7], "7gram Witten-Bell discounting"},
    { OPT_TRUE, "wbdiscount8", &wbdiscount[8], "8gram Witten-Bell discounting"},
    { OPT_TRUE, "wbdiscount9", &wbdiscount[9], "9gram Witten-Bell discounting"},

    /* Modified Kneser-Ney discounting */
    { OPT_TRUE, "kndiscount", &kndiscount[0], "use modified Kneser-Ney discounting" },
    { OPT_TRUE, "kndiscount1", &kndiscount[1], "1gram modified Kneser-Ney discounting"},
    { OPT_TRUE, "kndiscount2", &kndiscount[2], "2gram modified Kneser-Ney discounting"},
    { OPT_TRUE, "kndiscount3", &kndiscount[3], "3gram modified Kneser-Ney discounting"},
    { OPT_TRUE, "kndiscount4", &kndiscount[4], "4gram modified Kneser-Ney discounting"},
    { OPT_TRUE, "kndiscount5", &kndiscount[5], "5gram modified Kneser-Ney discounting"},
    { OPT_TRUE, "kndiscount6", &kndiscount[6], "6gram modified Kneser-Ney discounting"},
    { OPT_TRUE, "kndiscount7", &kndiscount[7], "7gram modified Kneser-Ney discounting"},
    { OPT_TRUE, "kndiscount8", &kndiscount[8], "8gram modified Kneser-Ney discounting"},
    { OPT_TRUE, "kndiscount9", &kndiscount[9], "9gram modified Kneser-Ney discounting"},

    /* Original (unmodified) Kneser-Ney discounting */
    { OPT_TRUE, "ukndiscount", &ukndiscount[0], "use original Kneser-Ney discounting" },
    { OPT_TRUE, "ukndiscount1", &ukndiscount[1], "1gram original Kneser-Ney discounting"},
    { OPT_TRUE, "ukndiscount2", &ukndiscount[2], "2gram original Kneser-Ney discounting"},
    { OPT_TRUE, "ukndiscount3", &ukndiscount[3], "3gram original Kneser-Ney discounting"},
    { OPT_TRUE, "ukndiscount4", &ukndiscount[4], "4gram original Kneser-Ney discounting"},
    { OPT_TRUE, "ukndiscount5", &ukndiscount[5], "5gram original Kneser-Ney discounting"},
    { OPT_TRUE, "ukndiscount6", &ukndiscount[6], "6gram original Kneser-Ney discounting"},
    { OPT_TRUE, "ukndiscount7", &ukndiscount[7], "7gram original Kneser-Ney discounting"},
    { OPT_TRUE, "ukndiscount8", &ukndiscount[8], "8gram original Kneser-Ney discounting"},
    { OPT_TRUE, "ukndiscount9", &ukndiscount[9], "9gram original Kneser-Ney discounting"},

    /* Kneser-Ney discount parameter files */
    { OPT_STRING, "kn", &knFile[0], "Kneser-Ney discount parameter file" },
    { OPT_STRING, "kn1", &knFile[1], "Kneser-Ney 1gram discounts" },
    { OPT_STRING, "kn2", &knFile[2], "Kneser-Ney 2gram discounts" },
    { OPT_STRING, "kn3", &knFile[3], "Kneser-Ney 3gram discounts" },
    { OPT_STRING, "kn4", &knFile[4], "Kneser-Ney 4gram discounts" },
    { OPT_STRING, "kn5", &knFile[5], "Kneser-Ney 5gram discounts" },
    { OPT_STRING, "kn6", &knFile[6], "Kneser-Ney 6gram discounts" },
    { OPT_STRING, "kn7", &knFile[7], "Kneser-Ney 7gram discounts" },
    { OPT_STRING, "kn8", &knFile[8], "Kneser-Ney 8gram discounts" },
    { OPT_STRING, "kn9", &knFile[9], "Kneser-Ney 9gram discounts" },

    /* Control of Kneser-Ney count modification */
    { OPT_TRUE, "kn-counts-modified", &knCountsModified, "input counts already modified for KN smoothing"},
    { OPT_TRUE, "kn-modify-counts-at-end", &knCountsModifyAtEnd, "modify counts after discount estimation rather than before"},

    /* Interpolated estimates */
    { OPT_TRUE, "interpolate", &interpolate[0], "use interpolated estimates"},
    { OPT_TRUE, "interpolate1", &interpolate[1], "use interpolated 1gram estimates"},
    { OPT_TRUE, "interpolate2", &interpolate[2], "use interpolated 2gram estimates"},
    { OPT_TRUE, "interpolate3", &interpolate[3], "use interpolated 3gram estimates"},
    { OPT_TRUE, "interpolate4", &interpolate[4], "use interpolated 4gram estimates"},
    { OPT_TRUE, "interpolate5", &interpolate[5], "use interpolated 5gram estimates"},
    { OPT_TRUE, "interpolate6", &interpolate[6], "use interpolated 6gram estimates"},
    { OPT_TRUE, "interpolate7", &interpolate[7], "use interpolated 7gram estimates"},
    { OPT_TRUE, "interpolate8", &interpolate[8], "use interpolated 8gram estimates"},
    { OPT_TRUE, "interpolate9", &interpolate[9], "use interpolated 9gram estimates"},

    /* LM estimation and model type selection */
    { OPT_STRING, "lm", &lmFile, "LM to estimate" },
    { OPT_STRING, "init-lm", &initLMFile, "initial LM for EM estimation" },
    { OPT_TRUE, "unk", &keepunk, "keep <unk> in LM" },
    { OPT_STRING, "map-unk", &mapUnknown, "word to map unknown words to" },
    { OPT_STRING, "meta-tag", &metaTag, "meta tag used to input count-of-count information" },
    { OPT_TRUE, "float-counts", &useFloatCounts, "use fractional counts" },
    { OPT_TRUE, "tagged", &tagged, "build a tagged LM" },
    { OPT_TRUE, "skip", &skipNgram, "build a skip N-gram LM" },
    { OPT_FLOAT, "skip-init", &skipInit, "default initial skip probability" },
    { OPT_UINT, "em-iters", &maxEMiters, "max number of EM iterations" },
    { OPT_FLOAT, "em-delta", &minEMdelta, "min log likelihood delta for EM" },
    { OPT_STRING, "stop-words", &stopWordFile, "stop-word vocabulary for stop-Ngram LM" },

    /* Vocabulary handling, pruning and diagnostics */
    { OPT_TRUE, "tolower", &toLower, "map vocabulary to lowercase" },
    { OPT_TRUE, "trust-totals", &trustTotals, "trust lower-order counts for estimation" },
    { OPT_FLOAT, "prune", &prune, "prune redundant probs" },
    { OPT_UINT, "minprune", &minprune, "prune only ngrams at least this long" },
    { OPT_STRING, "vocab", &vocabFile, "vocab file" },
    { OPT_STRING, "nonevents", &noneventFile, "non-event vocabulary" },
    { OPT_STRING, "write-vocab", &writeVocab, "write vocab to file" },
    { OPT_TRUE, "memuse", &memuse, "show memory usage" },
    /* Documentation-only entry: no flag name and no bound variable */
    { OPT_DOC, 0, 0, "the default action is to write counts to stdout" }
};

int
main(int argc, char **argv)
{
    setlocale(LC_CTYPE, "");
    setlocale(LC_COLLATE, "");

    Boolean written = false;

    Opt_Parse(argc, argv, options, Opt_Number(options), 0);

    if (version) {
	printVersion(RcsId);
	exit(0);
    }

    if (useFloatCounts + tagged + skipNgram +
	(stopWordFile != 0) + (varPrune != 0.0) > 1)
    {
	cerr << "fractional counts, variable, tagged, stop-word Ngram and skip N-gram models are mutually exclusive\n";
	exit(2);
    }

    /*
     * Detect inconsistent discounting options
     */
    if (ndiscount[0] +
	wbdiscount[0] +
	(cdiscount[0] != -1.0) +
	ukndiscount[0] + 
	(knFile[0] != 0 || kndiscount[0]) +
	(gtFile[0] != 0) > 1)
    {
	cerr << "conflicting default discounting options\n";
	exit(2);
    }

    Vocab *vocab = tagged ? new TaggedVocab : new Vocab;
    assert(vocab);

    vocab->unkIsWord() = keepunk ? true : false;
    vocab->toLower() = toLower ? true : false;

    /*
     * Change unknown word string if requested
     */
    if (mapUnknown) {
	vocab->remove(vocab->unkIndex());
	vocab->unkIndex() = vocab->addWord(mapUnknown);
    }

    /*
     * Meta tag is used to input count-of-count information
     */
    if (metaTag) {
	vocab->metaTag() = metaTag;
    }

    SubVocab *stopWords = 0;

    if (stopWordFile != 0) {
	stopWords = new SubVocab(*vocab);
	assert(stopWords);
    }

    /*
     * The skip-ngram model requires count order one higher than
     * the normal model.
     */
    NgramStats *intStats =
	(stopWords != 0) ? new StopNgramStats(*vocab, *stopWords, order) :
	   tagged ? new TaggedNgramStats(*(TaggedVocab *)vocab, order) :
	      useFloatCounts ? 0 :
	         new NgramStats(*vocab, skipNgram ? order + 1 : order);
    NgramCounts<FloatCount> *floatStats =
	      !useFloatCounts ? 0 :
		 new NgramCounts<FloatCount>(*vocab, order);

#define USE_STATS(what) (useFloatCounts ? floatStats->what : intStats->what)

    if (useFloatCounts) {
	assert(floatStats != 0);
    } else {
	assert(intStats != 0);
    }

    USE_STATS(debugme(debug));

    if (vocabFile) {
	File file(vocabFile, "r");

⌨️ 快捷键说明

复制代码 Ctrl + C
搜索代码 Ctrl + F
全屏模式 F11
切换主题 Ctrl + Shift + D
显示快捷键 ?
增大字号 Ctrl + =
减小字号 Ctrl + -