ngram-count.cc
        USE_STATS(vocab.read(file));
        USE_STATS(openVocab) = false;
    }

    if (stopWordFile) {
        File file(stopWordFile, "r");
        stopWords->read(file);
    }

    if (noneventFile) {
        /*
         * create temporary sub-vocabulary for non-event words
         */
        SubVocab nonEvents(USE_STATS(vocab));

        File file(noneventFile, "r");
        nonEvents.read(file);

        USE_STATS(vocab).addNonEvents(nonEvents);
    }
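    /*
     * Non-event words remain valid *context* tokens but are excluded from
     * the set of predicted events.  A typical (hypothetical) example:
     * listing <s> in the non-event file keeps the sentence-start tag from
     * ever being predicted while still letting it condition following words.
     */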
    if (readFile) {
        File file(readFile, "r");

        if (readWithMincounts) {
            makeArray(unsigned, minCounts, order);

            /* construct min-counts array from -gtNmin options */
            unsigned i;
            for (i = 0; i < order && i < maxorder; i ++) {
                minCounts[i] = gtmin[i + 1];
            }
            for ( ; i < order; i ++) {
                minCounts[i] = gtmin[0];
            }
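            /*
             * Illustration (hypothetical values): with order = 5 and
             * maxorder = 3, the loops above fill
             *   minCounts = { gtmin[1], gtmin[2], gtmin[3], gtmin[0], gtmin[0] },
             * i.e. orders 1..3 use their per-order -gtNmin thresholds and
             * any higher orders fall back to the gtmin[0] default.
             */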
            USE_STATS(readMinCounts(file, order, minCounts));
        } else {
            USE_STATS(read(file));
        }
    }

    if (textFile) {
        File file(textFile, "r");
        USE_STATS(countFile(file));
    }

    if (memuse) {
        MemStats memuse;

        USE_STATS(memStats(memuse));
        memuse.print();
    }

    if (recompute) {
        if (useFloatCounts)
            floatStats->sumCounts(order);
        else
            intStats->sumCounts(order);
    }

    unsigned int i;

    for (i = 1; i <= maxorder; i++) {
        if (writeFile[i]) {
            File file(writeFile[i], "w");

            USE_STATS(write(file, i, sortNgrams));
            written = true;
        }
    }

    /*
     * While ngrams themselves can have order 0 (they will always be empty)
     * we need order >= 1 for LM estimation.
     */
    if (order == 0) {
        cerr << "LM order must be positive -- set to 1\n";
        order = 1;
    }
    /*
     * This stores the discounting parameters for the various orders.
     * Note this is only needed when estimating an LM.
     */
    Discount **discounts = new Discount *[order];
    assert(discounts != 0);

    for (i = 0; i < order; i ++) {
        discounts[i] = 0;
    }

    /*
     * Estimate discounting parameters.
     * Note this is only required if
     *  - the user wants them written to a file, or
     *  - we want to estimate an LM later.
     */
    for (i = 1; i <= order; i++) {
        /*
         * Detect inconsistent options for this order
         */
        if (i <= maxorder &&
            ndiscount[i] + wbdiscount[i] + (cdiscount[i] != -1.0) +
            ukndiscount[i] + (knFile[i] != 0 || kndiscount[i]) +
            (gtFile[i] != 0) > 1)
        {
            cerr << "conflicting discounting options for order " << i << endl;
            exit(2);
        }
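        /*
         * For instance (hypothetical flags), specifying both -wbdiscount2
         * and -kndiscount2 makes two of the boolean terms above nonzero
         * for i == 2, so their sum exceeds 1 and we bail out.
         */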
        /*
         * Inherit default discounting method where needed
         */
        if (i <= maxorder &&
            !ndiscount[i] && !wbdiscount[i] && cdiscount[i] == -1.0 &&
            !ukndiscount[i] && knFile[i] == 0 && !kndiscount[i] &&
            gtFile[i] == 0)
        {
            if (ndiscount[0]) ndiscount[i] = ndiscount[0];
            else if (wbdiscount[0]) wbdiscount[i] = wbdiscount[0];
            else if (cdiscount[0] != -1.0) cdiscount[i] = cdiscount[0];
            else if (ukndiscount[0]) ukndiscount[i] = ukndiscount[0];
            else if (kndiscount[0]) kndiscount[i] = kndiscount[0];

            if (knFile[0] != 0) knFile[i] = knFile[0];
            else if (gtFile[0] != 0) gtFile[i] = gtFile[0];
        }
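        /*
         * Example (hypothetical invocation): with a plain -kndiscount and no
         * order-specific flags, only kndiscount[0] is set, so every order i
         * without its own method inherits it here and ends up using
         * modified Kneser-Ney below.
         */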
        /*
         * Choose the discounting method to use.
         *
         * Also, check for any discounting parameter files.
         * These have a dual interpretation:
         * if we're not estimating a new LM, simply WRITE the parameters
         * out; otherwise try to READ them from these files.
         *
         * Note: Test for ukndiscount[] before knFile[] so that combined use
         * of -ukndiscountN and -knfileN will do the right thing.
         */
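        /*
         * Sketch of the two modes (hypothetical file names):
         *
         *   ngram-count -read counts -kn2 kn2.params            # no -lm:
         *       estimates the order-2 discounts and WRITEs kn2.params
         *   ngram-count -read counts -kn2 kn2.params -lm out.lm # with -lm:
         *       READs the discounts back from kn2.params
         */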
        unsigned useorder = (i > maxorder) ? 0 : i;
        Discount *discount = 0;

        if (ndiscount[useorder]) {
            if (debug) cerr << "using NaturalDiscount for " << i << "-grams";
            discount = new NaturalDiscount(gtmin[useorder]);
            assert(discount);
        } else if (wbdiscount[useorder]) {
            if (debug) cerr << "using WittenBell for " << i << "-grams";
            discount = new WittenBell(gtmin[useorder]);
            assert(discount);
        } else if (cdiscount[useorder] != -1.0) {
            if (debug) cerr << "using ConstDiscount for " << i << "-grams";
            discount = new ConstDiscount(cdiscount[useorder], gtmin[useorder]);
            assert(discount);
        } else if (ukndiscount[useorder]) {
            if (debug) cerr << "using KneserNey for " << i << "-grams";
            discount = new KneserNey(gtmin[useorder], knCountsModified,
                                     knCountsModifyAtEnd);
            assert(discount);
        } else if (knFile[useorder] || kndiscount[useorder]) {
            if (debug) cerr << "using ModKneserNey for " << i << "-grams";
            discount = new ModKneserNey(gtmin[useorder], knCountsModified,
                                        knCountsModifyAtEnd);
            assert(discount);
        } else if (gtFile[useorder] || (i <= order && lmFile)) {
            if (debug) cerr << "using GoodTuring for " << i << "-grams";
            discount = new GoodTuring(gtmin[useorder], gtmax[useorder]);
            assert(discount);
        }
        if (debug) cerr << endl;
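        /*
         * Summary of the dispatch above (flag -> Discount subclass):
         * -ndiscount -> NaturalDiscount, -wbdiscount -> WittenBell,
         * -cdiscount -> ConstDiscount, -ukndiscount -> KneserNey (original),
         * -kndiscount -> ModKneserNey (modified KN); Good-Turing is the
         * fallback whenever an LM is to be estimated and nothing else was
         * chosen.
         */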
        /*
         * Now read in, or estimate, the discounting parameters.
         * Also write them out if no language model is being created.
         */
        if (discount) {
            discount->debugme(debug);

            if (interpolate[0] || interpolate[useorder]) {
                discount->interpolate = true;
            }

            if (knFile[useorder] && lmFile) {
                File file(knFile[useorder], "r");

                if (!discount->read(file)) {
                    cerr << "error reading discount parameter file "
                         << knFile[useorder] << endl;
                    exit(1);
                }
            } else if (gtFile[useorder] && lmFile) {
                File file(gtFile[useorder], "r");

                if (!discount->read(file)) {
                    cerr << "error reading discount parameter file "
                         << gtFile[useorder] << endl;
                    exit(1);
                }
            } else {
                /*
                 * Estimate discount params, and write them out only if
                 * a file was specified but no language model is
                 * being estimated.
                 */
                if (!(useFloatCounts ? discount->estimate(*floatStats, i) :
                                       discount->estimate(*intStats, i)))
                {
                    cerr << "error in discount estimator for order "
                         << i << endl;
                    exit(1);
                }

                if (knFile[useorder]) {
                    File file(knFile[useorder], "w");
                    discount->write(file);
                    written = true;
                } else if (gtFile[useorder]) {
                    File file(gtFile[useorder], "w");
                    discount->write(file);
                    written = true;
                }
            }

            discounts[i-1] = discount;
        }
    }
    /*
     * Estimate a new model from the existing counts,
     * either using a default discounting scheme or the discounting
     * parameters read in from files.
     */
    if (lmFile) {
        Ngram *lm;

        if (varPrune != 0.0) {
            lm = new VarNgram(*vocab, order, varPrune);
            assert(lm != 0);
        } else if (skipNgram) {
            SkipNgram *skipLM = new SkipNgram(*vocab, order);
            assert(skipLM != 0);

            skipLM->maxEMiters = maxEMiters;
            skipLM->minEMdelta = minEMdelta;
            skipLM->initialSkipProb = skipInit;

            lm = skipLM;
        } else {
            lm = (stopWords != 0) ? new StopNgram(*vocab, *stopWords, order) :
                 tagged ? new TaggedNgram(*(TaggedVocab *)vocab, order) :
                          new Ngram(*vocab, order);
            assert(lm != 0);
        }

        /*
         * Set debug level on LM object
         */
        lm->debugme(debug);

        /*
         * Read initial LM parameters in case we're doing EM
         */
        if (initLMFile) {
            File file(initLMFile, "r");

            if (!lm->read(file)) {
                cerr << "format error in init-lm file\n";
                exit(1);
            }
        }

        if (trustTotals) {
            lm->trustTotals() = true;
        }

        if (!(useFloatCounts ? lm->estimate(*floatStats, discounts) :
                               lm->estimate(*intStats, discounts)))
        {
            cerr << "LM estimation failed\n";
            exit(1);
        } else {
            /*
             * Remove redundant probs (perplexity increase below threshold)
             */
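            /*
             * Rough intuition (not stated in this file): a threshold such as
             * -prune 1e-8 drops any explicit ngram prob whose removal raises
             * the model's training-set perplexity by less than that relative
             * amount, while -minprune exempts ngrams below a given order
             * from pruning.
             */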
            if (prune != 0.0) {
                lm->pruneProbs(prune, minprune);
            }

            File file(lmFile, "w");
            lm->write(file);
        }
        written = true;

        // XXX: don't free the lm since this may itself take a long time
        // and we're going to exit anyway.
#ifdef DEBUG
        delete lm;
#endif
    }

    if (writeVocab) {
        File file(writeVocab, "w");
        vocab->write(file);
        written = true;
    }

    /*
     * If nothing has been written out so far, make dumping the counts
     * the default action.
     *
     * Note: This will write the modified rather than the original counts
     * if KN discounting was used.
     */
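    /*
     * E.g. (hypothetical invocation) running just
     *   ngram-count -text corpus.txt
     * writes no LM and no explicit count files, so `written` stays false
     * and the counts are dumped to stdout ("-") here by default.
     */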
    if (writeFile[0] || !written) {
        File file(writeFile[0] ? writeFile[0] : "-", "w");
        USE_STATS(write(file, writeOrder, sortNgrams));
    }

#ifdef DEBUG
    /*
     * Free all objects
     */
    for (i = 0; i < order; i ++) {
        delete discounts[i];
        discounts[i] = 0;
    }
    delete [] discounts;

    delete intStats;
    delete floatStats;

    if (stopWords != 0) {
        delete stopWords;
    }
    delete vocab;

    return(0);
#endif /* DEBUG */

    exit(0);
}