ngram-count.cc
        USE_STATS(vocab.read(file));
        USE_STATS(openVocab) = false;
    }

    if (stopWordFile) {
        File file(stopWordFile, "r");
        stopWords->read(file);
    }

    if (noneventFile) {
        /*
         * create temporary sub-vocabulary for non-event words
         */
        SubVocab nonEvents(USE_STATS(vocab));

        File file(noneventFile, "r");
        nonEvents.read(file);

        USE_STATS(vocab).addNonEvents(nonEvents);
    }
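    /*
     * Non-event words remain valid *context* tokens but are excluded from
     * the set of predicted events.  A typical (hypothetical) example:
     * listing <s> in the non-event file keeps the sentence-start tag from
     * ever being predicted while still letting it condition following words.
     */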
    if (readFile) {
        File file(readFile, "r");

        if (readWithMincounts) {
            makeArray(unsigned, minCounts, order);

            /* construct min-counts array from -gtNmin options */
            unsigned i;
            for (i = 0; i < order && i < maxorder; i ++) {
                minCounts[i] = gtmin[i + 1];
            }
            for ( ; i < order; i ++) {
                minCounts[i] = gtmin[0];
            }
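            /*
             * Illustration (hypothetical values): with order = 5 and
             * maxorder = 3, the loops above fill
             *   minCounts = { gtmin[1], gtmin[2], gtmin[3], gtmin[0], gtmin[0] },
             * i.e. orders 1..3 use their per-order -gtNmin thresholds and
             * any higher orders fall back to the gtmin[0] default.
             */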
            USE_STATS(readMinCounts(file, order, minCounts));
        } else {
            USE_STATS(read(file));
        }
    }

    if (textFile) {
        File file(textFile, "r");
        USE_STATS(countFile(file));
    }

    if (memuse) {
        MemStats memuse;

        USE_STATS(memStats(memuse));
        memuse.print();
    }

    if (recompute) {
        if (useFloatCounts)
            floatStats->sumCounts(order);
        else
            intStats->sumCounts(order);
    }

    unsigned int i;

    for (i = 1; i <= maxorder; i++) {
        if (writeFile[i]) {
            File file(writeFile[i], "w");

            USE_STATS(write(file, i, sortNgrams));
            written = true;
        }
    }

    /*
     * While ngrams themselves can have order 0 (they will always be empty)
     * we need order >= 1 for LM estimation.
     */
    if (order == 0) {
        cerr << "LM order must be positive -- set to 1\n";
        order = 1;
    }
    /*
     * This stores the discounting parameters for the various orders.
     * Note this is only needed when estimating an LM.
     */
    Discount **discounts = new Discount *[order];
    assert(discounts != 0);

    for (i = 0; i < order; i ++) {
        discounts[i] = 0;
    }

    /*
     * Estimate discounting parameters.
     * Note this is only required if
     *  - the user wants them written to a file, or
     *  - we want to estimate an LM later.
     */
    for (i = 1; i <= order; i++) {
        /*
         * Detect inconsistent options for this order
         */
        if (i <= maxorder &&
            ndiscount[i] + wbdiscount[i] + (cdiscount[i] != -1.0) +
            ukndiscount[i] + (knFile[i] != 0 || kndiscount[i]) +
            (gtFile[i] != 0) > 1)
        {
            cerr << "conflicting discounting options for order " << i << endl;
            exit(2);
        }
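        /*
         * For instance (hypothetical flags), specifying both -wbdiscount2
         * and -kndiscount2 makes two of the boolean terms above nonzero
         * for i == 2, so their sum exceeds 1 and we bail out.
         */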
        /*
         * Inherit default discounting method where needed
         */
        if (i <= maxorder &&
            !ndiscount[i] && !wbdiscount[i] && cdiscount[i] == -1.0 &&
            !ukndiscount[i] && knFile[i] == 0 && !kndiscount[i] &&
            gtFile[i] == 0)
        {
            if (ndiscount[0]) ndiscount[i] = ndiscount[0];
            else if (wbdiscount[0]) wbdiscount[i] = wbdiscount[0];
            else if (cdiscount[0] != -1.0) cdiscount[i] = cdiscount[0];
            else if (ukndiscount[0]) ukndiscount[i] = ukndiscount[0];
            else if (kndiscount[0]) kndiscount[i] = kndiscount[0];

            if (knFile[0] != 0) knFile[i] = knFile[0];
            else if (gtFile[0] != 0) gtFile[i] = gtFile[0];
        }
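        /*
         * Example (hypothetical invocation): with a plain -kndiscount and no
         * order-specific flags, only kndiscount[0] is set, so every order i
         * without its own method inherits it here and ends up using
         * modified Kneser-Ney below.
         */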
        /*
         * Choose the discounting method to use.
         *
         * Also, check for any discounting parameter files.
         * These have a dual interpretation:
         * if we're not estimating a new LM, simply WRITE the parameters
         * out; otherwise try to READ them from these files.
         *
         * Note: Test for ukndiscount[] before knFile[] so that combined use
         * of -ukndiscountN and -knfileN will do the right thing.
         */
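        /*
         * Sketch of the two modes (hypothetical file names):
         *
         *   ngram-count -read counts -kn2 kn2.params            # no -lm:
         *       estimates the order-2 discounts and WRITEs kn2.params
         *   ngram-count -read counts -kn2 kn2.params -lm out.lm # with -lm:
         *       READs the discounts back from kn2.params
         */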
        unsigned useorder = (i > maxorder) ? 0 : i;
        Discount *discount = 0;

        if (ndiscount[useorder]) {
            if (debug) cerr << "using NaturalDiscount for " << i << "-grams";
            discount = new NaturalDiscount(gtmin[useorder]);
            assert(discount);
        } else if (wbdiscount[useorder]) {
            if (debug) cerr << "using WittenBell for " << i << "-grams";
            discount = new WittenBell(gtmin[useorder]);
            assert(discount);
        } else if (cdiscount[useorder] != -1.0) {
            if (debug) cerr << "using ConstDiscount for " << i << "-grams";
            discount = new ConstDiscount(cdiscount[useorder], gtmin[useorder]);
            assert(discount);
        } else if (ukndiscount[useorder]) {
            if (debug) cerr << "using KneserNey for " << i << "-grams";
            discount = new KneserNey(gtmin[useorder], knCountsModified,
                                     knCountsModifyAtEnd);
            assert(discount);
        } else if (knFile[useorder] || kndiscount[useorder]) {
            if (debug) cerr << "using ModKneserNey for " << i << "-grams";
            discount = new ModKneserNey(gtmin[useorder], knCountsModified,
                                        knCountsModifyAtEnd);
            assert(discount);
        } else if (gtFile[useorder] || (i <= order && lmFile)) {
            if (debug) cerr << "using GoodTuring for " << i << "-grams";
            discount = new GoodTuring(gtmin[useorder], gtmax[useorder]);
            assert(discount);
        }
        if (debug) cerr << endl;
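        /*
         * Summary of the dispatch above (flag -> Discount subclass):
         * -ndiscount -> NaturalDiscount, -wbdiscount -> WittenBell,
         * -cdiscount -> ConstDiscount, -ukndiscount -> KneserNey (original),
         * -kndiscount -> ModKneserNey (modified KN); Good-Turing is the
         * fallback whenever an LM is to be estimated and nothing else was
         * chosen.
         */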
        /*
         * Now read in, or estimate, the discounting parameters.
         * Also write them out if no language model is being created.
         */
        if (discount) {
            discount->debugme(debug);

            if (interpolate[0] || interpolate[useorder]) {
                discount->interpolate = true;
            }

            if (knFile[useorder] && lmFile) {
                File file(knFile[useorder], "r");

                if (!discount->read(file)) {
                    cerr << "error reading discount parameter file "
                         << knFile[useorder] << endl;
                    exit(1);
                }
            } else if (gtFile[useorder] && lmFile) {
                File file(gtFile[useorder], "r");

                if (!discount->read(file)) {
                    cerr << "error reading discount parameter file "
                         << gtFile[useorder] << endl;
                    exit(1);
                }
            } else {
                /*
                 * Estimate discount params, and write them out only if
                 * a file was specified but no language model is
                 * being estimated.
                 */
                if (!(useFloatCounts ? discount->estimate(*floatStats, i) :
                                       discount->estimate(*intStats, i)))
                {
                    cerr << "error in discount estimator for order "
                         << i << endl;
                    exit(1);
                }

                if (knFile[useorder]) {
                    File file(knFile[useorder], "w");
                    discount->write(file);
                    written = true;
                } else if (gtFile[useorder]) {
                    File file(gtFile[useorder], "w");
                    discount->write(file);
                    written = true;
                }
            }

            discounts[i-1] = discount;
        }
    }
    /*
     * Estimate a new model from the existing counts,
     * either using a default discounting scheme or the discounting
     * parameters read in from files.
     */
    if (lmFile) {
        Ngram *lm;

        if (varPrune != 0.0) {
            lm = new VarNgram(*vocab, order, varPrune);
            assert(lm != 0);
        } else if (skipNgram) {
            SkipNgram *skipLM = new SkipNgram(*vocab, order);
            assert(skipLM != 0);

            skipLM->maxEMiters = maxEMiters;
            skipLM->minEMdelta = minEMdelta;
            skipLM->initialSkipProb = skipInit;

            lm = skipLM;
        } else {
            lm = (stopWords != 0) ? new StopNgram(*vocab, *stopWords, order) :
                 tagged ? new TaggedNgram(*(TaggedVocab *)vocab, order) :
                          new Ngram(*vocab, order);
            assert(lm != 0);
        }

        /*
         * Set debug level on LM object
         */
        lm->debugme(debug);

        /*
         * Read initial LM parameters in case we're doing EM
         */
        if (initLMFile) {
            File file(initLMFile, "r");

            if (!lm->read(file)) {
                cerr << "format error in init-lm file\n";
                exit(1);
            }
        }

        if (trustTotals) {
            lm->trustTotals() = true;
        }

        if (!(useFloatCounts ? lm->estimate(*floatStats, discounts) :
                               lm->estimate(*intStats, discounts)))
        {
            cerr << "LM estimation failed\n";
            exit(1);
        } else {
            /*
             * Remove redundant probs (perplexity increase below threshold)
             */
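            /*
             * Rough intuition (not stated in this file): a threshold such as
             * -prune 1e-8 drops any explicit ngram prob whose removal raises
             * the model's training-set perplexity by less than that relative
             * amount, while -minprune exempts ngrams below a given order
             * from pruning.
             */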
            if (prune != 0.0) {
                lm->pruneProbs(prune, minprune);
            }

            File file(lmFile, "w");
            lm->write(file);
        }
        written = true;

        // XXX: don't free the lm since this may itself take a long time
        // and we're going to exit anyway.
#ifdef DEBUG
        delete lm;
#endif
    }

    if (writeVocab) {
        File file(writeVocab, "w");
        vocab->write(file);
        written = true;
    }

    /*
     * If nothing has been written out so far, make dumping the counts
     * the default action.
     *
     * Note: This will write the modified rather than the original counts
     * if KN discounting was used.
     */
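    /*
     * E.g. (hypothetical invocation) running just
     *   ngram-count -text corpus.txt
     * writes no LM and no explicit count files, so `written` stays false
     * and the counts are dumped to stdout ("-") here by default.
     */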
    if (writeFile[0] || !written) {
        File file(writeFile[0] ? writeFile[0] : "-", "w");
        USE_STATS(write(file, writeOrder, sortNgrams));
    }

#ifdef DEBUG
    /*
     * Free all objects
     */
    for (i = 0; i < order; i ++) {
        delete discounts[i];
        discounts[i] = 0;
    }
    delete [] discounts;

    delete intStats;
    delete floatStats;

    if (stopWords != 0) {
        delete stopWords;
    }
    delete vocab;

    return(0);
#endif /* DEBUG */

    exit(0);
}