ngramlm.cc
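/*
 * This excerpt covers the tail of the ARPA-format reader (parsing the
 * \N-grams: sections through the closing \end\ marker), followed by
 * writeWithOrder(), numNgrams(), and the Good-Turing-based estimation
 * routines of the Ngram class.
 */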
file.position() << "unexpected input\n";
return false;
}
default: /* reading n-grams, where n == state */
if (backslash) {
if (numOOVs > 0) {
if (debug(DEBUG_READ_STATS)) {
dout() << "discarded " << numOOVs
<< " OOV " << state << "-grams\n";
}
numOOVs = 0;
}
}
if (backslash && sscanf(line, "\\%d-grams", &state) == 1) {
if (state < 1 || state > (int)maxOrder) {
file.position() << "invalid ngram order " << state << "\n";
return false;
}
if (debug(DEBUG_READ_STATS)) {
dout() << (state <= (int)order ? "reading " : "skipping ")
<< numNgrams[state] << " "
<< state << "-grams\n";
}
/*
* start reading more n-grams
*/
continue;
} else if (backslash && strncmp(line, "\\end\\", 5) == 0) {
/*
* Check that the total number of ngrams read matches
* that found in the header
*/
for (int i = 0; i <= (int)maxOrder && i <= (int)order; i++) {
if (numNgrams[i] != numRead[i]) {
file.position() << "warning: " << numRead[i] << " "
<< i << "-grams read, expected "
<< numNgrams[i] << "\n";
}
}
return true;
} else if (state > (int)order) {
/*
* Save time and memory by skipping ngrams outside
* the order range of this model
*/
continue;
} else {
VocabString words[1 + maxNgramOrder + 1 + 1];
/* result of parsing an n-gram line
* the first and last elements are actually
* numerical parameters, but so what? */
VocabIndex wids[maxNgramOrder + 1];
/* ngram translated to word indices */
LogP prob, bow; /* probability and back-off-weight */
/*
* Parse a line of the form
* <prob> <w1> <w2> ... <bow>
*/
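/*
 * For example (made-up values), a bigram entry might read
 *     -1.2041	in the	-0.3010
 * with the trailing back-off weight present only when the ngram
 * also serves as a context for higher-order ngrams.
 */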
unsigned howmany = Vocab::parseWords(line, words, state + 3);
if ((int)howmany < state + 1 || (int)howmany > state + 2) {
file.position() << "ngram line has " << howmany
<< " fields (" << state + 2
<< " expected)\n";
return false;
}
/*
* Parse prob
*/
if (!parseLogP(words[0], prob)) {
file.position() << "bad prob \"" << words[0] << "\"\n";
return false;
} else if (prob > LogP_One || prob != prob) {
file.position() << "warning: questionable prob \""
<< words[0] << "\"\n";
} else if (prob == LogP_PseudoZero) {
/*
* convert pseudo-zeros back into real zeros
*/
prob = LogP_Zero;
}
/*
* Parse bow, if any
*/
if ((int)howmany == state + 2) {
/*
* Parsing floats strings is the most time-consuming
* part of reading in backoff models. We therefore
* try to avoid parsing bows where they are useless,
* i.e., for contexts that are longer than what this
* model uses. We also do a quick sanity check to
* warn about non-zero bows in that position.
*/
if (state == (int)maxOrder) {
if (words[state + 1][0] != '0') {
file.position() << "ignoring non-zero bow \""
<< words[state + 1]
<< "\" for maximal ngram\n";
}
} else if (state == (int)order) {
/*
* save time and memory by skipping bows that will
* never be used as a result of higher-order ngram
* skipping
*/
;
} else if (!parseLogP(words[state + 1], bow)) {
file.position() << "bad bow \"" << words[state + 1]
<< "\"\n";
return false;
} else if (bow == LogP_Inf || bow != bow) {
file.position() << "warning: questionable bow \""
<< words[state + 1] << "\"\n";
} else if (bow == LogP_PseudoZero) {
/*
* convert pseudo-zeros back into real zeros
*/
bow = LogP_Zero;
}
}
numRead[state] ++;
/*
* Terminate the words array after the last word,
* then translate it to word indices. We also
* reverse the ngram since that's how we'll need it
* to index the trie.
*/
words[state + 1] = 0;
if (limitVocab) {
/*
* Skip N-gram if it contains OOV word.
* Do NOT add new words to the vocabulary.
*/
vocab.getIndices(&words[1], wids, maxNgramOrder);
Boolean haveOOV = false;
for (unsigned j = 0; j < (unsigned)state; j ++) {
if (wids[j] == Vocab_None) {
haveOOV = true;
break;
}
}
if (haveOOV) {
numOOVs ++;
/* skip rest of processing and go to next ngram */
continue;
}
} else {
vocab.addWords(&words[1], wids, maxNgramOrder);
}
Vocab::reverse(wids);
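/*
 * Illustration: for a 3-gram "w1 w2 w3", wids now holds
 * [w3, w2, w1] (Vocab_None-terminated), so wids[0] is the
 * predicted word and &wids[1] is the reversed context used
 * to look up the trie node below.
 */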
/*
* Store bow, if any
*/
if ((int)howmany == state + 2 && state < (int)order) {
*insertBOW(wids) = bow;
}
/*
* Save the last word (which is now the first, due to reversal)
* then use the first n-1 to index into
* the context trie, storing the prob.
*/
BOnode *bonode = contexts.find(&wids[1]);
if (!bonode) {
file.position() << "warning: no bow for prefix of ngram \""
<< &words[1] << "\"\n";
} else {
if (!warnedAboutUnk &&
wids[0] == vocab.unkIndex() &&
prob != LogP_Zero &&
!vocab.unkIsWord())
{
file.position() << "warning: non-zero probability for "
<< vocab.getWord(vocab.unkIndex())
<< " in closed-vocabulary LM\n";
warnedAboutUnk = true;
}
/* efficient for: *insertProb(wids[0], &wids[1]) = prob */
*bonode->probs.insert(wids[0]) = prob;
}
/*
* Hey, we're done with this ngram!
*/
}
}
}
/*
* we reached a premature EOF
*/
file.position() << "reached EOF before \\end\\\n";
return false;
}
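/*
 * Write the model in ARPA backoff format, limited to ngrams of the
 * given order: a \data\ header listing ngram counts, one \N-grams:
 * section per order (prob, words, optional bow per line), and a
 * final \end\ marker.
 */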
void
Ngram::writeWithOrder(File &file, unsigned order)
{
unsigned i;
unsigned howmanyNgrams[maxNgramOrder + 1];
VocabIndex context[maxNgramOrder + 2];
VocabString scontext[maxNgramOrder + 1];
if (order > maxNgramOrder) {
order = maxNgramOrder;
}
fprintf(file, "\n\\data\\\n");
for (i = 1; i <= order; i++ ) {
howmanyNgrams[i] = numNgrams(i);
fprintf(file, "ngram %d=%d\n", i, howmanyNgrams[i]);
}
for (i = 1; i <= order; i++ ) {
fprintf(file, "\n\\%d-grams:\n", i);
if (debug(DEBUG_WRITE_STATS)) {
dout() << "writing " << howmanyNgrams[i] << " "
<< i << "-grams\n";
}
NgramBOsIter iter(*this, context + 1, i - 1, vocab.compareIndex());
BOnode *node;
while ((node = iter.next()) != 0) {
vocab.getWords(context + 1, scontext, maxNgramOrder + 1);
Vocab::reverse(scontext);
NgramProbsIter piter(*node, vocab.compareIndex());
VocabIndex pword;
LogP *prob;
while ((prob = piter.next(pword)) != 0) {
if (file.error()) {
return;
}
fprintf(file, "%.*lg\t", LogP_Precision,
(double)(*prob == LogP_Zero ?
LogP_PseudoZero : *prob));
Vocab::write(file, scontext);
fprintf(file, "%s%s", (i > 1 ? " " : ""), vocab.getWord(pword));
if (i < order) {
context[0] = pword;
LogP *bow = findBOW(context);
if (bow) {
fprintf(file, "\t%.*lg", LogP_Precision,
(double)(*bow == LogP_Zero ?
LogP_PseudoZero : *bow));
}
/* Added by zgchen 2006-7-27: write an explicit zero bow when no back-off weight is stored */
else {
    fprintf(file, "\t%.*lg", LogP_Precision, (double)0);
}
}
fprintf(file, "\n");
}
}
}
fprintf(file, "\n\\end\\\n");
}
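/*
 * Count the ngrams of a given order stored in the model, by enumerating
 * all (order-1)-word contexts and summing the probability entries under
 * each one.
 */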
unsigned int
Ngram::numNgrams(unsigned int order)
{
if (order < 1) {
return 0;
} else {
unsigned int howmany = 0;
makeArray(VocabIndex, context, order + 1);
NgramBOsIter iter(*this, context, order - 1);
BOnode *node;
while ((node = iter.next()) != 0) {
howmany += node->probs.numEntries();
}
return howmany;
}
}
/*
* Estimation
*/
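/*
 * Convenience entry point: construct one Good-Turing discount per ngram
 * order (using the supplied min/max counts, or the GT defaults) and
 * delegate to estimate(stats, discounts).
 */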
Boolean
Ngram::estimate(NgramStats &stats, unsigned *mincounts, unsigned *maxcounts)
{
/*
* If no discount method was specified we do the default, standard
* thing. Good Turing discounting with the specified min and max counts
* for all orders.
*/
Discount **discounts = new Discount *[order];
assert(discounts != 0);
unsigned i;
for (i = 0; i < order; i++) {
    discounts[i] = 0;	/* so the cleanup below stays safe if estimation fails early */
}
Boolean error = false;
for (i = 1; !error && i <= order; i++) {
discounts[i-1] =
new GoodTuring(mincounts ? mincounts[i-1] : GT_defaultMinCount,
maxcounts ? maxcounts[i-1] : GT_defaultMaxCount);
/*
* Transfer the LMStats's debug level to the newly
* created discount objects
*/
discounts[i-1]->debugme(stats.debuglevel());
if (!discounts[i-1]->estimate(stats, i)) {
cerr << "failed to estimate GT discount for order " << i + 1
<< endl;
error = true;
} else if (debug(DEBUG_PRINT_GTPARAMS)) {
dout() << "Good Turing parameters for " << i << "-grams:\n";
File errfile(stderr);
discounts[i-1]->write(errfile);
}
}
if (!error) {
error = !estimate(stats, discounts);
}
for (i = 1; i <= order; i++) {
delete discounts[i-1];
}
delete [] discounts;
return !error;
}
/*
* Count number of vocabulary items that get probability mass
*/
unsigned
Ngram::vocabSize()
{
unsigned numWords = 0;
VocabIter viter(vocab);
VocabIndex word;
while (viter.next(word)) {
if (!vocab.isNonEvent(word) && !vocab.isMetaTag(word)) {
numWords ++;
}
}
return numWords;
}
/*
* Generic version of estimate(NgramStats, Discount)
* and estimate(NgramCounts<FloatCount>, Discount)
*
* XXX: This function is essentially duplicated in other places.
* Propagate changes to VarNgram::estimate().
*/
template <class CountType>
Boolean
Ngram::estimate2(NgramCounts<CountType> &stats, Discount **discounts)
{
/*
* For all ngrams, compute probabilities and apply the discount
* coefficients.
*/
makeArray(VocabIndex, context, order);
unsigned vocabSize = Ngram::vocabSize();
/*
* Remove all old contexts
*/
clear();
/*
* Ensure <s> unigram exists (being a non-event, it is not inserted
* in distributeProb(), yet is assumed by much other software).
*/
if (vocab.ssIndex() != Vocab_None) {
context[0] = Vocab_None;
*insertProb(vocab.ssIndex(), context) = LogP_Zero;
}
for (unsigned i = 1; i <= order; i++) {
unsigned noneventContexts = 0;
unsigned noneventNgrams = 0;
unsigned discountedNgrams = 0;
/*
* check if discounting is disabled for this round
*/
Boolean noDiscount =
(discounts == 0) ||
(discounts[i-1] == 0) ||
discounts[i-1]->nodiscount();
Boolean interpolate =
(discounts != 0) &&
(discounts[i-1] != 0) &&
discounts[i-1]->interpolate;
/*
* modify counts as required by the discounting method
*/
if (!noDiscount && discounts && discounts[i-1]) {
discounts[i-1]->prepareCounts(stats, i, order);
}
/*
* This enumerates all contexts, i.e., i-1 grams.
*/
CountType *contextCount;
NgramCountsIter<CountType> contextIter(stats, context, i-1);
while ((contextCount = contextIter.next()) != 0) {
/*
* Skip contexts ending in </s>. This typically only occurs
* with the doubling of </s> to generate trigrams from
* bigrams ending in </s>.
* If <unk> is not a real word, also skip contexts that
* contain it.
*/
if ((i > 1 && context[i-2] == vocab.seIndex()) ||
    (vocab.isNonEvent(vocab.unkIndex()) &&
     vocab.contains(context, vocab.unkIndex())))
{
noneventContexts ++;
continue;
}
VocabIndex word[2]; /* the following word */
NgramCountsIter<CountType> followIter(stats, context, word, 1);
CountType *ngramCount;
/*
* Total up the counts for the denominator
* (the lower-order counts may not be consistent with
* the higher-order ones, so we can't just use *contextCount)
* Only if the trustTotal flag is set do we override this
* with the count from the context ngram.
*/