📄 ngramstats.cc
/*
 * NgramStats.cc --
 *	N-gram counting
 *
 */

#ifndef _NgramStats_cc_
#define _NgramStats_cc_

#ifndef lint
static char NgramStats_Copyright[] = "Copyright (c) 1995-2006 SRI International. All Rights Reserved.";
static char NgramStats_RcsId[] = "@(#)$Header: /home/srilm/devel/lm/src/RCS/NgramStats.cc,v 1.32 2006/01/05 20:21:27 stolcke Exp $";
#endif

#include <iostream>
using namespace std;
#include <string.h>

const unsigned maxLineLength = 10000;

#include "NgramStats.h"

#include "Trie.cc"
#include "LHash.cc"
#include "Array.cc"

#define INSTANTIATE_NGRAMCOUNTS(CountT) \
	INSTANTIATE_TRIE(VocabIndex,CountT); \
	template class NgramCounts<CountT>

template <class CountT>
void
NgramCounts<CountT>::dump()
{
    cerr << "order = " << order << endl;
    counts.dump();
}

template <class CountT>
void
NgramCounts<CountT>::memStats(MemStats &stats)
{
    stats.total += sizeof(*this) - sizeof(counts) - sizeof(vocab);
    vocab.memStats(stats);
    counts.memStats(stats);
}

template <class CountT>
NgramCounts<CountT>::NgramCounts(Vocab &vocab, unsigned int maxOrder)
    : LMStats(vocab), order(maxOrder)
{
}

template <class CountT>
unsigned
NgramCounts<CountT>::countSentence(const VocabString *words, CountT factor)
{
    static VocabIndex wids[maxWordsPerLine + 3];
    unsigned int howmany;

    if (openVocab) {
	howmany = vocab.addWords(words, wids + 1, maxWordsPerLine + 1);
    } else {
	howmany = vocab.getIndices(words, wids + 1, maxWordsPerLine + 1,
							vocab.unkIndex());
    }

    /*
     * Check for buffer overflow
     */
    if (howmany == maxWordsPerLine + 1) {
	return 0;
    }

    /*
     * update OOV count
     */
    if (!openVocab) {
	for (unsigned i = 1; i <= howmany; i++) {
	    if (wids[i] == vocab.unkIndex()) {
		stats.numOOVs ++;
	    }
	}
    }

    /*
     * Insert begin/end sentence tokens if necessary
     */
    VocabIndex *start;

    /*
     * Changed here as needed for our training: if the sentence already
     * begins or ends with <s> or </s>, do not add the token.
     * 2006-7-26, by zgchen
     */
    //if (wids[1] == vocab.ssIndex()) {
    if (wids[1] == vocab.ssIndex() || wids[1] == vocab.seIndex()) {
	start = wids + 1;
    } else {
	wids[0] = vocab.ssIndex();
	start = wids;
    }
    //if (wids[howmany] != vocab.seIndex()) {
    if (wids[howmany] != vocab.seIndex() && wids[howmany] != vocab.ssIndex()) {
	wids[howmany + 1] = vocab.seIndex();
	wids[howmany + 2] = Vocab_None;
    }

    return countSentence(start, factor);
}

/*
 * Increment counts indexed by words, starting at node.
 */
template <class CountT>
void
NgramCounts<CountT>::incrementCounts(const VocabIndex *words,
					unsigned minOrder, CountT factor)
{
    NgramNode *node = &counts;

    for (int i = 0; i < (int)order; i++) {
	VocabIndex wid = words[i];

	/*
	 * check for end-of-sentence
	 */
	if (wid == Vocab_None) {
	    break;
	} else {
	    node = node->insertTrie(wid);
	    if ((unsigned int)(i + 1) >= minOrder) {
		node->value() += factor;
	    }
	}
    }
}

template <class CountT>
unsigned
NgramCounts<CountT>::countSentence(const VocabIndex *words, CountT factor)
{
    unsigned int start;

    for (start = 0; words[start] != Vocab_None; start++) {
	incrementCounts(words + start, 1, factor);
    }

    /*
     * keep track of word and sentence counts
     */
    /* same modification here as above, by zgchen, 2006-7-27 */
    stats.numWords += start;
    //if (words[0] == vocab.ssIndex()) {
    if (words[0] == vocab.ssIndex() || words[0] == vocab.seIndex()) {
	stats.numWords --;
    }
    //if (start > 0 && words[start-1] == vocab.seIndex()) {
    if (start > 0 && (words[start-1] == vocab.seIndex() ||
				words[start-1] == vocab.ssIndex())) {
	stats.numWords --;
    }

    stats.numSentences ++;

    return start;
}

/*
 * Type-dependent count <--> string conversions
 */
//#ifdef INSTANTIATE_TEMPLATES
static
//#endif
char ctsBuffer[100];

template <class CountT>
static inline const char *
countToString(CountT count)
{
    sprintf(ctsBuffer, "%lg", (double)count);
    return ctsBuffer;
}

static inline const char *
countToString(unsigned count)
{
    sprintf(ctsBuffer, "%u", count);
    return ctsBuffer;
}

static inline const char *
countToString(int count)
{
    sprintf(ctsBuffer, "%d", count);
    return ctsBuffer;
}

template <class CountT>
static inline Boolean
stringToCount(const char *str, CountT &count)
{
    double x;
    if (sscanf(str, "%lf", &x) == 1) {
	count = (CountT)x;
	return true;
    } else {
	return false;
    }
}

static inline Boolean
stringToCount(const char *str, unsigned int &count)
{
    /*
     * scanf("%u") doesn't check for a positive sign, so we have to check ourselves.
     */
    return (*str != '-' && sscanf(str, "%u", &count) == 1);
}

static inline Boolean
stringToCount(const char *str, unsigned short &count)
{
    /*
     * scanf("%u") doesn't check for a positive sign, so we have to check ourselves.
     */
    return (*str != '-' && sscanf(str, "%hu", &count) == 1);
}

static inline Boolean
stringToCount(const char *str, XCount &count)
{
    unsigned x;
    if (stringToCount(str, x)) {
	count = x;
	return true;
    } else {
	return false;
    }
}

static inline Boolean
stringToCount(const char *str, int &count)
{
    return (sscanf(str, "%d", &count) == 1);
}

template <class CountT>
unsigned int
NgramCounts<CountT>::parseNgram(char *line, VocabString *words,
					unsigned int max, CountT &count)
{
    unsigned howmany = Vocab::parseWords(line, words, max);

    if (howmany == max) {
	return 0;
    }

    /*
     * Parse the last word as a count
     */
    if (!stringToCount(words[howmany - 1], count)) {
	return 0;
    }

    howmany --;
    words[howmany] = 0;

    return howmany;
}

template <class CountT>
unsigned int
NgramCounts<CountT>::readNgram(File &file, VocabString *words,
					unsigned int max, CountT &count)
{
    char *line;

    /*
     * Read next ngram count from file, skipping blank lines
     */
    line = file.getline();
    if (line == 0) {
	return 0;
    }

    unsigned howmany = parseNgram(line, words, max, count);

    if (howmany == 0) {
	file.position() << "malformed N-gram count or more than "
			<< max - 1 << " words per line\n";
	return 0;
    }

    return howmany;
}

template <class CountT>
Boolean
NgramCounts<CountT>::read(File &file, unsigned int order)
{
    VocabString words[maxNgramOrder + 1];
    VocabIndex wids[maxNgramOrder + 1];
    CountT count;
    unsigned int howmany;

    while (howmany = readNgram(file, words, maxNgramOrder + 1, count)) {
	/*
	 * Skip this entry if the length of the ngram exceeds our
	 * maximum order
	 */
	if (howmany > order) {
	    continue;
	}
	/*
	 * Map words to indices
	 */
	if (openVocab) {
	    vocab.addWords(words, wids, maxNgramOrder);
	} else {
	    vocab.getIndices(words, wids, maxNgramOrder, vocab.unkIndex());
	}

	/*
	 * Update the count
	 */
	*counts.insert(wids) += count;
    }
    /*
     * XXX: always return true for now, should return false if there was
     * a format error in the input file.
     */
    return true;
}

/*
 * Helper function to record a set of meta-counts at a given prefix into
 * the ngram tree
 */
template <class CountT>
void
NgramCounts<CountT>::addCounts(const VocabIndex *prefix,
				const LHash<VocabIndex, CountT> &set)
{
    NgramNode *node = counts.insertTrie(prefix);

    LHashIter<VocabIndex, CountT> setIter(set);
    VocabIndex word;
    CountT *count;
    while (count = setIter.next(word)) {
	*node->insert(word) += *count;
    }
}

/*
 * Read a counts file discarding counts below given minimum counts.
 * Assumes that counts are merged, i.e., ngram order is generated by a
 * pre-order traversal of the ngram tree.
 */
template <class CountT>
Boolean
NgramCounts<CountT>::readMinCounts(File &file, unsigned order,
						unsigned *minCounts)
{
    VocabString words[maxNgramOrder + 1];
    VocabIndex prefix[maxNgramOrder + 1];

    /*
     * Data for tracking deletable prefixes
     */
    LHash<VocabIndex, CountT> *metaCounts;
    metaCounts = new LHash<VocabIndex, CountT>[order];
    assert(metaCounts != 0);

    makeArray(Boolean, haveCounts, order);
    makeArray(VocabIndex *, lastPrefix, order);
    for (unsigned i = 0; i < order; i ++) {
	lastPrefix[i] = new VocabIndex[order + 1];
	assert(lastPrefix[i] != 0);
	lastPrefix[i][0] = Vocab_None;
	haveCounts[i] = false;
    }

    Vocab::compareVocab = 0;	// do comparison by word index

    CountT count;
    unsigned int howmany;

    while (howmany = readNgram(file, words, maxNgramOrder + 1, count)) {
	/*
	 * Skip this entry if the length of the ngram exceeds our
	 * maximum order
	 */
	if (howmany > order) {
	    continue;
	}

	VocabIndex metaTag = Vocab_None;
	Boolean haveRealCount = false;

	/*
	 * Discard counts below mincount threshold
	 */
	if (count < minCounts[howmany - 1]) {
	    if ((unsigned)count == 0) {
		continue;
	    } else {
		/*
		 * See if metatag is defined, and if so, keep going
		 */
		metaTag = vocab.metaTagOfType((unsigned)count);

		if (metaTag == Vocab_None) {
		    continue;
		}
	    }
	}

	/*
	 * Map words to indices (includes both N-gram prefix and last word)
	 */
	if (openVocab) {
	    vocab.addWords(words, prefix, maxNgramOrder);
	} else {
	    vocab.getIndices(words, prefix, maxNgramOrder, vocab.unkIndex());
	}

	/*
	 * Truncate Ngram for prefix comparison
	 */
	VocabIndex lastWord = prefix[howmany - 1];
	prefix[howmany - 1] = Vocab_None;

	/*
	 * If the current ngram prefix differs from the previous one, assume
	 * that we are done with all ngrams of that prefix (this assumption is
	 * true for counts in prefix order).
	 * In this case, output saved meta-counts for the previous prefix,
	 * but only if there also were some real counts -- this saves a lot
	 * of space in the counts tree.
	 */
	if (Vocab::compare(prefix, lastPrefix[howmany-1]) != 0) {
	    if (haveCounts[howmany-1]) {
		addCounts(lastPrefix[howmany-1], metaCounts[howmany-1]);
		haveCounts[howmany-1] = false;
	    }
	    metaCounts[howmany-1].clear();
	    Vocab::copy(lastPrefix[howmany-1], prefix);
	}

	/*
	 * Update the count (either meta or real)
	 */
	if (metaTag != Vocab_None) {
	    *metaCounts[howmany-1].insert(metaTag) += 1;
	} else {
	    *insertCount(prefix, lastWord) += count;
	    haveCounts[howmany-1] = true;
	}
    }

    /*
     * Record final saved meta-counts and deallocate prefix buffers
     */
    for (unsigned i = order; i > 0; i --) {
	if (haveCounts[i-1]) {
	    addCounts(lastPrefix[i-1], metaCounts[i-1]);
	}
	delete [] lastPrefix[i-1];
    }
    delete [] metaCounts;

    /*
     * XXX: always return true for now, should return false if there was
     * a format error in the input file.
     */
    return true;
}

template <class CountT>
unsigned int
NgramCounts<CountT>::writeNgram(File &file, const VocabString *words,
							CountT count)
{
    unsigned int i;

    if (words[0]) {
	fprintf(file, "%s", words[0]);
	for (i = 1; words[i]; i++) {
	    fprintf(file, " %s", words[i]);
	}
    }
    fprintf(file, "\t%s\n", countToString(count));

    return i;
}

/*
 * For reasons of efficiency the write() method doesn't use
 * writeNgram() (yet).  Instead, it fills up a string buffer
 * as it descends the tree recursively.  This avoids having to
 * look up shared prefix words and buffer the corresponding strings
 * repeatedly.
 */
template <class CountT>
void
NgramCounts<CountT>::writeNode(
    NgramNode *node,		/* the trie node we're at */
    File &file,			/* output file */
    char *buffer,		/* output buffer */
    char *bptr,			/* pointer into output buffer */
    unsigned int level,		/* current trie level */
    unsigned int order,		/* target trie level */
    Boolean sorted)		/* produce sorted output */
{
    NgramNode *child;
    VocabIndex wid;

    TrieIter<VocabIndex,CountT> iter(*node, sorted ? vocab.compareIndex() : 0);

    /*
     * Iterate over the child nodes at the current level,
     * appending their word strings to the buffer
     */
    while (!file.error() && (child = iter.next(wid))) {
	VocabString word = vocab.getWord(wid);

	if (word == 0) {
	    cerr << "undefined word index " << wid << "\n";
	    continue;
	}

	unsigned wordLen = strlen(word);

	if (bptr + wordLen + 1 > buffer + maxLineLength) {
	    *bptr = '\0';
	    cerr << "ngram [" << buffer << word << "] exceeds write buffer\n";
	    continue;
	}

	strcpy(bptr, word);

	/*
	 * If this is the final level, print out the ngram and the count.
	 * Otherwise set up another level of recursion.
	 */
	if (order == 0 || level == order) {
	    fprintf(file, "%s\t%s\n", buffer, countToString(child->value()));
	}
	if (order == 0 || level < order) {
	    *(bptr + wordLen) = ' ';
	    writeNode(child, file, buffer, bptr + wordLen + 1, level + 1,
							order, sorted);
	}
    }
}

template <class CountT>
void
NgramCounts<CountT>::write(File &file, unsigned int order, Boolean sorted)
{
    static char buffer[maxLineLength];
    writeNode(&counts, file, buffer, buffer, 1, order, sorted);
}

template <class CountT>
CountT
NgramCounts<CountT>::sumNode(NgramNode *node, unsigned int level,
						unsigned int order)
{
    /*
     * For leaf nodes, or nodes beyond the maximum level we are summing,
     * return their count, leaving it unchanged.
     * For nodes closer to the root, replace their counts with the
     * sum of the counts of all the children.
     */
    if (level > order || node->numEntries() == 0) {
	return node->value();
    } else {
	NgramNode *child;
	TrieIter<VocabIndex,CountT> iter(*node);
	VocabIndex wid;

	CountT sum = 0;

	while (child = iter.next(wid)) {
	    sum += sumNode(child, level + 1, order);
	}
	node->value() = sum;

	return sum;
    }
}

template <class CountT>
CountT
NgramCounts<CountT>::sumCounts(unsigned int order)
{
    return sumNode(&counts, 1, order);
}

/*
 * Prune ngram counts
 */
template <class CountT>
unsigned
NgramCounts<CountT>::pruneCounts(CountT minCount)
{
    unsigned npruned = 0;
    makeArray(VocabIndex, ngram, order + 1);

    for (unsigned i = 1; i <= order; i++) {
	CountT *count;
	NgramCountsIter<CountT> countIter(*this, ngram, i);

	/*
	 * This enumerates all ngrams
	 */
	while (count = countIter.next()) {
	    if (*count < minCount) {
		removeCount(ngram);
		npruned ++;
	    }
	}
    }
    return npruned;
}

#endif /* _NgramStats_cc_ */
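The listing above only defines the counting machinery, so a minimal usage sketch may help readers new to SRILM. It assumes the NgramStats specialization and the Vocab and File classes declared in the accompanying SRILM headers; the main() driver, the sample sentence, and the file name "example.counts" are made up for illustration and are not part of the distribution.

// Minimal sketch: count up to trigrams for one sentence and dump the counts.
// Assumes the SRILM headers and libraries are on the include and link paths.
#include "Vocab.h"
#include "File.h"
#include "NgramStats.h"

int
main()
{
    Vocab vocab;
    NgramStats counts(vocab, 3);            // collect counts up to order 3

    /* one sentence as a 0-terminated array of word strings */
    VocabString sentence[] = { "this", "is", "a", "test", 0 };

    counts.countSentence(sentence, 1);      // <s>/</s> are added if missing

    File out("example.counts", "w");        // hypothetical output file name
    counts.write(out, 0, false);            // order 0 writes all n-gram orders

    return 0;
}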