📄 FNgramStats.cc
/*
 * FNgramStats.cc --
 *	Cross-stream N-gram counting
 *
 * Jeff Bilmes <bilmes@ee.washington.edu>
 * Kevin Duh <duh@ee.washington.edu>
 *
 */

#ifndef _FNgramStats_cc_
#define _FNgramStats_cc_

#ifndef lint
static char FNgramStats_Copyright[] = "Copyright (c) 1995-2006 SRI International. All Rights Reserved.";
static char FNgramStats_RcsId[] = "@(#)$Header: /home/srilm/devel/flm/src/RCS/FNgramStats.cc,v 1.10 2006/01/05 20:21:27 stolcke Exp $";
#endif

#ifndef EXCLUDE_CONTRIB

#include <iostream>
using namespace std;

#include <string.h>
#include <ctype.h>

const unsigned maxLineLength = 10000;

#include "FNgramStats.h"

#include "LHash.cc"
#include "Trie.cc"
#include "Array.cc"
#include "FactoredVocab.h"

#define INSTANTIATE_FNGRAMCOUNTS(CountT) \
    template class FNgramCounts<CountT>

/*
 * Debug levels used
 */
#define DEBUG_VERY_VERBOSE	10
#define DEBUG_WARNINGS		1
#define DEBUG_PRINT_TEXTSTATS	1

static inline unsigned int
numBitsSet(register unsigned u)
{
    register unsigned count = 0;
    while (u) {
        count += (u & 0x1);
        u >>= 1;
    }
    return count;
}

static inline unsigned int
backoffGraphLevel(unsigned numParents, unsigned parentSubSet)
{
    const unsigned int nbs = numBitsSet(parentSubSet);
    assert(numParents >= nbs);
    return (numParents - nbs);
}

/*
 * Return true if, given the mask, we should expand
 * the parentSubset path.
 */
static inline Boolean
expandBackoff(unsigned mask, unsigned parentSubSet)
{
    // true if at least one zero bit in parentSubSet has
    // a 1 in the corresponding position in mask
    return ((~parentSubSet & mask) != 0);
}

template <class CountT>
void
FNgramCounts<CountT>::dump()
{
    fprintf(stderr, "FNgramCounts<CountT>::dump() undefined\n");
    exit(-1);
    // cerr << "order = " << order << endl;
    // counts.dump();
}

template <class CountT>
void
FNgramCounts<CountT>::memStats(MemStats &stats)
{
    // TODO: update this object to reflect the new memory layout.
    // stats.total += sizeof(*this) - sizeof(counts) - sizeof(vocab);
    // vocab.memStats(stats);
    // counts.memStats(stats);
    fprintf(stderr, "FNgramCounts<CountT>::memStats(MemStats &stats) needs to be implemented\n");
}
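/*
 * Illustrative sketch (not part of the original SRILM source): how the
 * helpers above encode the backoff graph.  parentSubSet is a bitmask over
 * a child's parents; backoffGraphLevel() counts dropped parents; and
 * expandBackoff() checks whether the mask licenses dropping any of the
 * remaining parents.  The numbers below are made up for illustration; the
 * block is fenced with #if 0 so it does not affect the build, but it
 * compiles standalone.
 */
#if 0
#include <cstdio>
#include <cassert>

static unsigned toyNumBitsSet(unsigned u)	// same logic as numBitsSet()
{
    unsigned count = 0;
    while (u) { count += (u & 0x1); u >>= 1; }
    return count;
}

int main()
{
    const unsigned numParents = 3;	// hypothetical child with 3 parents
    const unsigned parentSubSet = 0x5;	// 0b101: parents 0 and 2 kept, 1 dropped

    // backoffGraphLevel(): 3 parents - 2 bits set = level 1 in the BG
    unsigned level = numParents - toyNumBitsSet(parentSubSet);
    printf("subset 0x%x sits at BG level %u\n", parentSubSet, level);
    assert(level == 1);

    // expandBackoff(): true iff some dropped (zero) bit is selected by mask
    const unsigned mask = 0x2;		// allow dropping parent 1
    printf("expand path? %d\n", (~parentSubSet & mask) != 0);	// prints 1
    return 0;
}
#endif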
/*
 * FNgramCounts constructor
 */
template <class CountT>
FNgramCounts<CountT>::FNgramCounts(FactoredVocab &vocab,
                                   FNgramSpecs<CountT> &fngs)
    : LMStats(vocab), fnSpecs(fngs),
      virtualBeginSentence(true), virtualEndSentence(true),
      addStartSentenceToken(true), addEndSentenceToken(true),
      vocab(vocab)
{
    LHashIter<VocabString, unsigned> tags(fnSpecs.tagPosition);
    VocabString tag;
    unsigned *pos;
    while ((pos = tags.next(tag)) != NULL) {
        vocab.addTaggedIndices(tag, FNGRAM_WORD_TAG_SEP_STR);
    }
    vocab.createTagSubVocabs(fnSpecs.tagPosition);
}

template <class CountT>
unsigned int
FNgramCounts<CountT>::countSentence(const VocabString *words, CountT factor)
{
    static WordMatrix wordMatrix;
    static WidMatrix widMatrix;
    unsigned int howmany;

    howmany = fnSpecs.loadWordFactors(words, wordMatrix, maxWordsPerLine + 1);

    /*
     * Check for buffer overflow or no data
     */
    if (howmany == maxWordsPerLine + 1 || howmany == 0) {
        return 0;
    }

    ::memset(widMatrix[0], 0, (maxNumParentsPerChild + 1) * sizeof(VocabIndex));
    if (openVocab) {
        for (int i = 0; i < (int)howmany; i++) {
            ::memset(widMatrix[i+1], 0, (maxNumParentsPerChild + 1) * sizeof(VocabIndex));
            vocab.addWords(wordMatrix[i], widMatrix[i+1], maxNumParentsPerChild + 1);
            for (int j = 0; widMatrix[i+1][j] != Vocab_None; j++) {
                ((FactoredVocab *)&vocab)->addTagWord(j, widMatrix[i+1][j]);
            }
        }
    } else {
        for (int i = 0; i < (int)howmany; i++) {
            ::memset(widMatrix[i+1], 0, (maxNumParentsPerChild + 1) * sizeof(VocabIndex));
            vocab.getIndices(wordMatrix[i], widMatrix[i+1], maxNumParentsPerChild + 1,
                             vocab.unkIndex());
            // TODO: while this code is finished for closed vocabularies, we
            // still need to update FactoredVocab::read() before closed-vocab
            // mode will work.
            for (int j = 0; widMatrix[i+1][j] != Vocab_None; j++) {
                if (widMatrix[i+1][j] == vocab.unkIndex()) {
                    // NOTE: since we use the same global index set for all
                    // words even if they're in different factors, you might
                    // think it would be possible for an unknown word in one
                    // factor to be known in another, and thereby give a real
                    // word index for the factor in which that word is
                    // unknown.  We assume here, however, that all tokens in
                    // the word file either carry no tag or the special word
                    // tag "W-" (so are regular words), or have a unique tag
                    // that identifies them.  Therefore, this problem
                    // shouldn't happen.

                    // need to update all LMs that have this index as a child.
                    for (int lm = 0; lm < (int)fnSpecs.fnSpecArray.size(); lm++) {
                        if (j == fnSpecs.fnSpecArray[lm].childPosition)
                            fnSpecs.fnSpecArray[lm].stats.numOOVs++;
                    }
                }
            }
        }
    }

    /*
     * Update the OOV count; we need to do this for all factored
     * models that use the tag as a child.
     */
    if (!openVocab) {
        for (unsigned i = 1; i <= howmany; i++) {
            for (unsigned j = 0; j < fnSpecs.tagPosition.numEntries(); j++) {
                if (widMatrix[i][j] == vocab.unkIndex()) {
                    // TODO: update OOV counts here.
                }
            }
        }
    }

    /*
     * Insert begin/end sentence tokens if necessary
     */
    unsigned start;
    unsigned end;

    if (widMatrix[1][FNGRAM_WORD_TAG_POS] == vocab.ssIndex()) {
        start = 1;
        // TODO: note that any other factors at the sentence-start position
        // will be destroyed here, all being forced to the start-sentence token.
        for (unsigned j = FNGRAM_WORD_TAG_POS + 1; j < fnSpecs.tagPosition.numEntries(); j++) {
            widMatrix[1][j] = vocab.ssIndex();
        }
    } else if (addStartSentenceToken) {
        start = 0;
        widMatrix[0][FNGRAM_WORD_TAG_POS] = vocab.ssIndex();
        for (unsigned j = FNGRAM_WORD_TAG_POS + 1; j < fnSpecs.tagPosition.numEntries(); j++) {
            widMatrix[0][j] = vocab.ssIndex();
        }
    } else {
        // just use what is given in the text file.
        start = 1;
    }

    if (widMatrix[howmany][FNGRAM_WORD_TAG_POS] != vocab.seIndex()
        && addEndSentenceToken) {
        for (unsigned j = FNGRAM_WORD_TAG_POS; j < fnSpecs.tagPosition.numEntries(); j++) {
            widMatrix[howmany+1][j] = vocab.seIndex();
            widMatrix[howmany+2][j] = Vocab_None;
        }
        end = howmany + 1;
    } else {
        // this was not done above, so we do it here.
        for (unsigned j = 0; j < fnSpecs.tagPosition.numEntries(); j++) {
            widMatrix[howmany+1][j] = Vocab_None;
        }
        end = howmany;
    }

    return countSentence(start, end, widMatrix, factor);
}
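/*
 * Illustrative sketch (not part of the original SRILM source): the widMatrix
 * layout that countSentence() above builds.  Row 0 is reserved for an
 * optional all-<s> row, rows 1..howmany hold one index per factor position,
 * row howmany+1 may become an all-</s> row, and row howmany+2 is the
 * Vocab_None terminator.  All indices below are toy values; fenced with
 * #if 0 so it does not affect the build.
 */
#if 0
#include <cstdio>

typedef unsigned ToyIndex;
const ToyIndex kNone = 0;	// stand-in for Vocab_None
const ToyIndex kSS = 1;		// stand-in for vocab.ssIndex() (<s>)
const ToyIndex kSE = 2;		// stand-in for vocab.seIndex() (</s>)

int main()
{
    const unsigned numFactors = 3;	// e.g., word, stem, POS positions
    const unsigned howmany = 2;		// two tokens read from the text
    ToyIndex wid[5][numFactors] = {{0}};

    // rows 1..howmany would be filled by vocab.addWords()/getIndices();
    // fake two tokens whose factors were assigned ids 10.. and 20..
    const ToyIndex toy[2][3] = {{10, 11, 12}, {20, 21, 22}};
    for (unsigned i = 0; i < howmany; i++)
        for (unsigned j = 0; j < numFactors; j++)
            wid[i+1][j] = toy[i][j];

    // addStartSentenceToken case: row 0 becomes all-<s>, so start = 0
    for (unsigned j = 0; j < numFactors; j++)
        wid[0][j] = kSS;
    // addEndSentenceToken case: row howmany+1 becomes all-</s>,
    // row howmany+2 the terminator, so end = howmany+1
    for (unsigned j = 0; j < numFactors; j++) {
        wid[howmany+1][j] = kSE;
        wid[howmany+2][j] = kNone;
    }

    for (unsigned i = 0; i <= howmany + 2; i++) {
        for (unsigned j = 0; j < numFactors; j++)
            printf("%3u ", wid[i][j]);
        printf("\n");
    }
    return 0;
}
#endif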
/*
 * Increment counts indexed by words, starting at node.
 *
 * Yes, Trie structures are not the most space- and compute-efficient way
 * to do this, but they're easy to use, and since they already exist, we
 * use them.
 *
 * TODO: optimize this class to use a data structure that is better suited
 * to the backoff graph.  One simple solution would be to use Trie structures
 * that keep a count value only at the leaf nodes rather than spending space
 * on those counts at every non-leaf and leaf node.  (On the other hand,
 * perhaps this space will be useful in new forms of KN smoothing.)
 */
template <class CountT>
void
FNgramCounts<CountT>::incrementCounts(FNgramNode *counts,
                                      const VocabIndex *words,
                                      const unsigned order,
                                      const unsigned minOrder,
                                      const CountT factor)
{
    FNgramNode *node = counts;

    for (int i = 0; i < (int)order; i++) {
        VocabIndex wid = words[i];

        /*
         * check for end of sentence
         */
        if (wid == Vocab_None) {
            break;
        } else {
            node = node->insertTrie(wid);
            if (i + 1 >= (int)minOrder) {
                node->value() += factor;
            }
        }
    }
}
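/*
 * Illustrative sketch (not part of the original SRILM source): the same walk
 * as incrementCounts() above, with SRILM's Trie replaced by a toy
 * std::map-based node so the block is self-contained.  With minOrder ==
 * order (as countSentence() below calls it), only the count at the final
 * node of the full n-gram is incremented.  Fenced with #if 0 so it does not
 * affect the build.
 */
#if 0
#include <cstdio>
#include <map>

typedef unsigned ToyIndex;
const ToyIndex kNone = 0;	// stand-in for Vocab_None

struct ToyNode {
    unsigned count;
    std::map<ToyIndex, ToyNode *> kids;
    ToyNode() : count(0) {}
    ToyNode *insertTrie(ToyIndex w) {	// mimics Trie::insertTrie()
        ToyNode *&slot = kids[w];
        if (slot == NULL) slot = new ToyNode;
        return slot;
    }
};

static void
toyIncrement(ToyNode *node, const ToyIndex *words,
             unsigned order, unsigned minOrder, unsigned factor)
{
    for (unsigned i = 0; i < order; i++) {
        if (words[i] == kNone) break;		// end of key
        node = node->insertTrie(words[i]);	// descend, creating nodes
        if (i + 1 >= minOrder)			// count only deep-enough nodes
            node->count += factor;
    }
}

int main()
{
    ToyNode root;
    const ToyIndex key[] = { 7, 3, 9, kNone };	// parents..., then child
    toyIncrement(&root, key, 3, 3, 1);
    toyIncrement(&root, key, 3, 3, 1);
    printf("count(7 3 9) = %u\n",
           root.kids[7]->kids[3]->kids[9]->count);		// 2
    printf("count(7 3)   = %u\n", root.kids[7]->kids[3]->count);	// 0
    return 0;	// (toy nodes are deliberately leaked)
}
#endif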
template <class CountT>
unsigned int
FNgramCounts<CountT>::countSentence(const unsigned int start, // first valid token
                                    const unsigned int end,   // last valid token
                                    WidMatrix &wm,
                                    CountT factor)
{
    static VocabIndex wids[maxNumParentsPerChild + 2];

    if (debug(DEBUG_VERY_VERBOSE))
        fprintf(stderr, "counting sentence\n");

    // if a tag is mentioned in the .flm file but does not occur in the
    // data file, give a warning.
    unsigned wordNum;
    for (wordNum = start; wm[wordNum][FNGRAM_WORD_TAG_POS] != Vocab_None; wordNum++) {
        for (int j = 0; j < (int)fnSpecs.fnSpecArray.size(); j++) {
            // TODO: for efficiency, change these loops so that wids is created
            // in reverse order (thereby reducing the number of writes),
            // and then call a version of incrementCounts() that takes
            // wids in reverse order.
            // OR: use a level iterator in reverse level order (from 0 to max).
            // Alternatively, create a version of incrementCounts() that takes
            // a bit vector saying which words it should use, thereby
            // avoiding so many writes to wids.
            const int numSubSets = 1 << fnSpecs.fnSpecArray[j].numParents;
            for (int k = 0; k < numSubSets; k++) {
                if (fnSpecs.fnSpecArray[j].parentSubsets[k].counts == NULL)
                    continue;
                int wid_index = 0;
                for (int l = fnSpecs.fnSpecArray[j].numParents - 1; l >= 0; l--) {
                    if (k & (1 << l)) {
                        if (fnSpecs.fnSpecArray[j].parentOffsets[l] + (int)wordNum < (int)start) {
                            // NOTE: adding a start-sentence token here will add
                            // more contexts than would exist for normal trigram
                            // models (i.e., Wt-2, Wt-1, Wt when t=0 would give
                            // a context <s> <s> w).  This is harmless in the
                            // word case.  Note that we need to do this, though,
                            // because for KN smoothing we potentially need all
                            // BG parents to have the appropriate counts for
                            // the child.
                            // NOTE: We do not get the *exact* same perplexity
                            // as ngram-count/ngram on word-only data unless we
                            // add in the following line here in the code:
                            if (!virtualBeginSentence) goto nextSubSet;
                            // To get the exact same perplexity, take the goto
                            // statement, and run with "-nonull" (and make sure
                            // all gtmin options are the same).
                            wids[wid_index++] = vocab.ssIndex();
                        } else if (fnSpecs.fnSpecArray[j].parentOffsets[l] + (int)wordNum > (int)end) {
                            if (!virtualEndSentence) goto nextSubSet;
                            wids[wid_index++] = vocab.seIndex();
                        } else {
                            wids[wid_index++] =
                                wm[wordNum + fnSpecs.fnSpecArray[j].parentOffsets[l]]
                                  [fnSpecs.fnSpecArray[j].parentPositions[l]];
                        }
                    }
                }
                wids[wid_index++] = wm[wordNum][fnSpecs.fnSpecArray[j].childPosition];
                wids[wid_index] = Vocab_None;
                assert(wid_index == fnSpecs.fnSpecArray[j].parentSubsets[k].order);
                incrementCounts(fnSpecs.fnSpecArray[j].parentSubsets[k].counts,
                                wids,
                                fnSpecs.fnSpecArray[j].parentSubsets[k].order,
                                fnSpecs.fnSpecArray[j].parentSubsets[k].order,
                                factor);
            nextSubSet: ;
            }
        }
    }

    for (int j = 0; j < (int)fnSpecs.fnSpecArray.size(); j++) {
        // we subtract off the beginning (start) and potentially more for
        // start/end-of-sentence tokens.
        // NOTE: NgramCounts<CountT>::countSentence() checks the
        // ssIndex/seIndex condition here, so we do the same.
#if 0
        unsigned numWords = wordNum - start;
        if (wm[start][FNGRAM_WORD_TAG_POS] == vocab.ssIndex())
            numWords--;
        if (wordNum > 0 && wm[wordNum-1][FNGRAM_WORD_TAG_POS] == vocab.seIndex())
            numWords--;
        fnSpecs.fnSpecArray[j].stats.numWords += numWords;
#endif
        fnSpecs.fnSpecArray[j].stats.numWords += (wordNum - start - 1);
        fnSpecs.fnSpecArray[j].stats.numSentences++;
    }

    if (debug(DEBUG_VERY_VERBOSE))
        fprintf(stderr, "done counting sentence\n");

    return wordNum;
}

/*****************************************************************************/

/*
 * Type-dependent count <--> string conversions
 */
//#ifdef INSTANTIATE_TEMPLATES
static
//#endif
char ctsBuffer[100];

template <class CountT>
static inline const char *
countToString(CountT count)
{
    sprintf(ctsBuffer, "%lg", (double)count);
    return ctsBuffer;
}

static inline const char *
countToString(unsigned count)
{
    sprintf(ctsBuffer, "%u", count);
    return ctsBuffer;
}

static inline const char *
countToString(int count)
{
    sprintf(ctsBuffer, "%d", count);
    return ctsBuffer;
}
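/*
 * Usage note and sketch (not part of the original SRILM source): all
 * countToString() overloads above share the single static ctsBuffer, so
 * they are not reentrant and every call overwrites the previous result.
 * A minimal demonstration of the pitfall, fenced with #if 0 so it does
 * not affect the build:
 */
#if 0
#include <cstdio>
#include <cstring>

static char ctsBuffer[100];

static const char *countToString(unsigned count)
{
    sprintf(ctsBuffer, "%u", count);
    return ctsBuffer;
}

int main()
{
    const char *a = countToString(1);
    char saved[100];
    strcpy(saved, a);			// copy before the next call
    const char *b = countToString(2);
    // a and b alias the same buffer: both now print "2"
    printf("a=%s b=%s saved=%s\n", a, b, saved);
    return 0;
}
#endif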