📄 FNgramStats.cc
/*
 * FNgramStats.cc --
 *	Cross-stream N-gram counting
 *
 * Jeff Bilmes <bilmes@ee.washington.edu>
 * Kevin Duh <duh@ee.washington.edu>
 *
 */

#ifndef _FNgramStats_cc_
#define _FNgramStats_cc_

#ifndef lint
static char FNgramStats_Copyright[] = "Copyright (c) 1995-2006 SRI International. All Rights Reserved.";
static char FNgramStats_RcsId[] = "@(#)$Header: /home/srilm/devel/flm/src/RCS/FNgramStats.cc,v 1.10 2006/01/05 20:21:27 stolcke Exp $";
#endif

#ifndef EXCLUDE_CONTRIB

#include <iostream>
using namespace std;

#include <string.h>
#include <ctype.h>

const unsigned maxLineLength = 10000;

#include "FNgramStats.h"

#include "LHash.cc"
#include "Trie.cc"
#include "Array.cc"
#include "FactoredVocab.h"

#define INSTANTIATE_FNGRAMCOUNTS(CountT) \
    template class FNgramCounts<CountT>

/*
 * Debug levels used
 */
#define DEBUG_VERY_VERBOSE	10
#define DEBUG_WARNINGS		1
#define DEBUG_PRINT_TEXTSTATS	1

static inline unsigned int
numBitsSet(register unsigned u)
{
    register unsigned count = 0;
    while (u) {
        count += (u & 0x1);
        u >>= 1;
    }
    return count;
}

static inline unsigned int
backoffGraphLevel(unsigned numParents, unsigned parentSubSet)
{
    const unsigned int nbs = numBitsSet(parentSubSet);
    assert(numParents >= nbs);
    return (numParents - nbs);
}

/*
 * Return true if, given the mask, we should expand
 * the parentSubset path.
 */
static inline Boolean
expandBackoff(unsigned mask, unsigned parentSubSet)
{
    // true if at least one zero bit in parentSubSet has
    // a 1 in the corresponding position in mask
    return ((~parentSubSet & mask) != 0);
}

template <class CountT>
void
FNgramCounts<CountT>::dump()
{
    fprintf(stderr, "FNgramCounts<CountT>::dump() undefined\n");
    exit(-1);
    // cerr << "order = " << order << endl;
    // counts.dump();
}

template <class CountT>
void
FNgramCounts<CountT>::memStats(MemStats &stats)
{
    // TODO: update this object to reflect the new memory layout.
    // stats.total += sizeof(*this) - sizeof(counts) - sizeof(vocab);
    // vocab.memStats(stats);
    // counts.memStats(stats);
    fprintf(stderr, "FNgramCounts<CountT>::memStats(MemStats &stats) needs to be implemented\n");
}
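/*
 * Illustrative sketch (not part of the original SRILM source): how the
 * helpers above encode the backoff graph.  parentSubSet is a bitmask over
 * a child's parents; backoffGraphLevel() counts dropped parents; and
 * expandBackoff() checks whether the mask licenses dropping any of the
 * remaining parents.  The numbers below are made up for illustration; the
 * block is fenced with #if 0 so it does not affect the build, but it
 * compiles standalone.
 */
#if 0
#include <cstdio>
#include <cassert>

static unsigned toyNumBitsSet(unsigned u)	// same logic as numBitsSet()
{
    unsigned count = 0;
    while (u) { count += (u & 0x1); u >>= 1; }
    return count;
}

int main()
{
    const unsigned numParents = 3;	// hypothetical child with 3 parents
    const unsigned parentSubSet = 0x5;	// 0b101: parents 0 and 2 kept, 1 dropped

    // backoffGraphLevel(): 3 parents - 2 bits set = level 1 in the BG
    unsigned level = numParents - toyNumBitsSet(parentSubSet);
    printf("subset 0x%x sits at BG level %u\n", parentSubSet, level);
    assert(level == 1);

    // expandBackoff(): true iff some dropped (zero) bit is selected by mask
    const unsigned mask = 0x2;		// allow dropping parent 1
    printf("expand path? %d\n", (~parentSubSet & mask) != 0);	// prints 1
    return 0;
}
#endif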
/*
 * FNgramCounts constructor
 */
template <class CountT>
FNgramCounts<CountT>::FNgramCounts(FactoredVocab &vocab,
                                   FNgramSpecs<CountT> &fngs)
    : LMStats(vocab), fnSpecs(fngs),
      virtualBeginSentence(true), virtualEndSentence(true),
      addStartSentenceToken(true), addEndSentenceToken(true),
      vocab(vocab)
{
    LHashIter<VocabString, unsigned> tags(fnSpecs.tagPosition);
    VocabString tag;
    unsigned *pos;
    while ((pos = tags.next(tag)) != NULL) {
        vocab.addTaggedIndices(tag, FNGRAM_WORD_TAG_SEP_STR);
    }
    vocab.createTagSubVocabs(fnSpecs.tagPosition);
}

template <class CountT>
unsigned int
FNgramCounts<CountT>::countSentence(const VocabString *words, CountT factor)
{
    static WordMatrix wordMatrix;
    static WidMatrix widMatrix;
    unsigned int howmany;

    howmany = fnSpecs.loadWordFactors(words, wordMatrix, maxWordsPerLine + 1);

    /*
     * Check for buffer overflow or no data
     */
    if (howmany == maxWordsPerLine + 1 || howmany == 0) {
        return 0;
    }

    ::memset(widMatrix[0], 0, (maxNumParentsPerChild + 1) * sizeof(VocabIndex));
    if (openVocab) {
        for (int i = 0; i < (int)howmany; i++) {
            ::memset(widMatrix[i+1], 0, (maxNumParentsPerChild + 1) * sizeof(VocabIndex));
            vocab.addWords(wordMatrix[i], widMatrix[i+1], maxNumParentsPerChild + 1);
            for (int j = 0; widMatrix[i+1][j] != Vocab_None; j++) {
                ((FactoredVocab *)&vocab)->addTagWord(j, widMatrix[i+1][j]);
            }
        }
    } else {
        for (int i = 0; i < (int)howmany; i++) {
            ::memset(widMatrix[i+1], 0, (maxNumParentsPerChild + 1) * sizeof(VocabIndex));
            vocab.getIndices(wordMatrix[i], widMatrix[i+1], maxNumParentsPerChild + 1,
                             vocab.unkIndex());
            // TODO: while this code is finished for closed vocabularies, we
            // still need to update FactoredVocab::read() before closed-vocab
            // mode will work.
            for (int j = 0; widMatrix[i+1][j] != Vocab_None; j++) {
                if (widMatrix[i+1][j] == vocab.unkIndex()) {
                    // NOTE: since we use the same global index set for all
                    // words even if they're in different factors, you might
                    // think it would be possible for an unknown word in one
                    // factor to be known in another, and thereby give a real
                    // word index for the factor in which that word is
                    // unknown.  We assume here, however, that all tokens in
                    // the word file either carry no tag or the special word
                    // tag "W-" (so are regular words), or have a unique tag
                    // that identifies them.  Therefore, this problem
                    // shouldn't happen.

                    // need to update all LMs that have this index as a child.
                    for (int lm = 0; lm < (int)fnSpecs.fnSpecArray.size(); lm++) {
                        if (j == fnSpecs.fnSpecArray[lm].childPosition)
                            fnSpecs.fnSpecArray[lm].stats.numOOVs++;
                    }
                }
            }
        }
    }

    /*
     * Update the OOV count; we need to do this for all factored
     * models that use the tag as a child.
     */
    if (!openVocab) {
        for (unsigned i = 1; i <= howmany; i++) {
            for (unsigned j = 0; j < fnSpecs.tagPosition.numEntries(); j++) {
                if (widMatrix[i][j] == vocab.unkIndex()) {
                    // TODO: update OOV counts here.
                }
            }
        }
    }

    /*
     * Insert begin/end sentence tokens if necessary
     */
    unsigned start;
    unsigned end;

    if (widMatrix[1][FNGRAM_WORD_TAG_POS] == vocab.ssIndex()) {
        start = 1;
        // TODO: note that any other factors at the sentence-start position
        // will be destroyed here, all being forced to the start-sentence token.
        for (unsigned j = FNGRAM_WORD_TAG_POS + 1; j < fnSpecs.tagPosition.numEntries(); j++) {
            widMatrix[1][j] = vocab.ssIndex();
        }
    } else if (addStartSentenceToken) {
        start = 0;
        widMatrix[0][FNGRAM_WORD_TAG_POS] = vocab.ssIndex();
        for (unsigned j = FNGRAM_WORD_TAG_POS + 1; j < fnSpecs.tagPosition.numEntries(); j++) {
            widMatrix[0][j] = vocab.ssIndex();
        }
    } else {
        // just use what is given in the text file.
        start = 1;
    }

    if (widMatrix[howmany][FNGRAM_WORD_TAG_POS] != vocab.seIndex()
        && addEndSentenceToken) {
        for (unsigned j = FNGRAM_WORD_TAG_POS; j < fnSpecs.tagPosition.numEntries(); j++) {
            widMatrix[howmany+1][j] = vocab.seIndex();
            widMatrix[howmany+2][j] = Vocab_None;
        }
        end = howmany + 1;
    } else {
        // this was not done above, so we do it here.
        for (unsigned j = 0; j < fnSpecs.tagPosition.numEntries(); j++) {
            widMatrix[howmany+1][j] = Vocab_None;
        }
        end = howmany;
    }

    return countSentence(start, end, widMatrix, factor);
}
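/*
 * Illustrative sketch (not part of the original SRILM source): the widMatrix
 * layout that countSentence() above builds.  Row 0 is reserved for an
 * optional all-<s> row, rows 1..howmany hold one index per factor position,
 * row howmany+1 may become an all-</s> row, and row howmany+2 is the
 * Vocab_None terminator.  All indices below are toy values; fenced with
 * #if 0 so it does not affect the build.
 */
#if 0
#include <cstdio>

typedef unsigned ToyIndex;
const ToyIndex kNone = 0;	// stand-in for Vocab_None
const ToyIndex kSS = 1;		// stand-in for vocab.ssIndex() (<s>)
const ToyIndex kSE = 2;		// stand-in for vocab.seIndex() (</s>)

int main()
{
    const unsigned numFactors = 3;	// e.g., word, stem, POS positions
    const unsigned howmany = 2;		// two tokens read from the text
    ToyIndex wid[5][numFactors] = {{0}};

    // rows 1..howmany would be filled by vocab.addWords()/getIndices();
    // fake two tokens whose factors were assigned ids 10.. and 20..
    const ToyIndex toy[2][3] = {{10, 11, 12}, {20, 21, 22}};
    for (unsigned i = 0; i < howmany; i++)
        for (unsigned j = 0; j < numFactors; j++)
            wid[i+1][j] = toy[i][j];

    // addStartSentenceToken case: row 0 becomes all-<s>, so start = 0
    for (unsigned j = 0; j < numFactors; j++)
        wid[0][j] = kSS;
    // addEndSentenceToken case: row howmany+1 becomes all-</s>,
    // row howmany+2 the terminator, so end = howmany+1
    for (unsigned j = 0; j < numFactors; j++) {
        wid[howmany+1][j] = kSE;
        wid[howmany+2][j] = kNone;
    }

    for (unsigned i = 0; i <= howmany + 2; i++) {
        for (unsigned j = 0; j < numFactors; j++)
            printf("%3u ", wid[i][j]);
        printf("\n");
    }
    return 0;
}
#endif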
/*
 * Increment counts indexed by words, starting at node.
 *
 * Yes, Trie structures are not the most space- and compute-efficient way
 * to do this, but they're easy to use, and since they already exist, we
 * use them.
 *
 * TODO: optimize this class to use a data structure that is better suited
 * to the backoff graph.  One simple solution would be to use Trie structures
 * that keep a count value only at the leaf nodes rather than spending space
 * on those counts at every non-leaf and leaf node.  (On the other hand,
 * perhaps this space will be useful in new forms of KN smoothing.)
 */
template <class CountT>
void
FNgramCounts<CountT>::incrementCounts(FNgramNode *counts,
                                      const VocabIndex *words,
                                      const unsigned order,
                                      const unsigned minOrder,
                                      const CountT factor)
{
    FNgramNode *node = counts;

    for (int i = 0; i < (int)order; i++) {
        VocabIndex wid = words[i];

        /*
         * check for end of sentence
         */
        if (wid == Vocab_None) {
            break;
        } else {
            node = node->insertTrie(wid);
            if (i + 1 >= (int)minOrder) {
                node->value() += factor;
            }
        }
    }
}
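/*
 * Illustrative sketch (not part of the original SRILM source): the same walk
 * as incrementCounts() above, with SRILM's Trie replaced by a toy
 * std::map-based node so the block is self-contained.  With minOrder ==
 * order (as countSentence() below calls it), only the count at the final
 * node of the full n-gram is incremented.  Fenced with #if 0 so it does not
 * affect the build.
 */
#if 0
#include <cstdio>
#include <map>

typedef unsigned ToyIndex;
const ToyIndex kNone = 0;	// stand-in for Vocab_None

struct ToyNode {
    unsigned count;
    std::map<ToyIndex, ToyNode *> kids;
    ToyNode() : count(0) {}
    ToyNode *insertTrie(ToyIndex w) {	// mimics Trie::insertTrie()
        ToyNode *&slot = kids[w];
        if (slot == NULL) slot = new ToyNode;
        return slot;
    }
};

static void
toyIncrement(ToyNode *node, const ToyIndex *words,
             unsigned order, unsigned minOrder, unsigned factor)
{
    for (unsigned i = 0; i < order; i++) {
        if (words[i] == kNone) break;		// end of key
        node = node->insertTrie(words[i]);	// descend, creating nodes
        if (i + 1 >= minOrder)			// count only deep-enough nodes
            node->count += factor;
    }
}

int main()
{
    ToyNode root;
    const ToyIndex key[] = { 7, 3, 9, kNone };	// parents..., then child
    toyIncrement(&root, key, 3, 3, 1);
    toyIncrement(&root, key, 3, 3, 1);
    printf("count(7 3 9) = %u\n",
           root.kids[7]->kids[3]->kids[9]->count);		// 2
    printf("count(7 3)   = %u\n", root.kids[7]->kids[3]->count);	// 0
    return 0;	// (toy nodes are deliberately leaked)
}
#endif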
template <class CountT>
unsigned int
FNgramCounts<CountT>::countSentence(const unsigned int start, // first valid token
                                    const unsigned int end,   // last valid token
                                    WidMatrix &wm,
                                    CountT factor)
{
    static VocabIndex wids[maxNumParentsPerChild + 2];

    if (debug(DEBUG_VERY_VERBOSE))
        fprintf(stderr, "counting sentence\n");

    // if a tag is mentioned in the .flm file but does not occur in the
    // data file, give a warning.
    unsigned wordNum;
    for (wordNum = start; wm[wordNum][FNGRAM_WORD_TAG_POS] != Vocab_None; wordNum++) {
        for (int j = 0; j < (int)fnSpecs.fnSpecArray.size(); j++) {
            // TODO: for efficiency, change these loops so that wids is created
            // in reverse order (thereby reducing the number of writes),
            // and then call a version of incrementCounts() that takes
            // wids in reverse order.
            // OR: use a level iterator in reverse level order (from 0 to max).
            // Alternatively, create a version of incrementCounts() that takes
            // a bit vector saying which words it should use, thereby
            // avoiding so many writes to wids.
            const int numSubSets = 1 << fnSpecs.fnSpecArray[j].numParents;
            for (int k = 0; k < numSubSets; k++) {
                if (fnSpecs.fnSpecArray[j].parentSubsets[k].counts == NULL)
                    continue;
                int wid_index = 0;
                for (int l = fnSpecs.fnSpecArray[j].numParents - 1; l >= 0; l--) {
                    if (k & (1 << l)) {
                        if (fnSpecs.fnSpecArray[j].parentOffsets[l] + (int)wordNum < (int)start) {
                            // NOTE: adding a start-sentence token here will add
                            // more contexts than would exist for normal trigram
                            // models (i.e., Wt-2, Wt-1, Wt when t=0 would give
                            // a context <s> <s> w).  This is harmless in the
                            // word case.  Note that we need to do this, though,
                            // because for KN smoothing we potentially need all
                            // BG parents to have the appropriate counts for
                            // the child.
                            // NOTE: We do not get the *exact* same perplexity
                            // as ngram-count/ngram on word-only data unless we
                            // add in the following line here in the code:
                            if (!virtualBeginSentence) goto nextSubSet;
                            // To get the exact same perplexity, take the goto
                            // statement, and run with "-nonull" (and make sure
                            // all gtmin options are the same).
                            wids[wid_index++] = vocab.ssIndex();
                        } else if (fnSpecs.fnSpecArray[j].parentOffsets[l] + (int)wordNum > (int)end) {
                            if (!virtualEndSentence) goto nextSubSet;
                            wids[wid_index++] = vocab.seIndex();
                        } else {
                            wids[wid_index++] =
                                wm[wordNum + fnSpecs.fnSpecArray[j].parentOffsets[l]]
                                  [fnSpecs.fnSpecArray[j].parentPositions[l]];
                        }
                    }
                }
                wids[wid_index++] = wm[wordNum][fnSpecs.fnSpecArray[j].childPosition];
                wids[wid_index] = Vocab_None;
                assert(wid_index == fnSpecs.fnSpecArray[j].parentSubsets[k].order);
                incrementCounts(fnSpecs.fnSpecArray[j].parentSubsets[k].counts,
                                wids,
                                fnSpecs.fnSpecArray[j].parentSubsets[k].order,
                                fnSpecs.fnSpecArray[j].parentSubsets[k].order,
                                factor);
            nextSubSet: ;
            }
        }
    }

    for (int j = 0; j < (int)fnSpecs.fnSpecArray.size(); j++) {
        // we subtract off the beginning (start) and potentially more for
        // start/end-of-sentence tokens.
        // NOTE: NgramCounts<CountT>::countSentence() checks the
        // ssIndex/seIndex condition here, so we do the same.
#if 0
        unsigned numWords = wordNum - start;
        if (wm[start][FNGRAM_WORD_TAG_POS] == vocab.ssIndex())
            numWords--;
        if (wordNum > 0 && wm[wordNum-1][FNGRAM_WORD_TAG_POS] == vocab.seIndex())
            numWords--;
        fnSpecs.fnSpecArray[j].stats.numWords += numWords;
#endif
        fnSpecs.fnSpecArray[j].stats.numWords += (wordNum - start - 1);
        fnSpecs.fnSpecArray[j].stats.numSentences++;
    }

    if (debug(DEBUG_VERY_VERBOSE))
        fprintf(stderr, "done counting sentence\n");

    return wordNum;
}

/*****************************************************************************/

/*
 * Type-dependent count <--> string conversions
 */
//#ifdef INSTANTIATE_TEMPLATES
static
//#endif
char ctsBuffer[100];

template <class CountT>
static inline const char *
countToString(CountT count)
{
    sprintf(ctsBuffer, "%lg", (double)count);
    return ctsBuffer;
}

static inline const char *
countToString(unsigned count)
{
    sprintf(ctsBuffer, "%u", count);
    return ctsBuffer;
}

static inline const char *
countToString(int count)
{
    sprintf(ctsBuffer, "%d", count);
    return ctsBuffer;
}
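/*
 * Usage note and sketch (not part of the original SRILM source): all
 * countToString() overloads above share the single static ctsBuffer, so
 * they are not reentrant and every call overwrites the previous result.
 * A minimal demonstration of the pitfall, fenced with #if 0 so it does
 * not affect the build:
 */
#if 0
#include <cstdio>
#include <cstring>

static char ctsBuffer[100];

static const char *countToString(unsigned count)
{
    sprintf(ctsBuffer, "%u", count);
    return ctsBuffer;
}

int main()
{
    const char *a = countToString(1);
    char saved[100];
    strcpy(saved, a);			// copy before the next call
    const char *b = countToString(2);
    // a and b alias the same buffer: both now print "2"
    printf("a=%s b=%s saved=%s\n", a, b, saved);
    return 0;
}
#endif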