⭐ 欢迎来到虫虫下载站! | 📦 资源下载 📁 资源专辑 ℹ️ 关于我们
⭐ 虫虫下载站

📄 fngramspecs.h

📁 这是一款很好用的工具包
💻 H
📖 第 1 页 / 共 2 页
字号:
/*
 * FngramSpecs.h --
 *
 *	structure to store the specifications of a factored LM
 *
 *  Jeff Bilmes  <bilmes@ee.washington.edu>
 *
 * @(#)$Header: /home/srilm/devel/flm/src/RCS/FNgramSpecs.h,v 1.9 2006/01/09 18:30:49 stolcke Exp $
 *
 */

#ifndef _FNgramSpecs_h_
#define _FNgramSpecs_h_

#ifndef EXCLUDE_CONTRIB

#include <iostream>
// NOTE(review): `using namespace std;` at header scope leaks std into every
// includer; kept as-is here to preserve behavior of existing callers.
using namespace std;

#include <stdio.h>

#include "LHash.h"
#include "Trie.h"
#include "Array.h"
#include "LMStats.h"
#include "FactoredVocab.h"

/*
 * Debug levels used
 */
#define DEBUG_TAG_WARNINGS   8
#define DEBUG_VERY_VERBOSE   10
#define DEBUG_EVERY_SENTENCE_INFO   12
#define DEBUG_EXTREME        20
#define DEBUG_WARN_DUP_TAG   1
#define DEBUG_BG_PRINT 1
#define DEBUG_MISSING_FIRST_LAST_WORD   1

// Upper bound on the number of parent factors a child may condition on.
// Parent subsets are addressed as bit vectors (see ParentSubset below),
// so 32 matches the width of the `unsigned int bits` arguments.
const unsigned int maxNumParentsPerChild = 32;
const unsigned int maxExtraWordsPerLine = 3;

// output files with these names are never written
const VocabString FNGRAM_DEV_NULL_FILE = "_";

// Count-trie node type; CountT is the enclosing template's count type.
#define FNgramNode	Trie<VocabIndex,CountT>

template <class CountT> class FNgramCounts;	// forward declaration
class FDiscount;
class FNgram;
class WidMatrix;
class WordMatrix;

// Per-node strategy for scoring a backoff-graph (BG) child when deciding
// which context to back off to (see backoffValueRSubCtxW() below).
enum BackoffNodeStrategy { 
  CountsNoNorm=0,           // use absolute counts
  CountsSumCountsNorm=1,    // norm by sum counts at level (== event MI maximization)
  CountsSumNumWordsNorm=2,  // norm by num words
  CountsProdCardinalityNorm=3,  // norm by prod. cardinality of gram
  CountsSumCardinalityNorm=4,   // norm by sum. cardinality of gram
  CountsSumLogCardinalityNorm=5, // norm by sum log cardinality of gram
  BogNodeProb=6 // backoff graph node probability
  // To add your favorite strategy here, add code 1) to boNode() in
  // FNgramLM.cc, 2) to backoffValueRSubCtxW(), and 3) to the node
  // option reading code in constructor for FNgramSpecs. Also make sure
  // to check the (great)grandchild score summing code in
  // FNgram::boNode() to make sure that it still makes sense with your
  // strategy.
};

// combination methods: how the values of multiple backoff-graph children
// are combined into a single value for a node.
enum BackoffNodeCombine { 
  MaxBgChild=0,
  MinBgChild=1,
  SumBgChild=2,
  ProdBgChild=3,
  AvgBgChild=4,
  GmeanBgChild=5,
  WmeanBgChild=6
};

// useful defines.
const char FNGRAM_WORD_TAG = 'W';
const VocabString FNGRAM_WORD_TAG_STR = "W";
// NOTE(review): 2 is one more than strlen("W") -- presumably it accounts for
// the tag separator or terminating NUL; confirm at use sites before changing.
const unsigned FNGRAM_WORD_TAG_STR_LEN = 2;
// stuff will break if next const is changed to something other than 0
const unsigned FNGRAM_WORD_TAG_POS = 0;
const VocabString FNGRAM_WORD_TAG_SEP_STR = "-";
const char FNGRAM_WORD_TAG_SEP = '-';
const VocabString FNGRAM_WORD_TAG_NULL_SPEC_STR = Vocab_NULL;
const char FNGRAM_FACTOR_SEPARATOR = ':';
const VocabString FNGRAM_FACTOR_SEPARATOR_STR = ":";

/*
 * FNgramSpecs -- the parsed specification of a factored language model:
 * one FNgramSpec per child factor, each holding the 2^N parent-subset
 * (backoff-graph node) count structures and their per-node smoothing
 * options.  (Class continues beyond this excerpt.)
 */
template <class CountT>
class FNgramSpecs : public Debug {
  friend class FNgram;
  friend class FactoredVocab;

  FactoredVocab& fvocab;

  // the template&friend stuff does not seem to work for the gcc 2.95.3
  // so we make everything public for now.
public:

  // data structure for specifying which factored forms are to be
  // counted
  struct FNgramSpec {
    friend class FactoredVocab;

    // NOTE: countFileName is declared further down in this struct,
    // beyond this excerpt (it must be, for this assignment to compile).
    FNgramSpec() { child = countFileName = NULL;
                   numParents = 0; }

    VocabString child;                 // the child factor C in P(C|P1,...,PN)
    unsigned childPosition;
    unsigned numParents;               // N, number of conditioning parents
    unsigned numSubSets;               // number of parent subsets (2^N BG nodes)
    Array<VocabString> parents;
    Array<unsigned> parentPositions;
    Array<int> parentOffsets; 

    unsigned int parseNodeString(char *str,Boolean& success);
    Boolean printNodeString(FILE* f,unsigned int node);

    // for each set of subsets of parents (all 2^N of them, where
    // N is the number of parents), (i.e., each node) we have the following
    // counts-like structure.
    struct ParentSubset {
      // 1. the "n-gram" for each parent subset has its own counts
      FNgramNode* counts;
      // Here, the "order" in the sense that P(C|P1,P2,...,PN) is
      // a distribution, then order = (N+1), so normal notion of N-gram order,
      // but for an arbitrary set of parents. order >= 1
      unsigned int order;
      // Number of BG children, but based on the constraints (i.e., for
      // certain constraints, some BG children might not be used and
      // have counts == NULL). This must be >= 1 to have a well-defined
      // model.
      unsigned int numBGChildren;
      // Discount object for this node.
      FDiscount *discount;
      // options for this node, pretty much the same thing you'll find
      // on the command line of ngram-counts.cc
      unsigned int backoffConstraint;  // ~0x0 (all bits) == unconstrained
      BackoffNodeStrategy backoffStrategy;
      BackoffNodeCombine backoffCombine;
      unsigned int gtmin;
      unsigned int gtmax;
      char *gtFile;
      double cdiscount;
      Boolean ndiscount;
      Boolean wbdiscount;
      Boolean kndiscount;
      Boolean ukndiscount;
      char *knFile;
      Boolean knCountsModified;
      Boolean knCountsModifyAtEnd;
      unsigned int knCountParent;
      Boolean interpolate;
      char *writeFile;                 // counts file for this node
      LogP2 *wmean;                    // weights for WmeanBgChild combination
      double prodCardinalities;
      double sumCardinalities;
      double sumLogCardinalities;

      // Generalized backoff is required only when there is more than one
      // backoff-graph child AND the combine method is neither AvgBgChild
      // nor WmeanBgChild.
      Boolean requiresGenBackoff() {
	return !((numBGChildren <= 1)
		 ||
		 (backoffCombine == AvgBgChild)
		 ||
		 (backoffCombine == WmeanBgChild));
      }

      // initialize to some sensible values, gt will be default.
      ParentSubset() {
	counts = NULL;
	order = 0;
	numBGChildren = 0;
	discount = NULL;
	backoffConstraint = ~0x0;
	backoffStrategy = CountsSumCountsNorm;
	backoffCombine = MaxBgChild;
	// TODO: make it possible for these defaults to be node dependent
	//       like in ngram-counts.cc
	gtmin = 1;
	gtmax = 7;
	gtFile = 0;
	cdiscount = -1.0;
	ndiscount = false;
	wbdiscount = false;
	kndiscount = false;
	ukndiscount = false;
	knFile = NULL;
	knCountsModified = false;
	knCountsModifyAtEnd = false;
	knCountParent = ~0x0;
	interpolate = false;
	writeFile = NULL;
	wmean = NULL;
      };

      /*
       * Individual word/ngram lookup and insertion,
       * caller should make sure counts != NULL.
       */
      CountT *findCount(const VocabIndex *words)
          { return counts->find(words); };
      CountT *findCount(const VocabIndex *words, VocabIndex word1)
          { FNgramNode *node = counts->findTrie(words);
	    return node ? node->find(word1) : 0; }
      CountT *findCountR(const VocabIndex *words, VocabIndex word1);

      // Fast routines which are used given a BG-parent node context
      // but we wish to find the count of a child, given by the bit vector.
      // (default bits = ~0x0, i.e. all parents selected.)
      // These routines should probably live in Trie.{cc,h}.
      CountT *findCountSubCtx(const VocabIndex *words, unsigned int bits = ~0x0);
      CountT *findCountSubCtxW(const VocabIndex *words, VocabIndex word1,
			      unsigned int bits = ~0x0);
      // Same as above two, but indexes into words in the reverse order. This
      // is because LMs use wids in Tries in reverse order. These
      // routines give faster count trie lookups when wids are in reversed order
      // rather than having to reverse the wids list first.
      CountT *findCountRSubCtx(const VocabIndex *words, unsigned int bits = ~0x0);
      CountT *findCountRSubCtxW(const VocabIndex *words, VocabIndex word1,
				unsigned int bits = ~0x0);

      // return a value giving a "score" for the backoff context.
      // the caller might want to maximize that score to choose which context to use.
      double backoffValueRSubCtxW(VocabIndex word1,
				  const VocabIndex*words,
				  unsigned int nWrtwCip,
				  BackoffNodeStrategy parentsStrategy,
				  FNgram& fngram,
				  unsigned int specNum,
				  unsigned int node);

      // other routines for manipulating count objects.
      CountT *insertCount(const VocabIndex *words)
          { return counts->insert(words); };
      CountT *insertCount(const VocabIndex *words, VocabIndex word1)
          { FNgramNode *node = counts->insertTrie(words);
	    return node->insert(word1); };
      CountT *removeCount(const VocabIndex *words)
          { return counts->remove(words); };
      CountT *removeCount(const VocabIndex *words, VocabIndex word1)
          { FNgramNode *node = counts->findTrie(words);
	    return node ? node->remove(word1) : 0; };
      CountT accumulateCounts(FNgramNode* counts);
      CountT accumulateCounts()
      { return accumulateCounts(counts); }
    };

    // iteration over counts entries in a parent subset object
    // Note that since count tries in this case only store counts for a given
    // level, we can get the "order" directly from the object itself. We
    // still give the option, however, to give a different order and iterate
    // over a context even though all of those counts will always be zero
    // (this is necessary in LM estimation)
    //
    // NOTE(review): PSIter owns myIter via raw pointer and deletes it in the
    // destructor, but no copy constructor/assignment is declared in this
    // excerpt -- copying a PSIter would double-delete; confirm copies never
    // happen (or that suppression exists beyond this excerpt).
    class PSIter {
    public:
      /* all ngrams of length order, starting
       * at root */
      // order == ~0x0 is a sentinel meaning "use pss.order".
      PSIter(ParentSubset& pss,
	     VocabIndex *keys,
	     unsigned order = ~0x0,
	     int (*sort)(VocabIndex,VocabIndex) = 0) {
	if (order == ~0x0) order = pss.order;
	if (pss.counts == NULL) {
	  myIter = NULL;
	} else {
	  myIter = new TrieIter2<VocabIndex,CountT>(*(pss.counts), keys, order, sort);
	}
      }

      /* all count grams of length order for this parent subset, rooted
       * at node indexed by start */
      // NOTE(review): uses insertTrie(start), not findTrie -- iterating from
      // a context that does not yet exist will create it in the count trie.
      PSIter(ParentSubset& pss,
	     const VocabIndex *start,
	     VocabIndex *keys,
	     unsigned order = ~0x0,
	     int (*sort)(VocabIndex, VocabIndex) = 0) {
	if (order == ~0x0) order = pss.order;
	if (pss.counts == NULL) {
	  myIter = NULL;
	} else {
	  myIter = new TrieIter2<VocabIndex,CountT>(*(pss.counts->insertTrie(start)),
						    keys,order,sort);
	}
      }

      ~PSIter() { delete myIter; }
      void init() { if (myIter) myIter->init(); }

      // Returns a pointer to the next count, or 0 when exhausted (or when
      // this iterator was built over a NULL counts trie).
      CountT *next()
      { if (!myIter) return 0;
	FNgramNode *node = myIter->next();
	return node ? &(node->value()) : 0; }

⌨️ 快捷键说明

复制代码 Ctrl + C
搜索代码 Ctrl + F
全屏模式 F11
切换主题 Ctrl + Shift + D
显示快捷键 ?
增大字号 Ctrl + =
减小字号 Ctrl + -