📄 fngramspecs.h
字号:
private: TrieIter2<VocabIndex,CountT> *myIter; }; // iteration over counts object above of a given order. // probably not needed. class CountsIter { public: CountsIter(FNgramNode& counts, VocabIndex *keys, unsigned order = 1, int (*sort)(VocabIndex,VocabIndex) = 0) : myIter(counts, keys, order, sort) {}; /* all ngrams of length order, starting * at root */ CountsIter(FNgramNode& counts, const VocabIndex *start, VocabIndex *keys, unsigned order = 1, int (*sort)(VocabIndex, VocabIndex) = 0) : myIter(*(counts.insertTrie(start)), keys, order, sort) {}; /* all ngrams of length order, rooted * at node indexed by start */ void init() { myIter.init(); }; CountT *next() { FNgramNode *node = myIter.next(); return node ? &(node->value()) : 0; }; private: TrieIter2<VocabIndex,CountT> myIter; }; // the set of parent subsets, the array index determines which // parents are used. Array<ParentSubset> parentSubsets; // Iters (defined below) over parent subsets by level, parents, // children, ancestors, and descendants in THE BACKOFF GRAPH // (BG). Note here that by "parents", "children", etc., we do // *NOT* mean the same parents and children that are specified in // the FLM specification file (and what the member variables // 'numParents', 'child', etc. refer to above). Rather, here we // mean parents and children of a node in the backoff graph. This // overloading of terminology is confusing indeed. // // Perhaps we should use "Eltern", "Kinder", "Nachfahren", and // "Vorfahren" for this and stick with English for the above??? As // much as I (JB, obviously not AS) need to practice German, how // about simply "BGParents" for "backoff graph parents", etc. This // was suggested by Katrin (who also speaks German). // // Anyway, here is a simple backoff graph, for a multi-backoff // trigram. The bit vector shown in the node of the graph gives // the set of parents that are used in that node. 
//
//  Level 2            11
//                    /  \
//  Level 1          10   01
//                    \  /
//  Level 0            00
//
// Note that, by convention, level numbers *increase* as we *add*
// bits going down from 00 to 11. The iters and code below use
// this convention.
//
// Here is a BG for a standard trigram p(w_t|w_{t-1},w_{t-2})
//
//  Level 2            11
//                       \
//  Level 1               01
//                       /
//  Level 0            00
//
// This means that the first parent (w_{t-1}) corresponds to the
// low-order bit (binary 01, i.e. 0x1), and the second parent
// (w_{t-2}) corresponds to the higher-order bit (binary 10, i.e.
// 0x2). In general, when we specify in the factored language model
// (FLM) file a distribution of the form f(c|p0,p1,...,pn), where
// c = child, and pi = i'th parent, then p0 corresponds to bit 0,
// p1 to bit 1, etc.
//
// Here is a BG for a "reverse-context" trigram p(w_t|w_{t-1},w_{t-2})
//
//  Level 2            11
//                    /
//  Level 1          10
//                    \
//  Level 0            00
//
// This means that p(W_t=w_t|W_{t-1}=w_{t-1},W_{t-2}=w_{t-2})
// first backs off to p(W_t=w_t|W_{t-2}=w_{t-2}) and then to
// p(W_t=w_t). This is different from p(W_t=w_t|W_{t-1}=w_{t-2})
// which would change the history but uses the same distribution
// (similar to SkipNgram.cc).
//
//
// Here is another more interesting case for a four-parent CPT
// f(c|p0,p1,p2,p3) (i.e., a 5-gram if they were words). The graph
// here only shows the nodes not the edges for clarity. We have an
// edge between a node in level_i and a node in level_{i+1} if all
// bits in the node in level_i are also contained in the node in
// level_{i+1}.
//
//
//  L4                              1111(0xF)
//
//  L3          0111(0x7)   1011(0xB)   1101(0xD)   1110(0xE)
//
//  L2  0011(0x3) 0101(0x5) 0110(0x6) 1001(0x9) 1010(0xA) 1100(0xC)
//
//  L1          1000(0x8)   0100(0x4)   0010(0x2)   0001(0x1)
//
//  L0                              0000(0x0)
//
//
// Note that since we're using unsigned ints for bitvectors, this means we can not
// have more than 32 parents (i.e., 33-grams) for now, assuming a 32-bit machine.
// TODO: possibly use an unlimited size bitvector class, but perhaps this
// will be not necessary since by the time we can train 33-grams, we'll
// all be using 64-bit machines.
//
// The following iters make it easy to navigate around in the above
// backoff graphs (BGs).  Each one keeps a cursor in `state`; init()
// resets the cursor and next(node) (declared here, defined elsewhere)
// writes the next matching node's bit vector into `node`.
// NOTE(review): next() presumably returns false once the iteration is
// exhausted -- confirm against its definition.

// Iterate across a given level of the backoff graph.
class LevelIter {
    const unsigned int numParents;
    const unsigned int numNodes;    // 2^numParents bit-vector values
    unsigned int state;             // iteration cursor, reset by init()
    const unsigned int level;
public:
    // The shift is done on an unsigned 1 so that it never shifts into
    // the sign bit of an int; _numParents must still be smaller than
    // the bit width of unsigned int.
    LevelIter(const unsigned int _numParents, const unsigned int _level)
        : numParents(_numParents),
          numNodes(1U << _numParents),
          level(_level)
    {
        init();
    }
    void init() { state = 0; }
    Boolean next(unsigned int &node);
};

// Iterate over the parents of a BG node.
class BGParentIter {
    const unsigned int numParents;
    const unsigned int numNodes;
    unsigned int state;
    const unsigned int homeNode;
    // presumably the number of bits set in homeNode; assigned by the
    // constructor, which is defined elsewhere
    const unsigned int numBitsSetOfHomeNode;
public:
    BGParentIter(const unsigned int _numParents, const unsigned int _homeNode);
    // Restart the scan at the first value above homeNode.
    void init() { state = (homeNode + 1); }
    Boolean next(unsigned int &node);
};

// Iterate over the (great) grandparents of a BG node.
class BGGrandParentIter {
    const unsigned int numParents;
    const unsigned int numNodes;
    unsigned int state;
    const unsigned int homeNode;
    const unsigned int numBitsSetOfHomeNode;
    const unsigned int great;   // grandparent(great=0), greatgrandparent(great=1), etc.
public:
    BGGrandParentIter(const unsigned int _numParents, const unsigned int _homeNode,
                      const unsigned int _great = 0);
    // Restart the scan at homeNode + 2^(great+1) - 1.
    void init() { state = (homeNode + ((1U << (great + 1)) - 1)); }
    Boolean next(unsigned int &node);
};

// Iterate over all ancestors of a BG node.
class BGAncestorIter {
    const unsigned int numParents;
    const unsigned int numNodes;
    unsigned int state;
    const unsigned int homeNode;
    const unsigned int numBitsSetOfHomeNode;
public:
    BGAncestorIter(const unsigned int _numParents, const unsigned int _homeNode);
    // Restart the scan at the first value above homeNode.
    void init() { state = (homeNode + 1); }
    Boolean next(unsigned int &node);
};

// Child Iter, no constraints (i.e., all children).
class BGChildIter {
    const unsigned int numParents;
    const unsigned int numNodes;
    int state;      // signed: init() yields -1 when homeNode is 0
    const unsigned int homeNode;
    const unsigned int numBitsSetOfHomeNode;
public:
    BGChildIter(const unsigned int _numParents, const unsigned int _homeNode);
    // Restart the scan at the first value below homeNode.
    void init() { state = (static_cast<int>(homeNode) - 1); }
    Boolean next(unsigned int &node);
};

// Child Iter with BO constraints.
class BGChildIterCnstr {
    const unsigned int numParents;
    const unsigned int numNodes;
    int state;      // signed: init() yields -1 when homeNode is 0
    const unsigned int homeNode;
    const unsigned int bo_constraints;
    const unsigned int numBitsSetOfHomeNode;
public:
    BGChildIterCnstr(const unsigned int _numParents,
                     const unsigned int _homeNode,
                     const unsigned int _bo_constraints);
    // Restart the scan at the first value below homeNode.
    void init() { state = (static_cast<int>(homeNode) - 1); }
    Boolean next(unsigned int &node);
};

// etc.
class BGGrandChildIter { const unsigned int numParents; const unsigned int numNodes; int state; const unsigned int homeNode; const unsigned int numBitsSetOfHomeNode; const unsigned int great; public: BGGrandChildIter(const unsigned int _numParents,const unsigned int _homeNode, const unsigned int _great=0); void init() { state = ((int)homeNode-((1<<(great+1))-1)); } Boolean next(unsigned int&node); }; class BGDescendantIter { const unsigned int numParents; const unsigned int numNodes; int state; const unsigned int homeNode; const unsigned int numBitsSetOfHomeNode; public: BGDescendantIter(const unsigned int _numParents,const unsigned int _homeNode); void init() { state = ((int)homeNode-1); } Boolean next(unsigned int&node); }; // unsigned int numParents(const unsigned int _numParents,const unsigned int _homeNode); // unsigned int numChildren(const unsigned int _numParents,const unsigned int _homeNode); /* training data stats for this LM */ TextStats stats; char *countFileName; char *lmFileName; char *initLMFile; // intial LM file }; // Collection of conditional distributions that will be // computed. Array <FNgramSpec> fnSpecArray; // Hash that maps from tag name to array position where the tag // value will be stored when parsing a tagged text file. This // also gives the number of tags we're currently working with. 
LHash<VocabString,unsigned> tagPosition;public: void printFInfo(); // constructor FNgramSpecs(File& f, FactoredVocab& fv, unsigned debuglevel = 0); // TODO: finish destructor virtual ~FNgramSpecs() {} unsigned int loadWordFactors(const VocabString *words, WordMatrix& wm, unsigned int max); void estimateDiscounts(FactoredVocab& vocab); void computeCardinalityFunctions(FactoredVocab& vocab); // Boolean readCounts(); -- Not yet implemented static inline unsigned int numBitsSet(unsigned u) { unsigned count=0; while (u) { count += (u&0x1); u >>= 1; } return count; } static VocabString getTag(VocabString a); static VocabString wordTag();};#endif /* EXCLUDE_CONTRIB_END */#endif /* _FNgramSpecs_h_ */
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -