📄 fngramspecs.h
字号:
/* * FngramSpecs.h -- * * structure to store the specifications of a factored LM * * Jeff Bilmes <bilmes@ee.washington.edu> * * @(#)$Header: /home/srilm/devel/flm/src/RCS/FNgramSpecs.h,v 1.9 2006/01/09 18:30:49 stolcke Exp $ * */#ifndef _FNgramSpecs_h_#define _FNgramSpecs_h_#ifndef EXCLUDE_CONTRIB#include <iostream>using namespace std;#include <stdio.h>#include "LHash.h"#include "Trie.h"#include "Array.h"#include "LMStats.h"#include "FactoredVocab.h"/* * Debug levels used */#define DEBUG_TAG_WARNINGS 8#define DEBUG_VERY_VERBOSE 10#define DEBUG_EVERY_SENTENCE_INFO 12#define DEBUG_EXTREME 20#define DEBUG_WARN_DUP_TAG 1#define DEBUG_BG_PRINT 1#define DEBUG_MISSING_FIRST_LAST_WORD 1const unsigned int maxNumParentsPerChild = 32;const unsigned int maxExtraWordsPerLine = 3;// output files with these names are never writtenconst VocabString FNGRAM_DEV_NULL_FILE = "_";#define FNgramNode Trie<VocabIndex,CountT>template <class CountT> class FNgramCounts; // forward declarationclass FDiscount;class FNgram;class WidMatrix;class WordMatrix;enum BackoffNodeStrategy { CountsNoNorm=0, // use absolute counts CountsSumCountsNorm=1, // norm by sum counts at level (== event MI maximization) CountsSumNumWordsNorm=2, // norm by num words CountsProdCardinalityNorm=3, // norma by prod. cardinality of gram CountsSumCardinalityNorm=4, // norma by sum. cardinality of gram CountsSumLogCardinalityNorm=5, // norma by sum log cardinality of gram BogNodeProb=6 // backoff graph node probability // To add your favorite strategy here, add code 1) to boNode() in // FNgramLM.cc, 2) to backoffValueRSubCtxW(), and 3) to the node // option reading code in constructor for FNgramSpecs. Also make sure // to check the (great)grandchild score summing code in // FNgram::boNode() to make sure that it still makes sense with your // strategy.};// combination methodsenum BackoffNodeCombine { MaxBgChild=0, MinBgChild=1, SumBgChild=2, ProdBgChild=3, AvgBgChild=4, GmeanBgChild=5, WmeanBgChild=6};// useful defines.const char FNGRAM_WORD_TAG = 'W';const VocabString FNGRAM_WORD_TAG_STR = "W";const unsigned FNGRAM_WORD_TAG_STR_LEN = 2;// stuff will break if next const is changed to something other than 0const unsigned FNGRAM_WORD_TAG_POS = 0;const VocabString FNGRAM_WORD_TAG_SEP_STR = "-";const char FNGRAM_WORD_TAG_SEP = '-';const VocabString FNGRAM_WORD_TAG_NULL_SPEC_STR = Vocab_NULL;const char FNGRAM_FACTOR_SEPARATOR = ':';const VocabString FNGRAM_FACTOR_SEPARATOR_STR = ":";template <class CountT>class FNgramSpecs : public Debug { friend class FNgram; friend class FactoredVocab; FactoredVocab& fvocab; // the template&friend stuff does not seem to work for the gcc 2.95.3 // so we make everything public for now.public: // data structure for specifying which factored forms are to be // counted struct FNgramSpec { friend class FactoredVocab; FNgramSpec() { child = countFileName = NULL; numParents = 0; } VocabString child; unsigned childPosition; unsigned numParents; unsigned numSubSets; Array<VocabString> parents; Array<unsigned> parentPositions; Array<int> parentOffsets; unsigned int parseNodeString(char *str,Boolean& success); Boolean printNodeString(FILE* f,unsigned int node); // for each set of subsets of parents (all 2^N of them, where // N is the number of parents), (i.e., each node) we have the following // counts-like structure. struct ParentSubset { // 1. the "n-gram" for each parent subset has its own counts FNgramNode* counts; // Here, the "order" in the sence that P(C|P1,P2,...,PN) is // a distribution, then order = (N+1), so normal notion of N-gram order, // but for an arbitrary set of parents. order >= 1 unsigned int order; // Number of BG children, but based on the constraints (i.e., for // certain constraints, some BG children might not be used and // have counts == NULL). This must be >= 1 to have a well-defined // model. unsigned int numBGChildren; // Discount object for this node. FDiscount *discount; // options for this node, pretty much the same thing you'll find // on the command line of ngram-counts.cc unsigned int backoffConstraint; BackoffNodeStrategy backoffStrategy; BackoffNodeCombine backoffCombine; unsigned int gtmin; unsigned int gtmax; char *gtFile; double cdiscount; Boolean ndiscount; Boolean wbdiscount; Boolean kndiscount; Boolean ukndiscount; char *knFile; Boolean knCountsModified; Boolean knCountsModifyAtEnd; unsigned int knCountParent; Boolean interpolate; char *writeFile; // counts file for this node LogP2 *wmean; double prodCardinalities; double sumCardinalities; double sumLogCardinalities; Boolean requiresGenBackoff() { return !((numBGChildren <= 1) || (backoffCombine == AvgBgChild) || (backoffCombine == WmeanBgChild)); } // initialize to some sensible values, gt will be default. ParentSubset() { counts = NULL; order = 0; numBGChildren = 0; discount = NULL; backoffConstraint = ~0x0; backoffStrategy = CountsSumCountsNorm; backoffCombine = MaxBgChild; // TODO: make it possible for these defaults to be node dependent // like in ngram-counts.cc gtmin = 1; gtmax = 7; gtFile = 0; cdiscount = -1.0; ndiscount = false; wbdiscount = false; kndiscount = false; ukndiscount = false; knFile = NULL; knCountsModified = false; knCountsModifyAtEnd = false; knCountParent = ~0x0; interpolate = false; writeFile = NULL; wmean = NULL; }; /* * Individual word/ngram lookup and insertion, * caller should make sure counts != NULL. */ CountT *findCount(const VocabIndex *words) { return counts->find(words); }; CountT *findCount(const VocabIndex *words, VocabIndex word1) { FNgramNode *node = counts->findTrie(words); return node ? node->find(word1) : 0; } CountT *findCountR(const VocabIndex *words, VocabIndex word1); // Fast routines which are used given a BG-parent node context // but we wish to find the count of a child, given by the bit vector. // These routines should probably live in Trie.{cc,h}. CountT *findCountSubCtx(const VocabIndex *words, unsigned int bits = ~0x0); CountT *findCountSubCtxW(const VocabIndex *words, VocabIndex word1, unsigned int bits = ~0x0); // Same as above two, but indexes into words in the reverse order. This // is because LMs use wids in Tries in reverse order. These // routines give faster count trie lookups when wids are in reversed order // rather than having to reverse the wids list first. CountT *findCountRSubCtx(const VocabIndex *words, unsigned int bits = ~0x0); CountT *findCountRSubCtxW(const VocabIndex *words, VocabIndex word1, unsigned int bits = ~0x0); // return a value giving a "score" for the backoff context. // the caller might want to maximize that score to choose which context to use. double backoffValueRSubCtxW(VocabIndex word1, const VocabIndex*words, unsigned int nWrtwCip, BackoffNodeStrategy parentsStrategy, FNgram& fngram, unsigned int specNum, unsigned int node); // other routines for manipulating count objects. CountT *insertCount(const VocabIndex *words) { return counts->insert(words); }; CountT *insertCount(const VocabIndex *words, VocabIndex word1) { FNgramNode *node = counts->insertTrie(words); return node->insert(word1); }; CountT *removeCount(const VocabIndex *words) { return counts->remove(words); }; CountT *removeCount(const VocabIndex *words, VocabIndex word1) { FNgramNode *node = counts->findTrie(words); return node ? node->remove(word1) : 0; }; CountT accumulateCounts(FNgramNode* counts); CountT accumulateCounts() { return accumulateCounts(counts); } }; // iteration over counts entries in a parent subset object // Note that since count tries in this case only store counts for a given // level, we can get the "order" directly from the object itself. We // still give the option, however, to give a different order and iterate // over a context even though all of those counts will always be zero // (this is necessary in LM estimation) class PSIter { public: /* all ngrams of length order, starting * at root */ PSIter(ParentSubset& pss, VocabIndex *keys, unsigned order = ~0x0, int (*sort)(VocabIndex,VocabIndex) = 0) { if (order == ~0x0) order = pss.order; if (pss.counts == NULL) { myIter = NULL; } else { myIter = new TrieIter2<VocabIndex,CountT>(*(pss.counts), keys, order, sort); } } /* all count grams of length order for this parent subset, rooted * at node indexed by start */ PSIter(ParentSubset& pss, const VocabIndex *start, VocabIndex *keys, unsigned order = ~0x0, int (*sort)(VocabIndex, VocabIndex) = 0) { if (order == ~0x0) order = pss.order; if (pss.counts == NULL) { myIter = NULL; } else { myIter = new TrieIter2<VocabIndex,CountT>(*(pss.counts->insertTrie(start)), keys,order,sort); } } ~PSIter() { delete myIter; } void init() { if (myIter) myIter->init(); } CountT *next() { if (!myIter) return 0; FNgramNode *node = myIter->next(); return node ? &(node->value()) : 0; }
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -