📄 fngramspecs.cc
字号:
/* * FNgramSpecs.cc -- * Cross-stream N-gram specification file and paramter structure * * Jeff Bilmes <bilmes@ee.washington.edu> */#ifndef _FNgramSpecs_cc_#define _FNgramSpecs_cc_#ifndef lintstatic char FNgramSpecs_Copyright[] = "Copyright (c) 1995-2006 SRI International. All Rights Reserved.";static char FNgramSpecs_RcsId[] = "@(#)$Header: /home/srilm/devel/flm/src/RCS/FNgramSpecs.cc,v 1.14 2006/01/05 20:21:27 stolcke Exp $";#endif#ifndef EXCLUDE_CONTRIB#include <iostream>using namespace std;#include <string.h>#include <ctype.h>#include "FNgramSpecs.h"#include "FNgram.h"#include "Trie.cc"#include "Array.cc"#include "FactoredVocab.h"#include "Debug.h"#include "FDiscount.h"#include "hexdec.h"#include "wmatrix.h"#define INSTANTIATE_FNGRAMSPECS(CountT) \ INSTANTIATE_ARRAY(FNgramSpecs<CountT>::FNgramSpec); \ INSTANTIATE_ARRAY(FNgramSpecs<CountT>::FNgramSpec::ParentSubset); \ template class FNgramSpecs<CountT>// from defined in FNgramLM.cc// TODO: place all bit routines in separate file.extern unsigned int bitGather(unsigned int mask,unsigned int bitv);/*static inline unsigned intnumBitsSet(register unsigned u) { register unsigned count=0; while (u) { count += (u&0x1); u >>= 1; } return count;}*/// John Henderson's extension to strtol with 0b prefix.static long strtolplusb(const char *nptr, char **endptr, int base){ const char *i; long sign; /* We should only try to be clever if the base 2 is specified, or no base is specified, just like in the 0x case. */ if (base != 2 && base != 0) return strtol(nptr,endptr,base); i = nptr; /* skip white space */ while (*i && isspace(*i)) i++; /* decide what the sign should be */ sign = 1; if (*i) { if (*i == '+'){ i++; } else if (*i == '-'){ sign = -1; i++; } } /* If we're not at the end, and we're "0b" or "0B", then return the result base 2. Let strtol do the work for us. */ if (*i && *i == '0' && *(i+1) && (*(i+1) == 'b' || *(i+1)=='B') && *(i+2) && (*(i+2)=='1' || *(i+2)=='0')) return sign*strtol(i+2,endptr,2); else /* otherwise use the given base */ return strtol(nptr,endptr,base);}/*static inline unsigned intbackoffGraphLevel(unsigned numParents, unsigned parentSubSet){ const unsigned int nbs = numBitsSet(parentSubSet); assert (numParents >= nbs); return (numParents-nbs);}*//* * return true if, given the mask, we should expand * the parentSubset path. *//*static inline BooleanexpandBackoff(unsigned mask, unsigned parentSubSet){ // true if all zero bits in parentSubSet have // a 1 in corresponding position in mask return ((~parentSubSet & mask) != 0);}*/template <class CountT> CountTFNgramSpecs<CountT>::FNgramSpec::ParentSubset::accumulateCounts(FNgramNode* counts){ if (counts == NULL) return 0; if (counts->numEntries() == 0) return counts->value(); CountT accumulator = 0; FNgramNode* child; TrieIter<VocabIndex,CountT> iter(*counts); VocabIndex wid; while (child = iter.next(wid)) { accumulator += accumulateCounts(child); } // not a leaf node, so destroy old accumulated counts counts->value() = accumulator; return accumulator;}//// The next set of routines do trie queries, but only use the words// for which the corresponding bit in 'bits' is on (in some way).// These sorts of routine should probably live in Trie.{h,cc} rather// than here.//template <class CountT> CountT*FNgramSpecs<CountT>::FNgramSpec::ParentSubset::findCountSubCtx(const VocabIndex*words, unsigned int bits){ // for word string "w,x,y,z" where "p3=w,p2=x,p1=y,c=z" in normal order, // words is of the form // variables: p3 p2 p1 c // bits: b3 b2 b1 b0 // word[i],i= 0 1 2 3 // count tries have p3 root level, then p2, p1, and c at the leaf level. // this routine indexes the count trie in ascending words order to go // from p3 down to a leaf. if (counts == NULL) return NULL; const unsigned nbs = numBitsSet(bits); const unsigned wlen = Vocab::length(words); assert (nbs <= wlen); unsigned bitNum = wlen-1; unsigned wNum = 0; FNgramNode* subtrie = counts; VocabIndex word[2]; word[1] = Vocab_None; while (wNum < wlen) { if (bits & (1<<bitNum)) { word[0] = words[wNum]; if ((subtrie = subtrie->findTrie(word)) == NULL) return NULL; } wNum++; bitNum--; } return subtrie ? &(subtrie->value()) : NULL;}template <class CountT> CountT*FNgramSpecs<CountT>::FNgramSpec::ParentSubset::findCountSubCtxW(const VocabIndex*words, VocabIndex word1, unsigned int bits){ // same as findCountSubCtx(words,bits) above but we have an extra // word1 variable that we query (irrespective of bits) at the end. if (counts == NULL) return NULL; const unsigned nbs = numBitsSet(bits); const unsigned wlen = Vocab::length(words); assert (nbs <= wlen); unsigned bitNum = wlen-1; unsigned wNum = 0; FNgramNode* subtrie = counts; VocabIndex word[2]; word[1] = Vocab_None; while (wNum < wlen) { if (bits & (1<<bitNum)) { word[0] = words[wNum]; if ((subtrie = subtrie->findTrie(word)) == NULL) return NULL; } wNum++; bitNum--; } return subtrie ? subtrie->find(word1) : NULL;}template <class CountT> CountT*FNgramSpecs<CountT>::FNgramSpec::ParentSubset::findCountRSubCtx(const VocabIndex*words, unsigned int bits){ // same as findCountSubCtx except that words are in reversed order. Rather than // reversing them again, this routine does the trie lookup in reverse word[] order of // what findCountSubCtx does. // words is of the form // variables: p3 p2 p1 c // bits: b3 b2 b1 b0 // word[i],i= 3 2 1 0 // In a model p(c|p1,p2,p3) where bits correspond to the parent number, // count tries have p3 root trie level, then p2, p1, and c at the leaf level. // this routine indexes the count trie in descending words order to go // from p3 down to the root. if (counts == NULL) return NULL; const unsigned wlen = Vocab::length(words); assert (numBitsSet(bits) <= wlen); int wNum = wlen-1; FNgramNode* subtrie = counts; VocabIndex word[2]; word[1] = Vocab_None; while (wNum >= 0) { if (bits & (1<<wNum)) { word[0] = words[wNum]; if ((subtrie = subtrie->findTrie(word)) == NULL) return NULL; } wNum--; } return subtrie ? &(subtrie->value()) : NULL;}template <class CountT> CountT*FNgramSpecs<CountT>::FNgramSpec::ParentSubset::findCountRSubCtxW(const VocabIndex*words, VocabIndex word1, unsigned int bits){ // same as findCountRSubCtx, but with a word1 variable. if (counts == NULL) return NULL; const unsigned wlen = Vocab::length(words); assert (numBitsSet(bits) <= wlen); int wNum = wlen-1; FNgramNode* subtrie = counts; VocabIndex word[2]; word[1] = Vocab_None; while (wNum >= 0) { if (bits & (1<<wNum)) { word[0] = words[wNum]; if ((subtrie = subtrie->findTrie(word)) == NULL) return NULL; } wNum--; } return subtrie ? subtrie->find(word1) : NULL;}template <class CountT> CountT*FNgramSpecs<CountT>::FNgramSpec::ParentSubset::findCountR(const VocabIndex*words, VocabIndex word1){ // like findCount, but looks up words in reverse order. if (counts == NULL) return NULL; const unsigned wlen = Vocab::length(words); int wNum = wlen-1; FNgramNode* subtrie = counts; VocabIndex word[2]; word[1] = Vocab_None; while (wNum >= 0) { word[0] = words[wNum]; if ((subtrie = subtrie->findTrie(word)) == NULL) return NULL; wNum--; } return subtrie ? subtrie->find(word1) : NULL;}/*********************************************************************************** * * * The general backoff node selection strategy * * ***********************************************************************************/template <class CountT> doubleFNgramSpecs<CountT>::FNgramSpec::ParentSubset::backoffValueRSubCtxW(VocabIndex word1, // the word const VocabIndex*words, // the context unsigned int nWrtwCip, // node w.r.t. which context is packed BackoffNodeStrategy parentsStrategy, // strategy of parent FNgram& fngram, // the related fngram unsigned int specNum, // which LM of the fngram unsigned int node) // which node of the LM{ // we should never select a path with null counts // this could occur because of backoff constraints if (counts == NULL) return -1e200; // TODO: return NaN or Inf to signal this condition (and use #define) switch (parentsStrategy) { case CountsNoNorm: { const unsigned int bits = bitGather(nWrtwCip,node); CountT *c = findCountRSubCtxW(words,word1,bits); if (c) return (double)*c; else return 0.0; } break; case CountsSumCountsNorm: { // choose the node with the max normalized counts, // this is equivalent to choosing the largest // maximum-likelihod (ML) prob (a max MI criterion). // UGLYNESS: get the numerator and denominator here. // just a copy of code from backoffValueRSubCtxW() const unsigned int bits = bitGather(nWrtwCip,node); const unsigned wlen = Vocab::length(words); int wNum = wlen-1; FNgramNode* subtrie = counts; VocabIndex word[2]; word[1] = Vocab_None; while (wNum >= 0) { if (bits & (1<<wNum)) { word[0] = words[wNum]; if ((subtrie = subtrie->findTrie(word)) == NULL) return 0.0; } wNum--; } if (!subtrie) return 0.0; CountT *cnt = subtrie->find(word1); if (!cnt) return 0.0; if (*cnt == 0) return 0.0; // this might not be the case depending on if kndiscount is used // since it significantly modifies the count tries. // TODO: should indicate a warning here!!! if (*cnt > subtrie->value()) { fprintf(stderr,"Warning: leaf count greater than node sum, cnt = %d, sum = %d\n", (unsigned)*cnt, (unsigned)subtrie->value()); } // assert (*cnt <= subtrie->value()); double numerator = *cnt; double denominator = subtrie->value(); // note that if sumCounts() wasn't called, denominator might be zero. // we assume here that denominator can not be zero if *cnt > 0. return numerator/denominator; } break; case CountsSumNumWordsNorm: { // normalize by the number of words that have occured. // more UGLYNESS: // again, a copy of code from backoffValueRSubCtxW() const unsigned int bits = bitGather(nWrtwCip,node); const unsigned wlen = Vocab::length(words); int wNum = wlen-1; FNgramNode* subtrie = counts; VocabIndex word[2]; word[1] = Vocab_None; while (wNum >= 0) { if (bits & (1<<wNum)) { word[0] = words[wNum]; if ((subtrie = subtrie->findTrie(word)) == NULL) return 0.0; } wNum--; } if (!subtrie) return 0.0; CountT *cnt = subtrie->find(word1); if (!cnt) return 0.0; if (*cnt == 0) return 0.0; double denominator = subtrie->numEntries(); double numerator = *cnt; // assume that denominator can't be zero if we got count for word1 return numerator/denominator; } break; case CountsProdCardinalityNorm: case CountsSumCardinalityNorm: case CountsSumLogCardinalityNorm: { const unsigned int bits = bitGather(nWrtwCip,node); const unsigned wlen = Vocab::length(words); int wNum = wlen-1; FNgramNode* subtrie = counts; VocabIndex word[2]; word[1] = Vocab_None; while (wNum >= 0) { if (bits & (1<<wNum)) { word[0] = words[wNum]; if ((subtrie = subtrie->findTrie(word)) == NULL) return 0.0; } wNum--; } if (!subtrie) return 0.0; CountT *cnt = subtrie->find(word1); if (!cnt) return 0.0; if (*cnt == 0) return 0.0; double numerator = *cnt; double denominator; if (parentsStrategy == CountsProdCardinalityNorm) denominator = prodCardinalities; else if (parentsStrategy == CountsSumLogCardinalityNorm) denominator = sumCardinalities; else denominator = sumLogCardinalities; return numerator/denominator; } break; case BogNodeProb: { // chose the one with the maximum smoothed probability // this is also a max MI criterion. return fngram.wordProbBO(word1,words,nWrtwCip,specNum,node); } default: fprintf(stderr,"Error: unknown backoff strategy, value = %d\n",parentsStrategy); abort(); break; } return -1.0;} // of backoffValueRSubCtxWtemplate <class CountT> BooleanFNgramSpecs<CountT>::FNgramSpec::LevelIter::next(unsigned int&node) { for (;state<numNodes;state++) { // if (numBitsSet(state) == (numParents - level)) { if (numBitsSet(state) == level) { node = state++; return true;
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -