📄 fngramspecs.cc

📁 这是一款很好用的工具包
💻 CC
📖 第 1 页 / 共 4 页
字号:
12 3 4 下一页
/* * FNgramSpecs.cc -- *	Cross-stream N-gram specification file and paramter structure * * Jeff Bilmes <bilmes@ee.washington.edu> */#ifndef _FNgramSpecs_cc_#define _FNgramSpecs_cc_#ifndef lintstatic char FNgramSpecs_Copyright[] = "Copyright (c) 1995-2006 SRI International.  All Rights Reserved.";static char FNgramSpecs_RcsId[] = "@(#)$Header: /home/srilm/devel/flm/src/RCS/FNgramSpecs.cc,v 1.14 2006/01/05 20:21:27 stolcke Exp $";#endif#ifndef EXCLUDE_CONTRIB#include <iostream>using namespace std;#include <string.h>#include <ctype.h>#include "FNgramSpecs.h"#include "FNgram.h"#include "Trie.cc"#include "Array.cc"#include "FactoredVocab.h"#include "Debug.h"#include "FDiscount.h"#include "hexdec.h"#include "wmatrix.h"#define INSTANTIATE_FNGRAMSPECS(CountT) \	INSTANTIATE_ARRAY(FNgramSpecs<CountT>::FNgramSpec); \	INSTANTIATE_ARRAY(FNgramSpecs<CountT>::FNgramSpec::ParentSubset); \	template class FNgramSpecs<CountT>// from defined in FNgramLM.cc// TODO: place all bit routines in separate file.extern unsigned int bitGather(unsigned int mask,unsigned int bitv);/*static inline unsigned intnumBitsSet(register unsigned u) {  register unsigned count=0;  while (u) {    count += (u&0x1);    u >>= 1;  }  return count;}*/// John Henderson's extension to strtol with 0b prefix.static long strtolplusb(const char *nptr, char **endptr, int base){  const char *i;  long sign;  /* We should only try to be clever if the base 2 is specified, or no     base is specified, just like in the 0x case. */  if (base != 2 && base != 0)    return strtol(nptr,endptr,base);  i = nptr;  /* skip white space */  while (*i && isspace(*i))    i++;  /* decide what the sign should be */  sign = 1;  if (*i) {    if (*i == '+'){      i++;    } else if (*i == '-'){      sign = -1;      i++;    }  }  /* If we're not at the end, and we're "0b" or "0B", then return the result     base 2.  Let strtol do the work for us. */  if (*i && *i == '0' && *(i+1) && (*(i+1) == 'b' || *(i+1)=='B')       && *(i+2) && (*(i+2)=='1' || *(i+2)=='0'))    return sign*strtol(i+2,endptr,2);  else    /* otherwise use the given base */    return strtol(nptr,endptr,base);}/*static inline unsigned intbackoffGraphLevel(unsigned numParents, unsigned parentSubSet){  const unsigned int nbs = numBitsSet(parentSubSet);  assert (numParents >= nbs);  return (numParents-nbs);}*//* * return true if, given the mask, we should expand * the parentSubset path. *//*static inline BooleanexpandBackoff(unsigned mask, unsigned parentSubSet){  // true if all zero bits in parentSubSet have  // a 1 in corresponding position in mask  return ((~parentSubSet & mask) != 0);}*/template <class CountT> CountTFNgramSpecs<CountT>::FNgramSpec::ParentSubset::accumulateCounts(FNgramNode* counts){  if (counts == NULL)    return 0;  if (counts->numEntries() == 0)    return counts->value();  CountT accumulator = 0;  FNgramNode* child;  TrieIter<VocabIndex,CountT> iter(*counts);  VocabIndex wid;  while (child = iter.next(wid)) {    accumulator += accumulateCounts(child);  }  // not a leaf node, so destroy old accumulated counts  counts->value() = accumulator;  return accumulator;}//// The next set of routines do trie queries, but only use the words// for which the corresponding bit in 'bits' is on (in some way).// These sorts of routine should probably live in Trie.{h,cc} rather// than here.//template <class CountT> CountT*FNgramSpecs<CountT>::FNgramSpec::ParentSubset::findCountSubCtx(const VocabIndex*words,							       unsigned int bits){  // for word string "w,x,y,z" where "p3=w,p2=x,p1=y,c=z" in normal order,  // words is of the form  // variables:   p3 p2 p1 c     // bits:        b3 b2 b1 b0  // word[i],i=   0  1  2  3  // count tries have p3 root level, then p2, p1, and c at the leaf level.  // this routine indexes the count trie in ascending words order to go  // from p3 down to a leaf.  if (counts == NULL)    return NULL;  const unsigned nbs = numBitsSet(bits);  const unsigned wlen = Vocab::length(words);  assert (nbs <= wlen);  unsigned bitNum = wlen-1;  unsigned wNum = 0;  FNgramNode* subtrie = counts;  VocabIndex word[2];  word[1] = Vocab_None;  while (wNum < wlen) {    if (bits & (1<<bitNum)) {      word[0] = words[wNum];      if ((subtrie = subtrie->findTrie(word)) == NULL)	return NULL;    }    wNum++;    bitNum--;  }  return subtrie ? &(subtrie->value()) : NULL;}template <class CountT> CountT*FNgramSpecs<CountT>::FNgramSpec::ParentSubset::findCountSubCtxW(const VocabIndex*words,								VocabIndex word1,								unsigned int bits){  // same as findCountSubCtx(words,bits) above but we have an extra  // word1 variable that we query (irrespective of bits) at the end.  if (counts == NULL)    return NULL;  const unsigned nbs = numBitsSet(bits);  const unsigned wlen = Vocab::length(words);  assert (nbs <= wlen);  unsigned bitNum = wlen-1;  unsigned wNum = 0;  FNgramNode* subtrie = counts;  VocabIndex word[2];  word[1] = Vocab_None;  while (wNum < wlen) {    if (bits & (1<<bitNum)) {      word[0] = words[wNum];      if ((subtrie = subtrie->findTrie(word)) == NULL)	return NULL;    }    wNum++;    bitNum--;  }  return subtrie ? subtrie->find(word1) : NULL;}template <class CountT> CountT*FNgramSpecs<CountT>::FNgramSpec::ParentSubset::findCountRSubCtx(const VocabIndex*words,								unsigned int bits){  // same as findCountSubCtx except that words are in reversed order. Rather than  // reversing them again, this routine does the trie lookup in reverse word[] order of  // what findCountSubCtx does.  // words is of the form  // variables:   p3 p2 p1 c   // bits:        b3 b2 b1 b0  // word[i],i=   3  2  1  0  // In a model p(c|p1,p2,p3) where bits correspond to the parent number,  // count tries have p3 root trie level, then p2, p1, and c at the leaf level.  // this routine indexes the count trie in descending words order to go  // from p3 down to the root.  if (counts == NULL)    return NULL;  const unsigned wlen = Vocab::length(words);  assert (numBitsSet(bits) <= wlen);  int wNum = wlen-1;  FNgramNode* subtrie = counts;  VocabIndex word[2];  word[1] = Vocab_None;  while (wNum >= 0) {    if (bits & (1<<wNum)) {      word[0] = words[wNum];      if ((subtrie = subtrie->findTrie(word)) == NULL)	return NULL;    }    wNum--;  }  return subtrie ? &(subtrie->value()) : NULL;}template <class CountT> CountT*FNgramSpecs<CountT>::FNgramSpec::ParentSubset::findCountRSubCtxW(const VocabIndex*words,								 VocabIndex word1,								 unsigned int bits){  // same as findCountRSubCtx, but with a word1 variable.  if (counts == NULL)    return NULL;  const unsigned wlen = Vocab::length(words);  assert (numBitsSet(bits) <= wlen);  int wNum = wlen-1;  FNgramNode* subtrie = counts;  VocabIndex word[2];  word[1] = Vocab_None;  while (wNum >= 0) {    if (bits & (1<<wNum)) {      word[0] = words[wNum];      if ((subtrie = subtrie->findTrie(word)) == NULL)	return NULL;    }    wNum--;  }  return subtrie ? subtrie->find(word1) : NULL;}template <class CountT> CountT*FNgramSpecs<CountT>::FNgramSpec::ParentSubset::findCountR(const VocabIndex*words,							  VocabIndex word1){  // like findCount, but looks up words in reverse order.  if (counts == NULL)    return NULL;  const unsigned wlen = Vocab::length(words);  int wNum = wlen-1;  FNgramNode* subtrie = counts;  VocabIndex word[2];  word[1] = Vocab_None;  while (wNum >= 0) {    word[0] = words[wNum];    if ((subtrie = subtrie->findTrie(word)) == NULL)      return NULL;    wNum--;  }  return subtrie ? subtrie->find(word1) : NULL;}/*********************************************************************************** * * *                 The general backoff node selection strategy * * ***********************************************************************************/template <class CountT> doubleFNgramSpecs<CountT>::FNgramSpec::ParentSubset::backoffValueRSubCtxW(VocabIndex word1,       // the word		     const VocabIndex*words, // the context		     unsigned int nWrtwCip, // node w.r.t. which context is packed		     BackoffNodeStrategy parentsStrategy, // strategy of parent		     FNgram& fngram,        // the related fngram		     unsigned int specNum,  // which LM of the fngram		     unsigned int node)     // which node of the LM{  // we should never select a path with null counts  // this could occur because of backoff constraints  if (counts == NULL)    return -1e200; // TODO: return NaN or Inf to signal this condition (and use #define)    switch (parentsStrategy) {  case CountsNoNorm: {    const unsigned int bits = bitGather(nWrtwCip,node);    CountT *c = findCountRSubCtxW(words,word1,bits);    if (c)      return (double)*c;    else      return 0.0;  }    break;  case CountsSumCountsNorm: {    // choose the node with the max normalized counts,    // this is equivalent to choosing the largest    // maximum-likelihod (ML) prob (a max MI criterion).    // UGLYNESS: get the numerator and denominator here.    // just a copy of code from backoffValueRSubCtxW()     const unsigned int bits = bitGather(nWrtwCip,node);    const unsigned wlen = Vocab::length(words);    int wNum = wlen-1;    FNgramNode* subtrie = counts;    VocabIndex word[2];    word[1] = Vocab_None;    while (wNum >= 0) {      if (bits & (1<<wNum)) {	word[0] = words[wNum];	if ((subtrie = subtrie->findTrie(word)) == NULL)	  return 0.0;      }      wNum--;    }    if (!subtrie)      return 0.0;    CountT *cnt = subtrie->find(word1);    if (!cnt)      return 0.0;    if (*cnt == 0)      return 0.0;    // this might not be the case depending on if kndiscount is used    // since it significantly modifies the count tries.    // TODO: should indicate a warning here!!!    if (*cnt > subtrie->value()) {      fprintf(stderr,"Warning: leaf count greater than node sum, cnt = %d, sum = %d\n", 	      (unsigned)*cnt,	      (unsigned)subtrie->value());    }    // assert (*cnt <= subtrie->value());    double numerator = *cnt;    double denominator = subtrie->value();    // note that if sumCounts() wasn't called, denominator might be zero.    // we assume here that denominator can not be zero if *cnt > 0.    return numerator/denominator;  }    break;  case CountsSumNumWordsNorm: {    // normalize by the number of words that have occured.    // more UGLYNESS:    // again, a copy of code from backoffValueRSubCtxW()     const unsigned int bits = bitGather(nWrtwCip,node);    const unsigned wlen = Vocab::length(words);    int wNum = wlen-1;    FNgramNode* subtrie = counts;    VocabIndex word[2];    word[1] = Vocab_None;    while (wNum >= 0) {      if (bits & (1<<wNum)) {	word[0] = words[wNum];	if ((subtrie = subtrie->findTrie(word)) == NULL)	  return 0.0;      }      wNum--;    }    if (!subtrie)      return 0.0;    CountT *cnt = subtrie->find(word1);    if (!cnt)      return 0.0;    if (*cnt == 0)      return 0.0;    double denominator = subtrie->numEntries();    double numerator = *cnt;    // assume that denominator can't be zero if we got count for word1    return numerator/denominator;  }    break;  case CountsProdCardinalityNorm:  case CountsSumCardinalityNorm:  case CountsSumLogCardinalityNorm: {    const unsigned int bits = bitGather(nWrtwCip,node);    const unsigned wlen = Vocab::length(words);    int wNum = wlen-1;    FNgramNode* subtrie = counts;    VocabIndex word[2];    word[1] = Vocab_None;    while (wNum >= 0) {      if (bits & (1<<wNum)) {	word[0] = words[wNum];	if ((subtrie = subtrie->findTrie(word)) == NULL)	  return 0.0;      }      wNum--;    }    if (!subtrie)      return 0.0;    CountT *cnt = subtrie->find(word1);    if (!cnt)      return 0.0;    if (*cnt == 0)      return 0.0;    double numerator = *cnt;    double denominator;    if (parentsStrategy == CountsProdCardinalityNorm)       denominator = prodCardinalities;    else if (parentsStrategy == CountsSumLogCardinalityNorm)       denominator = sumCardinalities;    else       denominator = sumLogCardinalities;    return numerator/denominator;  }    break;  case BogNodeProb: {    // chose the one with the maximum smoothed probability    // this is also a max MI criterion.    return fngram.wordProbBO(word1,words,nWrtwCip,specNum,node);  }  default:    fprintf(stderr,"Error: unknown backoff strategy, value = %d\n",parentsStrategy);    abort();    break;  }  return -1.0;}  // of backoffValueRSubCtxWtemplate <class CountT> BooleanFNgramSpecs<CountT>::FNgramSpec::LevelIter::next(unsigned int&node) {  for (;state<numNodes;state++) {    // if (numBitsSet(state) == (numParents - level)) {    if (numBitsSet(state) == level) {      node = state++;      return true;
12 3 4 下一页
⌨️ 快捷键说明

复制代码 Ctrl + C
搜索代码 Ctrl + F
全屏模式 F11
切换主题 Ctrl + Shift + D
显示快捷键 ?
增大字号 Ctrl + =
减小字号 Ctrl + -