// fngramlm.cc
// (Recovered from a web code-viewer scrape; original site header removed.)
/* * Remove a ngram probabilities */ for (unsigned i=0;i<fNgrams[specNum].parentSubsetsSize;i++) { BOnode *node; BOsIter iter(*this,specNum,i,context); while (node = iter.next()) { node->probs.clear(0); } }}voidFNgram::clear(){ for (unsigned i=0;i<fNgramsSize;i++) clear(i);}// for printing binary stringsstatic char *intToBinStr(unsigned i){ // good for 64 bit ints const int len = 64+3; static char local_buff[len]; local_buff[len-1] = 0; int pos = len-1; if (i == 0) local_buff[--pos] = '0'; else { do { if (i&0x1) local_buff[--pos] = '1'; else local_buff[--pos] = '0'; i >>= 1; } while (i); } local_buff[--pos] = 'b'; local_buff[--pos] = '0'; return &local_buff[pos];}// TODO: but this and other bit routines in one// separate pair of .cc .h files.unsigned int bitGather(register unsigned int mask,register unsigned int bitv){ // gather (or pack) together the bits in bitv that // correspond to the 1 bits in mask, and place them // at the low end of the word 'res' register unsigned res = 0; register unsigned res_pos = 0; while (mask && bitv) { if (mask & 0x1) { if (bitv & 0x1) { res |= (1<<res_pos); } res_pos++; } mask >>= 1; bitv >>= 1; } return res;}/* * For a given node in the BG, compute the child node that * we should backoff to. The 'context' argument is assumed * to be in reverse order with respect to the cound tries. * * Note: This routine contains the backoff algorithm. * TODO: at some point make a version of this w/o a nWrtwCip argument * for fast ppl and rescoring. */unsignedFNgram::boNode(const VocabIndex word, const VocabIndex *context, const unsigned nWrtwCip, // node number w.r.t which context is packed. const unsigned int specNum, const unsigned int node){ assert (node != 0); // no reason to call this routine if node == 0. // all nodes with one parent back off to the unigram. 
const unsigned int nbitsSet = FNgramSpecsType::numBitsSet(node); if (nbitsSet == 1) return 0; const unsigned int numParents = fngs.fnSpecArray[specNum].numParents; FNgramSpecsType::FNgramSpec::BGChildIterCnstr citer(numParents,node,fngs.fnSpecArray[specNum].parentSubsets[node].backoffConstraint); unsigned bg_child; // backoff-graph child unsigned chosen_bg_child = ~0x0; unsigned a_bg_child = ~0x0; // arbitrary child, if everything fails, we use this. const Boolean domax = // do min if domax == false. (fngs.fnSpecArray[specNum].parentSubsets[node].backoffCombine == MaxBgChild); const double initScore = domax ? -1e220 : 1e220; double bestScore = initScore; // find child node with the largest counts while (citer.next(bg_child)) { // We max/min over the BO value. The BO value is determined by the // BO algorithm (i.e., counts, normalized counts, etc.) in the // specs object for this node. double score = fngs.fnSpecArray[specNum].parentSubsets[bg_child]. backoffValueRSubCtxW(word,context,nWrtwCip, fngs.fnSpecArray[specNum]. parentSubsets[node].backoffStrategy, *this, specNum, bg_child); if (score == -1e200) // TODO: change this to NaN or Inf, and a #define (also see below) continue; // continue presumably because of a NULL counts object if (a_bg_child == ~0x0) a_bg_child = bg_child; if (domax && (score > bestScore) || !domax && (score < bestScore)) { chosen_bg_child = bg_child; bestScore = score; } } // make sure that we have at least one valid child assert (a_bg_child != ~0x0); // if we only have one child, or if we have two children and have // not found a best child, just return an arbitrary child node. if ((fngs.fnSpecArray[specNum].parentSubsets[node].numBGChildren == 1) || (chosen_bg_child == ~0x0 && nbitsSet == 2)) return a_bg_child; if (chosen_bg_child == ~0x0) { // Then we did not found any BG-child with a score for this // context. We back off to the child that has the best // combined schore of its children. We keep // doing this, as long as possible. 
unsigned int great; for (great=0; (great+2)< nbitsSet ; great++) { citer.init(); while (citer.next(bg_child)) { double score = initScore; // get score for child if (great == 0) { // children of child iter FNgramSpecsType::FNgramSpec::BGChildIterCnstr gciter(numParents,bg_child,fngs.fnSpecArray[specNum].parentSubsets[bg_child].backoffConstraint); unsigned bg_grandchild; while (gciter.next(bg_grandchild)) { double tmp=fngs.fnSpecArray[specNum].parentSubsets[bg_grandchild]. backoffValueRSubCtxW(word,context,nWrtwCip, fngs.fnSpecArray[specNum]. parentSubsets[node].backoffStrategy, *this, specNum, bg_grandchild); // compute local max min of offspring if (domax && (tmp > score) || !domax && (tmp < score)) { score = tmp; } } } else { fprintf(stderr,"WARNING: might produce unnormalized LM. Check with -debug 3\n"); // grandchildren of child iter FNgramSpecsType::FNgramSpec::BGGrandChildIter descendant_iter(numParents,bg_child,great-1); unsigned bg_grandchild; while (descendant_iter.next(bg_grandchild)) { double tmp=fngs.fnSpecArray[specNum].parentSubsets[bg_grandchild]. backoffValueRSubCtxW(word,context,nWrtwCip, fngs.fnSpecArray[specNum]. parentSubsets[node].backoffStrategy, *this, specNum, bg_grandchild); if (domax && (tmp > score) || !domax && (tmp < score)) { score = tmp; } } } if (score == -1e200) // TODO: change this to NaN or Inf, and a #define (also see above) continue; // presumably because of a NULL counts objects if (domax && (score > bestScore) || !domax && (score < bestScore)) { chosen_bg_child = bg_child; bestScore = score; } } if (chosen_bg_child != ~0x0) break; } // still not found, chose an arbitrary child if (chosen_bg_child == ~0x0) chosen_bg_child = a_bg_child; } return chosen_bg_child;}//// General algorithm for the "BG child probability backoff"LogPFNgram::bgChildProbBO(VocabIndex word, const VocabIndex *context, const unsigned nWrtwCip, const unsigned int specNum, const unsigned int node){ // general graph backoff algorithm for multiple BG children. 
// should never be called at node 0 since there is nothing to // backoff to. assert ( node != 0 ); // special case numBGChildren == 1 for speed. if (fngs.fnSpecArray[specNum].parentSubsets[node].numBGChildren == 1) { unsigned int bg_child; FNgramSpecsType::FNgramSpec::BGChildIterCnstr citer(fngs.fnSpecArray[specNum].numParents, node, fngs.fnSpecArray[specNum].parentSubsets[node].backoffConstraint); while (citer.next(bg_child)) { if (fngs.fnSpecArray[specNum].parentSubsets[bg_child].counts != NULL) { // return immediately since we've found the bg child return wordProbBO(word,context,nWrtwCip,specNum,bg_child); } } } // still here? Do the general case. LogP bo_prob; if (fngs.fnSpecArray[specNum].parentSubsets[node].backoffCombine == ProdBgChild || fngs.fnSpecArray[specNum].parentSubsets[node].backoffCombine == GmeanBgChild) { bo_prob = LogP_One; unsigned int bg_child; FNgramSpecsType::FNgramSpec::BGChildIterCnstr citer(fngs.fnSpecArray[specNum].numParents, node, fngs.fnSpecArray[specNum].parentSubsets[node].backoffConstraint); while (citer.next(bg_child)) { if (fngs.fnSpecArray[specNum].parentSubsets[bg_child].counts != NULL) { // multiply the probs (add the log probs) bo_prob += wordProbBO(word,context,nWrtwCip,specNum,bg_child); } } if (fngs.fnSpecArray[specNum].parentSubsets[node].backoffCombine == GmeanBgChild) bo_prob /= fngs.fnSpecArray[specNum]. 
parentSubsets[node].numBGChildren; } else if (fngs.fnSpecArray[specNum].parentSubsets[node].backoffCombine == SumBgChild || fngs.fnSpecArray[specNum].parentSubsets[node].backoffCombine == AvgBgChild) { bo_prob = LogP_Zero; unsigned int bg_child; FNgramSpecsType::FNgramSpec::BGChildIterCnstr citer(fngs.fnSpecArray[specNum].numParents, node, fngs.fnSpecArray[specNum].parentSubsets[node].backoffConstraint); while (citer.next(bg_child)) { if (fngs.fnSpecArray[specNum].parentSubsets[bg_child].counts != NULL) { // add the probs bo_prob = (LogP)AddLogP(bo_prob,wordProbBO(word,context,nWrtwCip,specNum,bg_child)); } } if (fngs.fnSpecArray[specNum].parentSubsets[node].backoffCombine == AvgBgChild) bo_prob -= (LogP)log10((double)fngs.fnSpecArray[specNum].parentSubsets[node].numBGChildren); } else if (fngs.fnSpecArray[specNum].parentSubsets[node].backoffCombine == WmeanBgChild) { bo_prob = LogP_Zero; unsigned int bg_child; FNgramSpecsType::FNgramSpec::BGChildIterCnstr citer(fngs.fnSpecArray[specNum].numParents, node, fngs.fnSpecArray[specNum].parentSubsets[node].backoffConstraint); unsigned cpos = 0; while (citer.next(bg_child)) { if (fngs.fnSpecArray[specNum].parentSubsets[bg_child].counts != NULL) { // add the probs by weights bo_prob = (LogP)AddLogP(bo_prob, // log10(0.5) + fngs.fnSpecArray[specNum].parentSubsets[node].wmean[cpos] + wordProbBO(word,context,nWrtwCip,specNum,bg_child)); cpos++; } } } else { // choose only one backoff node unsigned chosen_descendant = boNode(word,context,nWrtwCip,specNum,node); bo_prob = wordProbBO(word,context,nWrtwCip,specNum,chosen_descendant); } return bo_prob;}/* * This method implements the backoff algorithm in a straightforward, * recursive manner. * Note, this function likes context in reverse order. 
* I.e., if word = child = word_t, then * context[0] = parent1 word_{t-1} * context[1] = parent2 word_{t-2} * context[2] = parent3 word_{t-3} * In the model p(child|parent1,parent2,parent3) = p(w_t|w_t-1,w_t-2,w_t-3) */// TODO: make a version of this routine that assumes context is// packed w.r.t. 0x1111 node, used for ppl and rescoring.LogPFNgram::wordProbBO(VocabIndex word, const VocabIndex *context, // nWrtwCip: Node number With respect to (w.r.t.) // which Context is packed. const unsigned nWrtwCip, const unsigned int specNum, const unsigned int node){ LogP result; const unsigned packedBits = bitGather(nWrtwCip,node); LogP *prob = fNgrams[specNum].parentSubsets[node]. findProbSubCtx(word,context,packedBits); if (prob) { if (running() && debug(DEBUG_NGRAM_HITS)) { // note that this message will occur multiple times when doing // certain strategies and/or combination methods. The true // "hit" is the last (right most) one that is printed. char buff[1024]; sprintf(buff,"[0x%X gram]",node); dout() << buff; } result = *prob; } else { if (node == 0) { if (running() && debug(DEBUG_NGRAM_HITS)) { dout() << "[OOV]"; } result = LogP_Zero; } else { LogP *bow = fNgrams[specNum].parentSubsets[node]. findBOWSubCtx(context,packedBits); if (bow) { // fprintf(stderr,"found bow = %f\n",*bow); result = *bow + bgChildProbBO(word,context,nWrtwCip,specNum,node); } else { if (!fngs.fnSpecArray[specNum].parentSubsets[node].requiresGenBackoff()) { result = bgChildProbBO(word,context,nWrtwCip,specNum,node); } else { // need to compute BOW for this context (i.e., normalize // the BO distribution). Prob sum = bgChildProbSum(context,nWrtwCip,specNum,node); LogP lsum = ProbToLogP(sum); // insert sum as a BOW to speed up access on future access. // NOTE: This next code is one of the reasons BG LM estimation runs so slowly. // TODO: could always recompute to save memory but slow things down. // TODO: this slows things down significantly, think of a way to make // this part run faster! 
(perhaps compute all bows during training, // but that makes the files large). // NOTE: this code will also cause memory usage to grow, it will // look like a memory leak, but it is not, it is just // the insertion of bows into the lm trie. *(fNgrams[specNum].parentSubsets[node]. insertBOWSubCtx(context,packedBits)) = - lsum; result = bgChildProbBO(word,context,nWrtwCip,specNum,node) - lsum; } } } } return result;}LogPFNgram::wordProb(VocabIndex word, const VocabIndex *context, // nWrtwCip: Node number With respect to (w.r.t.) which Context is packed. const unsigned nWrtwCip, const unsigned int specNum, const unsigned int node){ if (skipOOVs) { /* * Backward compatibility with the old broken perplexity code: * TODO: figure out what is meant by broken here. * return prob 0 if any of the context-words have an unknown * word. */ if (word == vocab.unkIndex()) return LogP_Zero; unsigned int clen = Vocab::length(context); for (unsigned j=0;j<clen; j++) { if (context[j] == vocab.unkIndex()) return LogP_Zero; } } /* * Perform the backoff algorithm for a context lenth that is * the minimum of what we were given and the maximal length of * the contexts stored in the LM */ return wordProbBO(word, context, nWrtwCip, specNum, node);}// reads in an ARPA-file inspired format -// - there are node-grams rather than n-grams, where// node is the bit vector (hex integer) giving LM parents that// to be used in that gram in the *LM* graph// - there might be multiple bows, depending on from where// the backoff is coming from.// TODO: make sure that read sets all variables in FNgram that// constructor is supposed to do (e.g., "order", and other member variables).
/*
 * (Scraper residue: the hosting site's keyboard-shortcut help, translated;
 *  not part of the original fngramlm.cc source.)
 *   Copy code:       Ctrl + C
 *   Search code:     Ctrl + F
 *   Fullscreen mode: F11
 *   Toggle theme:    Ctrl + Shift + D
 *   Show shortcuts:  ?
 *   Increase font:   Ctrl + =
 *   Decrease font:   Ctrl + -
 */